Source code for clld.web.views.sitemap

"""
view callables implementing the sitemap protocol.

.. seealso:: http://www.sitemaps.org/
"""
import itertools
import operator
from xml.sax.saxutils import escape

from pyramid.response import Response
from pyramid.httpexceptions import HTTPNotFound

from sqlalchemy import join, and_, true

from clld import RESOURCES
from clld.db.meta import DBSession
from clld.db.models import common
from clld.web.util.helpers import get_url_template


# Page size used when splitting a resource's URLs across several sitemap files:
# ``sitemapindex`` emits one entry per LIMIT rows and ``sitemap`` serves at most
# LIMIT rows per page.
# See http://www.sitemaps.org/protocol.html#index
LIMIT = 50000


def robots(req):
    """Serve robots.txt: the sitemap index location plus optional deny rules.

    The setting ``clld.robots_deny`` is read as a whitespace-separated list of
    user agent names; each one is denied access to the whole site.

    .. seealso:: http://www.sitemaps.org/protocol.html#submit_robots
    """
    configured = req.registry.settings.get('clld.robots_deny') or ''
    rules = ''.join(
        'User-agent: {}\nDisallow: /\n'.format(agent)
        for agent in configured.strip().split())
    body = "Sitemap: {}\n{}".format(req.route_url('sitemapindex'), rules)
    return Response(body, content_type="text/plain")


def _query(req, rsc):
    """Return a query for (id, updated) of all rows of the resource's model.

    The rows are ordered by the model's ``pk`` column, because paging through
    the result with limit and offset is only meaningful on a stable ordering.

    ``req`` is accepted but not used.
    """
    model = rsc.model
    columns = DBSession.query(model.id, model.updated)
    return columns.order_by(model.pk)


def _e(name, *content):
    return '<{0}>{1}</{0}>'.format(name, ''.join(content))


def _response(type_, itemiter):
    """Serialize items as a sitemap protocol XML document.

    :param type_: Name of the root element. For ``'urlset'`` each item becomes \
    a ``url`` element, for any other value a ``sitemap`` element.
    :param itemiter: Iterable of dicts mapping child element names to their \
    text content (strings).
    :return: ``Response`` with content type ``application/xml``.
    """
    child = 'url' if type_ == 'urlset' else 'sitemap'

    def serialize(item):
        # Leaf values are character data and must be entity-escaped: a URL
        # containing ``&`` would otherwise make the whole document malformed.
        return _e(child, *[_e(k, escape(v)) for k, v in item.items()])

    return Response(
        """\
<?xml version="1.0" encoding="UTF-8"?>
<{0} xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{1}
</{0}>""".format(type_, '\n'.join(map(serialize, itemiter))),
        content_type="application/xml")


def sitemapindex(req):
    """Sitemap index pointing to the paged, resource-specific sitemaps.

    Only resources which have an index and are listed in the setting
    ``clld.sitemaps`` are included; each of them contributes one entry per
    started page of ``LIMIT`` rows.

    .. seealso:: http://www.sitemaps.org/protocol.html#index
    """
    def _iter(sitemaps):
        for r in RESOURCES:
            if not (r.with_index and r.name in sitemaps):
                continue
            total = _query(req, r).count()
            # Ceiling division: a partially filled last page still counts.
            pages = (total + LIMIT - 1) // LIMIT
            for page in range(pages):
                yield {'loc': req.route_url('sitemap', rsc=r.name, n=page)}

    enabled = req.registry.settings.get('clld.sitemaps', [])
    return _response('sitemapindex', _iter(enabled))


def sitemap(req):
    """Resource-specific sitemap.

    .. note:: The resource is looked up using the URL parameter ``rsc``.

    The URL parameter ``n`` selects the zero-based page of at most ``LIMIT``
    URLs. If it is not a non-negative integer, a 404 response is returned
    instead of failing with an unhandled error while building the query.

    .. seealso:: http://www.sitemaps.org/protocol.html#xmlTagDefinitions
    """
    rsc_name = req.matchdict['rsc']
    try:
        page = int(req.matchdict['n'])
    except (TypeError, ValueError):
        return HTTPNotFound()
    if page < 0:
        # A negative page would translate into a negative OFFSET.
        return HTTPNotFound()

    def _iter():
        for r in RESOURCES:
            if r.name == rsc_name:
                query = _query(req, r).offset(LIMIT * page).limit(LIMIT)
                for id_, updated in query:
                    yield dict(
                        loc=req.route_url(r.name, id=id_),
                        # Keep only the date part of the timestamp.
                        lastmod=str(updated).split(' ')[0])
    return _response('urlset', _iter())


def resourcemap(req):
    """List all instances of the resource type named by request parameter ``rsc``.

    Supported values are ``language`` (active languages with coordinates and
    their identifiers of any type other than ``name``) and ``parameter``.
    For any other value a ``HTTPNotFound`` is returned.

    The result is a dict with the keys ``properties`` and ``resources``;
    presumably it is rendered as JSON by a renderer configured elsewhere.
    """
    rsc = req.params.get('rsc')
    if rsc not in ('language', 'parameter'):
        return HTTPNotFound()

    if rsc == 'parameter':
        rows = DBSession.query(common.Parameter.id, common.Parameter.name)\
            .order_by(common.Parameter.pk)

        def resources():
            for param_id, param_name in rows:
                yield {'id': param_id, 'name': param_name}
    else:
        # Identifiers of type 'name' are excluded in the join condition, so
        # languages without other identifiers still show up (outer join).
        identifier_join = join(
            common.LanguageIdentifier,
            common.Identifier,
            and_(
                common.LanguageIdentifier.identifier_pk == common.Identifier.pk,
                common.Identifier.type != 'name'))
        columns = (
            common.Language.id,
            common.Language.name,
            common.Language.latitude,
            common.Language.longitude,
            common.Identifier.type.label('itype'),
            common.Identifier.name.label('iname'),
        )
        rows = DBSession.query(*columns)\
            .select_from(common.Language)\
            .outerjoin(identifier_join)\
            .filter(common.Language.active == true())\
            .order_by(common.Language.id)

        def resources():
            # Consecutive rows sharing the four language columns belong to
            # one language; the query is ordered by language id.
            language_key = operator.itemgetter(0, 1, 2, 3)
            for (lang_id, lang_name, lat, lon), group in \
                    itertools.groupby(rows, language_key):
                identifiers = []
                for row in group:
                    if row.iname is None:
                        # Row stems from a language without matching identifier.
                        continue
                    if row.itype.startswith('WALS'):
                        value = row.iname.lower()
                    else:
                        value = row.iname
                    identifiers.append({'type': row.itype, 'identifier': value})
                yield {
                    'id': lang_id,
                    'name': lang_name,
                    'latitude': lat,
                    'longitude': lon,
                    'identifiers': identifiers}

    properties = {
        'dataset': req.dataset.id,
        'uri_template': get_url_template(req, rsc, relative=False)}
    return {'properties': properties, 'resources': list(resources())}