Source code for clld.web.views.olac

"""
Support for the provider implementation of an OLAC OAI-PMH repository.

.. seealso:: http://www.language-archives.org/OLAC/repositories.html
"""
import re
from datetime import datetime, timedelta
from copy import copy
from collections import namedtuple

from pyramid.renderers import render
from pyramid.response import Response
from sqlalchemy.orm import joinedload_all, undefer
from clldutils.misc import UnicodeMixin

from clld.db.models.common import Language, LanguageIdentifier, Identifier, IdentifierType
from clld.interfaces import IOlacConfig


#
# OAI-PMH protocol specifics:
#
# Request verbs this view accepts; anything else is answered with "badVerb".
VERBS = [
    'GetRecord', 'Identify', 'ListIdentifiers',
    'ListMetadataFormats', 'ListRecords', 'ListSets',
]

# Error code -> human readable message, looked up when an error response is built.
ERRORS = dict(
    badArgument=(
        'The request includes illegal arguments, is missing required arguments, '
        'includes a repeated argument, or values for arguments have an illegal syntax.'),
    badResumptionToken=(
        'The value of the resumptionToken argument is invalid or expired.'),
    badVerb=(
        'Value of the verb argument is not a legal OAI-PMH verb, '
        'the verb argument is missing, or the verb argument is repeated.'),
    cannotDisseminateFormat=(
        'The metadata format identified by the value given for the metadataPrefix '
        'argument is not supported by the item or by the repository.'),
    idDoesNotExist=(
        'The value of the identifier argument is unknown or illegal in this repository.'),
    noRecordsMatch=(
        'The combination of the values of the from, until, set and metadataPrefix '
        'arguments results in an empty list.'),
    noMetadataFormats=(
        'There are no metadata formats available for the specified item.'),
    noSetHierarchy='The repository does not support sets.',
)

# The only metadata prefix GetRecord requests are accepted for.
MD_PREFIX = 'olac'

# Day-granularity datestamp (YYYY-MM-DD).
# Raw string: "\-" is not a valid escape sequence in an ordinary string literal and
# triggers a warning on current Python versions. The resulting value is unchanged.
TIMESTAMP_REGEX = r'[0-9]{4}\-[0-9]{2}\-[0-9]{2}'
TIMESTAMP_PATTERN = re.compile(TIMESTAMP_REGEX + '$')

# Lightweight records used to describe the archive.
Participant = namedtuple('Participant', 'role name email')
Institution = namedtuple('Institution', 'name url location')


def timestamp(dt=None):
    """Format `dt` (default: the current UTC time) as ``YYYY-MM-DDTHH:MM:SSZ``.

    Fractional seconds are dropped.
    """
    moment = dt or datetime.utcnow()
    # Everything before the first "." is the part without microseconds.
    seconds_part = str(moment).partition('.')[0]
    return seconds_part.replace(' ', 'T') + 'Z'


def date(dt=None):
    """Return the day part (``YYYY-MM-DD``) of `dt`, defaulting to the current UTC time."""
    moment = dt or datetime.utcnow()
    # The string form separates day and time with a blank; keep what precedes it.
    day, _, _ = str(moment).partition(' ')
    return day


class ResumptionToken(UnicodeMixin):
    """Represents an OAI-PMH resumption token.

    We encode all information from a List query in the resumption token so that we do
    not actually have to keep track of sequences of requests (in the spirit of REST).

    The serialized form is ``<offset>[f<YYYY-MM-DD>][u<YYYY-MM-DD>]``.

    .. seealso:: http://www.openarchives.org/OAI/openarchivesprotocol.html#FlowControl
    """

    PATTERN = re.compile(
        '(?P<offset>[0-9]+)(?P<from>f%s)?(?P<until>u%s)?$'
        % (TIMESTAMP_REGEX, TIMESTAMP_REGEX))

    # Page size; valid offsets are multiples of this value.
    limit = 100

    # True when `until` was read from a serialized token and therefore already holds
    # the day *after* the one named in the token.
    _until_is_exclusive = False

    def __init__(self, url_arg=None, offset=None, from_=None, until=None):
        """Create a token from explicit values or by parsing a serialized token.

        :param url_arg: Serialized token as received in a request. When given, \
        offset and any from/until bounds it contains override the other arguments.
        :param offset: Number of records to skip (defaults to 0).
        :param from_: Lower bound for the selection; stored as passed.
        :param until: Upper bound for the selection; stored as passed.
        :raises AssertionError: if `url_arg` is malformed, names an impossible \
        calendar date or carries an offset that is not a multiple of `limit`.
        """
        self.offset = offset or 0
        self.from_ = from_
        self.until = until

        if url_arg is None:
            return

        # Explicit raises instead of `assert` statements: the checks must survive
        # `python -O`, and callers rely on AssertionError to detect bad tokens.
        m = self.PATTERN.match(url_arg)
        if not m:
            raise AssertionError('malformed resumption token')

        self.offset = int(m.group('offset'))
        if self.offset % self.limit != 0:
            raise AssertionError('offset is not a multiple of the page size')

        # The leading "f"/"u" marker is stripped before parsing the day.
        if m.group('from'):
            self.from_ = self._parse_day(m.group('from')[1:])
        if m.group('until'):
            # Stored as the following midnight so the whole named day is covered
            # when the value is used as an exclusive upper bound.
            self.until = self._parse_day(m.group('until')[1:]) + timedelta(1)
            self._until_is_exclusive = True

    @staticmethod
    def _parse_day(s):
        """Convert ``YYYY-MM-DD`` to a `datetime`; impossible dates are token errors."""
        try:
            return datetime(*map(int, s.split('-')))
        except ValueError:
            # E.g. month 13: matches the regex but is no valid date.
            raise AssertionError('invalid date in resumption token')

    def __unicode__(self):
        """Serialize the token into the form accepted as `url_arg`."""
        res = "%s" % self.offset
        if self.from_:
            res += "f%s" % date(self.from_)
        if self.until:
            until = self.until
            if self._until_is_exclusive:
                # Undo the one-day shift applied when parsing; otherwise the bound
                # would move one day further with every parse/serialize round trip.
                until = until - timedelta(1)
            res += "u%s" % date(until)
        if not self.PATTERN.match(res):
            raise AssertionError('serialized resumption token is malformed')
        return res
class OlacConfig(object):
    """Configuration of an application's OLAC repository."""

    # Components of the record identifiers: <scheme><delimiter><domain><delimiter><id>
    scheme = 'oai'
    delimiter = ':'

    def _query(self, req):
        """Return the base query: languages having an identifier of the ISO type."""
        subquery = req.db.query(Identifier)\
            .filter_by(type=IdentifierType.iso.value)\
            .join(LanguageIdentifier)\
            .filter_by(language_pk=Language.pk)
        return req.db.query(Language).filter(subquery.exists())\
            .options(undefer('updated'), joinedload_all(
                Language.languageidentifier, LanguageIdentifier.identifier))

    def get_earliest_record(self, req):
        """Return the record with the smallest `updated` value (ties broken by pk)."""
        return self._query(req).order_by(Language.updated, Language.pk).first()

    def get_record(self, req, identifier):
        """Look up the record for a formatted identifier.

        :raises AssertionError: if the identifier is malformed or no record exists.
        """
        rec = Language.get(self.parse_identifier(req, identifier), default=None)
        # Raised explicitly (not via `assert`) so the check is kept under `python -O`;
        # the exception type is what callers catch.
        if not rec:
            raise AssertionError('no record for identifier')
        return rec

    def query_records(self, req, from_=None, until=None):
        """Return the records query ordered by pk, optionally restricted by `updated`.

        :param from_: Inclusive lower bound for `updated`.
        :param until: Exclusive upper bound for `updated`.
        """
        q = self._query(req).order_by(Language.pk)
        if from_:
            q = q.filter(Language.updated >= from_)
        if until:
            q = q.filter(Language.updated < until)
        return q

    def format_identifier(self, req, item):
        """Build the identifier for `item` from scheme, dataset domain and item id."""
        return self.delimiter.join([self.scheme, req.dataset.domain, item.id])

    def parse_identifier(self, req, id_):
        """Return the local id, i.e. the part after the last delimiter.

        :raises AssertionError: if `id_` does not contain the delimiter.
        """
        if self.delimiter not in id_:
            raise AssertionError('identifier lacks delimiter')
        return id_.split(self.delimiter)[-1]

    def admin(self, req):
        """Configure the archive participant with role admin.

        Note: According to http://www.language-archives.org/OLAC/repositories.html
        the list of participants

        > must include the system administrator whose email address is given in the
        > <oai:adminEmail> element of the Identify response.

        :param req: The current request.
        :return: A suitable `Participant` instance or None.
        """
        return Participant("Admin", "Archive Admin", req.dataset.contact)

    def description(self, req):
        """Return the archive description as a dict.

        Keys: `archiveURL`, `participants`, `institution` and `synopsis`.
        """
        # Note: According to http://www.language-archives.org/OLAC/repositories.html the
        # list of participants
        # > must include the system administrator whose email address is given in the
        # > <oai:adminEmail> element of the Identify response.
        participants = [self.admin(req)]
        for ed in req.dataset.editors:
            # Editors without an email address fall back to the dataset contact.
            participants.append(Participant(
                "Editor",
                ed.contributor.name,
                ed.contributor.email or req.dataset.contact))

        return {
            'archiveURL': 'http://%s/' % req.dataset.domain,
            'participants': participants,
            'institution': Institution(
                req.dataset.publisher_name,
                req.dataset.publisher_url,
                req.dataset.publisher_place,
            ),
            'synopsis': req.dataset.description or '',
        }
def olac(req):
    """Serve the OLAC OAI-PMH repository using the registered `IOlacConfig` utility."""
    cfg = req.registry.getUtility(IOlacConfig)
    return olac_with_cfg(req, cfg)
def olac_with_cfg(req, cfg):
    """Factory function for olac views with different configurations.

    If applications want to disseminate metadata for other resources than languages
    this function can be used to provide a second olac repository.

    :param req: The current request; its parameters carry the protocol arguments.
    :param cfg: Repository configuration used to look up and list records.
    :return: A `Response` with the rendered `olac.mako` template as XML. Protocol \
    errors are reported inside this document, not raised.
    """
    res = dict(verb=None, error=None, response_date=timestamp(), params={}, date=date)
    res['cfg'] = cfg

    def response(res):
        return Response(
            render('olac.mako', res, request=req).encode('utf8'),
            content_type='text/xml',
            charset=None)

    def error(_error):
        if _error:
            res['error'] = (_error, ERRORS[_error])
        return response(res)

    args = dict(req.params.items())
    # The template gets all parameters, including the verb which is removed below.
    res['params'] = copy(args)
    res['verb'] = args.pop('verb', None)

    if res['verb'] not in VERBS:
        return error("badVerb")

    if res['verb'] == 'ListSets':
        return error("noSetHierarchy")

    if res['verb'] == 'Identify':
        if args:
            return error("badArgument")
        res['earliest'] = res['cfg'].get_earliest_record(req)

    if res['verb'] == 'GetRecord':
        if sorted(args) != ['identifier', 'metadataPrefix']:
            return error("badArgument")
        if args['metadataPrefix'] != MD_PREFIX:
            return error("cannotDisseminateFormat")
        try:
            res['language'] = res['cfg'].get_record(req, args['identifier'])
        except AssertionError:
            return error("idDoesNotExist")

    if res['verb'] in ['ListIdentifiers', 'ListRecords']:
        if (('metadataPrefix' not in args and 'resumptionToken' not in args)
                or ('from' in args and not TIMESTAMP_PATTERN.match(args['from']))
                or ('until' in args and not TIMESTAMP_PATTERN.match(args['until']))):
            return error("badArgument")

        allowed = ['from', 'until', 'metadataPrefix', 'set', 'resumptionToken']
        if any(arg not in allowed for arg in args):
            return error("badArgument")

        if 'set' in args:
            return error("noSetHierarchy")

        if 'resumptionToken' in args:
            # A resumption token must be the only argument besides the verb.
            if len(args) > 1:
                return error("badArgument")
            try:
                rt = ResumptionToken(url_arg=args['resumptionToken'])
            except (AssertionError, ValueError):
                # ValueError: the token is syntactically fine but names an
                # impossible calendar date (e.g. month 13).
                return error("badResumptionToken")
        else:
            rt = ResumptionToken(
                None, 0, args.get('from', None), args.get('until', None))

        q = res['cfg'].query_records(req, from_=rt.from_, until=rt.until)
        res['languages'] = q.offset(rt.offset).limit(rt.limit).all()
        if not res['languages']:
            return error('noRecordsMatch')

        if len(res['languages']) < rt.limit:
            # Short page: nothing left to resume.
            res['resumptionToken'] = None
        else:
            rt.offset += rt.limit
            res['resumptionToken'] = rt

    if res['verb'] == 'ListMetadataFormats':
        # `identifier` is the only argument accepted here; reject anything else,
        # even when an identifier is supplied alongside it.
        if any(arg != 'identifier' for arg in args):
            return error("badArgument")

    return response(res)