Source code for clld.scripts.util

"""Shared functionality for clld console scripts."""
from __future__ import unicode_literals, division, absolute_import, print_function
import sys
from distutils.util import strtobool
from collections import defaultdict
import argparse
import logging
from functools import partial

from six.moves.urllib.parse import quote_plus
from six.moves import input
import transaction
from sqlalchemy import engine_from_config, create_engine
from sqlalchemy.orm import joinedload
from pyramid.paster import get_appsettings, setup_logging, bootstrap
import requests
from nameparser import HumanName
from clldutils.path import Path, as_posix, remove
from clldutils import jsonlib
from clldutils.misc import slug

from clld.db.meta import VersionedDBSession, DBSession, Base
from clld.db.models import common
from clld.db.util import page_query
from clld.lib import bibtex


def glottocodes_by_isocode(dburi, cols=['id']):
    """Query Glottolog.

    :dburi: If not None, sqlalchemy dburi for a glottolog database. If None, \
    glottolog.org will be queried.
    :cols: list of column/attribute names for which information should be gathered.
    :return: dict mapping iso639-3 codes to glottolog data.
    """
    glottocodes = {}
    if dburi:
        select = ', '.join('l.%s' % name for name in cols)
        glottolog = create_engine(dburi)
        for row in glottolog.execute(
            'select ll.hid, %s from language as l, languoid as ll where l.pk = ll.pk'
            % select
        ):
            if row[0]:
                glottocodes[row[0]] = row[1] if len(row) == 2 else row[1:]
    else:
        conv = defaultdict(lambda: lambda x: x, latitude=float, longitude=float)
        res = requests.get("http://glottolog.org/resourcemap.json?rsc=language")
        for rsc in res.json()['resources']:
            for id_ in rsc.get('identifiers', []):
                if id_['type'] == 'iso639-3':
                    row = [conv[col](rsc.get(col)) if rsc.get(col) is not None else None
                           for col in cols]
                    glottocodes[id_['identifier']] = row[0] if len(row) == 1 \
                        else tuple(row)
                    break
    return glottocodes


def add_language_codes(data, lang, isocode, glottocodes=None, glottocode=None):
    def identifier(type_, id_):
        return data.add(
            common.Identifier, '%s:%s' % (type_, id_),
            id='%s:%s' % (type_, id_),
            name=id_,
            type=getattr(common.IdentifierType, type_).value)

    if isocode and len(isocode) == 3:
        DBSession.add(common.LanguageIdentifier(
            language=lang, identifier=identifier('iso', isocode)))

    if glottocode or (glottocodes and isocode and isocode in glottocodes):
        glottocode = glottocode or glottocodes[isocode]
        DBSession.add(common.LanguageIdentifier(
            language=lang, identifier=identifier('glottolog', glottocode)))


def bibtex2source(rec, cls=common.Source):
    year = bibtex.unescape(rec.get('year', 'nd'))
    fields = {}
    jsondata = {}
    for field in bibtex.FIELDS:
        if field in rec:
            value = bibtex.unescape(rec[field])
            container = fields if hasattr(cls, field) else jsondata
            container[field] = value

    etal = ''
    eds = ''
    authors = rec.get('author')
    if not authors:
        authors = rec.get('editor', '')
        if authors:
            eds = ' (eds.)'
    if authors:
        authors = bibtex.unescape(authors).split(' and ')
        if len(authors) > 2:
            authors = authors[:1]
            etal = ' et al.'

        authors = [HumanName(a) for a in authors]
        authors = [n.last or n.first for n in authors]
        authors = '%s%s%s' % (' and '.join(authors), etal, eds)

    return cls(
        id=slug(rec.id),
        name=('%s %s' % (authors, year)).strip(),
        description=bibtex.unescape(rec.get('title', rec.get('booktitle', ''))),
        jsondata=jsondata,
        bibtex_type=rec.genre,
        **fields)


def confirm(question, default=False):  # pragma: no cover
    """Ask a yes/no question via input() and return their answer.

    "question" is a string that is presented to the user.
    """
    while True:
        sys.stdout.write(question + (" [Y|n] " if default else " [y|N] "))
        choice = input().lower()
        if not choice:
            return default
        try:
            return strtobool(choice)
        except ValueError:
            sys.stdout.write(
                "Please respond with 'yes' or 'no' (or 'y' or 'n').\n")


def data_file(module, *comps):
    """Return Path object of file in the data directory of an app."""
    return Path(module.__file__).parent.joinpath('..', 'data', *comps)


def setup_session(config_uri, engine=None):
    setup_logging(config_uri)
    settings = get_appsettings(config_uri)
    engine = engine or engine_from_config(settings, 'sqlalchemy.')
    DBSession.configure(bind=engine)
    VersionedDBSession.configure(bind=engine)
    Base.metadata.create_all(engine)
    return Path(config_uri.split('#')[0]).resolve().parent.name


class ExistingDir(argparse.Action):  # pragma: no cover

    """Action to select an existing directory."""

    def __call__(self, parser, namespace, values, option_string=None):
        path_ = Path(values)
        if not path_.exists():
            raise argparse.ArgumentError(self, 'path does not exist')
        if not path_.is_dir():
            raise argparse.ArgumentError(self, 'path is no directory')
        setattr(namespace, self.dest, path_)


class ExistingConfig(argparse.Action):  # pragma: no cover

    """Action to select an existing config file."""

    def __call__(self, parser, namespace, values, option_string=None):
        path_ = Path(values.split('#')[0])
        if not path_.exists():
            raise argparse.ArgumentError(self, 'file does not exist')
        setattr(namespace, self.dest, values)


class SqliteDb(argparse.Action):  # pragma: no cover

    """Action to select an sqlite db to connect to."""

    def __call__(self, parser, namespace, values, option_string=None):
        setattr(namespace, 'engine', create_engine('sqlite:///%s' % values[0]))


def index(rsc, req, solr, query_options=None, batch_size=1000):
    query = DBSession.query(rsc).order_by(rsc.pk)
    if query_options:
        query = query.options(*query_options)

    for i in range(0, query.count(), batch_size):
        res = solr.update(
            [p.__solr__(req) for p in query.limit(batch_size).offset(i)],
            'json',
            commit=True)
        if res.status != 200:
            print(res.raw_content)  # pragma: no cover


def parsed_args(*arg_specs, **kw):  # pragma: no cover
    """pass a truthy value as keyword parameter bootstrap to bootstrap the app."""
    parser = argparse.ArgumentParser(description=kw.pop('description', None))
    parser.add_argument(
        "config_uri", action=ExistingConfig, help="ini file providing app config")
    parser.add_argument("--glottolog-dburi", default=None)
    parser.add_argument("--module", default=None)
    parser.add_argument(
        "--sqlite", nargs=1, action=SqliteDb, help="sqlite db file")
    for args, _kw in arg_specs:
        parser.add_argument(*args, **_kw)
    args = parser.parse_args(args=kw.pop('args', None))
    engine = getattr(args, 'engine', kw.get('engine', None))
    args.env = bootstrap(args.config_uri) if kw.get('bootstrap', False) else {}
    module = setup_session(args.config_uri, engine=engine)

    # make sure we create URLs in the correct domain
    if args.env:
        dataset = DBSession.query(common.Dataset).first()
        if dataset:
            args.env['request'].environ['HTTP_HOST'] = dataset.domain

    if module == 'tests':
        module = 'clld'
    args.module = __import__(args.module or module)
    args.log = logging.getLogger(args.module.__name__)
    if engine:
        args.log.info('using bind %s' % engine)
    args.data_file = partial(data_file, args.module)
    args.module_dir = Path(args.module.__file__).parent
    args.migrations_dir = Path(args.module.__file__).parent.joinpath('..', 'migrations')
    return args


def initializedb(*args, **kw):  # pragma: no cover
    create = kw.pop('create', None)
    prime_cache = kw.pop('prime_cache', None)
    args = list(args) + [(("--prime-cache-only",), dict(action="store_true"))]
    args = parsed_args(*args, **kw)
    if not args.prime_cache_only:
        if create:
            with transaction.manager:
                create(args)
    if prime_cache:
        with transaction.manager:
            prime_cache(args)


def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == 'cleanup':
        for fname in args.data_file('gbs').glob('*.json'):
            try:
                data = jsonlib.load(fname)
                if data.get('totalItems') == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source)\
            .order_by(common.Source.id)\
            .options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file('gbs', 'source%s.json' % source.id)

        if command == 'update':
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ['verify', 'update']:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn('no JSON object found in: %s' % filepath)
                    continue
                if not data['totalItems']:
                    continue
                item = data['items'][0]
            else:
                continue

        if command == 'verify':
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item['volumeInfo'].get('publishedDate', '').split('-')[0]
            if not year or year != slug(source.year or ''):
                needs_check = True
            twords = words(stitle)
            iwords = words(
                item['volumeInfo']['title'] + ' '
                + item['volumeInfo'].get('subtitle', ''))
            if twords == iwords \
                    or (len(iwords) > 2 and iwords.issubset(twords))\
                    or (len(twords) > 2 and twords.issubset(iwords)):
                needs_check = False
            if int(source.id) == 241:
                log.info('%s' % sorted(words(stitle)))
                log.info('%s' % sorted(iwords))
            if needs_check:
                log.info('------- %s -> %s' % (
                    source.id, item['volumeInfo'].get('industryIdentifiers')))
                log.info('%s %s' % (
                    item['volumeInfo']['title'], item['volumeInfo'].get('subtitle', '')))
                log.info(stitle)
                log.info(item['volumeInfo'].get('publishedDate'))
                log.info(source.year)
                log.info(item['volumeInfo'].get('authors'))
                log.info(source.author)
                log.info(item['volumeInfo'].get('publisher'))
                log.info(source.publisher)
                if not confirm('Are the records the same?'):
                    log.warn('---- removing ----')
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == 'update':
            source.google_book_search_id = item['id']
            source.update_jsondata(gbs=item)
            count += 1
        elif command == 'download':
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    'inauthor:' + quote_plus(source.author.encode('utf8')),
                    'intitle:' + quote_plus(title.encode('utf8')),
                ]
                if source.publisher:
                    q.append('inpublisher:' + quote_plus(
                        source.publisher.encode('utf8')))
                url = api_url + 'q=%s&key=%s' % ('+'.join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={'accept': 'application/json'})
                log.info('%s - %s' % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), 'w') as fp:
                        fp.write(r.text.encode('utf8'))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break
    if command == 'update':
        log.info('assigned gbs ids for %s out of %s sources' % (count, i))
    elif command == 'download':
        log.info('queried gbs for %s sources' % count)


[docs]class Data(defaultdict): """Dictionary, serving to store references to new db objects during data imports. The values are dictionaries, keyed by the name of the model class used to create the new objects. >>> data = Data() >>> l = data.add(common.Language, 'l', id='abc', name='Abc Language') >>> assert l == data['Language']['l'] """ def __init__(self, **kw): super(Data, self).__init__(dict) self.defaults = kw
[docs] def add(self, model, key, **kw): """ Create an instance of a model class to be persisted in the database. :param model: The model class we want to create an instance of. :param key: A key which can be used to retrieve the instance later. :param kw: Keyword parameters passed to model class for initialisation. :return: The newly created instance of model class. """ if '.' in kw.get('id', ''): raise ValueError('Object id contains illegal character "."') if list(kw.keys()) == ['_obj']: # if a single keyword parameter _obj is passed, we take it to be the object # which should be added to the session. new = kw['_obj'] else: for k, v in self.defaults.items(): kw.setdefault(k, v) new = model(**kw) self[model.__name__][key] = new DBSession.add(new) return new