"""Utilities for clld scripts (clld.scripts.util)."""

import os
import sys
from distutils.util import strtobool
from collections import defaultdict
import argparse
from urllib import quote_plus
import json
import logging
from functools import partial

import transaction
from sqlalchemy import engine_from_config, create_engine
from sqlalchemy.orm import joinedload
from path import path
from pyramid.paster import get_appsettings, setup_logging, bootstrap
import requests

from clld.db.meta import VersionedDBSession, DBSession, Base
from clld.db.models import common
from clld.util import slug
from clld.interfaces import IDownload
from clld.lib import bibtex


def glottocodes_by_isocode(dburi, cols=['id']):
    """Query Glottolog

    :dburi: If not None, sqlalchemy dburi for a glottolog database. If None, \
    glottolog.org will be queried.
    :cols: list of column/attribute names for which information should be gathered.
    :return: dict mapping iso639-3 codes to glottolog data.
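
    A minimal usage sketch (requires network access, hence skipped as doctest;
    the looked-up code and result shown are illustrative):

    >>> gc = glottocodes_by_isocode(None, cols=['id'])  # doctest: +SKIP
    >>> gc['deu']  # doctest: +SKIP
    'stan1295'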
    """
    glottocodes = {}
    if dburi:
        select = ', '.join('l.%s' % name for name in cols)
        glottolog = create_engine(dburi)
        for row in glottolog.execute(
                'select ll.hid, %s from language as l, languoid as ll where l.pk = ll.pk'
                % select):
            glottocodes[row[0]] = row[1] if len(row) == 2 else row[1:]
    else:
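        # converters per column: identity by default, float for coordinates.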
        conv = defaultdict(lambda: lambda x: x, latitude=float, longitude=float)
        res = requests.get("http://glottolog.org/resourcemap.json?rsc=language")
        for rsc in res.json()['resources']:
            for id_ in rsc.get('identifiers', []):
                if id_['type'] == 'iso639-3':
                    row = [conv[col](rsc.get(col)) if rsc.get(col) is not None else None
                           for col in cols]
                    glottocodes[id_['identifier']] = row[0] if len(row) == 1 \
                        else tuple(row)
                    break
    return glottocodes


def add_language_codes(data, lang, isocode, glottocodes=None):
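    """Add identifiers to the database for a language.

    Creates and links an ISO 639-3 identifier, plus a Glottolog identifier if a
    glottocode can be looked up for the ISO code.
    """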
    def identifier(type_, id_):
        return data.add(
            common.Identifier, '%s:%s' % (type_, id_),
            id='%s:%s' % (type_, id_),
            name=id_,
            type=getattr(common.IdentifierType, type_).value)

    if isocode and len(isocode) == 3:
        DBSession.add(common.LanguageIdentifier(
            language=lang, identifier=identifier('iso', isocode)))

        if glottocodes and isocode in glottocodes:
            DBSession.add(common.LanguageIdentifier(
                language=lang, identifier=identifier('glottolog', glottocodes[isocode])))


def bibtex2source(rec):
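    """Create a new ``common.Source`` instance from a ``bibtex.Record``.

    Bibtex fields matching columns of the Source model are passed as such; all
    other fields are stored in the instance's ``jsondata``.
    """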
    year = bibtex.unescape(rec.get('year', 'nd'))
    fields = {}
    jsondata = {}
    for field in bibtex.FIELDS:
        if field in rec:
            value = bibtex.unescape(rec[field])
            container = fields if hasattr(common.Source, field) else jsondata
            container[field] = value

    return common.Source(
        id=slug(rec.id),
        name=('%s %s' % (bibtex.unescape(
            rec.get('author', rec.get('editor', ''))), year)).strip(),
        description=bibtex.unescape(rec.get('title', rec.get('booktitle', ''))),
        jsondata=jsondata,
        bibtex_type=rec.genre,
        **fields)


def confirm(question, default=False):  # pragma: no cover
    """Ask a yes/no question via raw_input() and return their answer.

    "question" is a string that is presented to the user.
    """
    while True:
        sys.stdout.write(question + (" [Y|n] " if default else " [y|N] "))
        choice = raw_input().lower()
        if not choice:
            return default
        try:
            return strtobool(choice)
        except ValueError:
            sys.stdout.write(
                "Please respond with 'yes' or 'no' (or 'y' or 'n').\n")


def data_file(module, *comps):
    """
    >>> assert data_file(common)
    """
    return path(module.__file__).dirname().joinpath('..', 'data', *comps)


def setup_session(config_uri, engine=None):
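    """Configure logging and the database sessions, and create the db schema.

    :param config_uri: path to an app's ini file (optionally followed by ``#section``).
    :param engine: optional engine to bind the sessions to; if None, an engine is \
    created from the app settings.
    :return: the name of the directory containing the config file, which is taken \
    to be the name of the app.
    """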
    setup_logging(config_uri)
    settings = get_appsettings(config_uri)
    engine = engine or engine_from_config(settings, 'sqlalchemy.')
    DBSession.configure(bind=engine)
    VersionedDBSession.configure(bind=engine)
    Base.metadata.create_all(engine)
    return path(config_uri.split('#')[0]).abspath().dirname().basename()


class ExistingDir(argparse.Action):  # pragma: no cover
    def __call__(self, parser, namespace, values, option_string=None):
        path_ = path(values)
        if not path_.exists():
            raise argparse.ArgumentError(self, 'path does not exist')
        if not path_.isdir():
            raise argparse.ArgumentError(self, 'path is not a directory')
        setattr(namespace, self.dest, path_)


class ExistingConfig(argparse.Action):  # pragma: no cover
    def __call__(self, parser, namespace, values, option_string=None):
        path_ = path(values.split('#')[0])
        if not path_.exists():
            raise argparse.ArgumentError(self, 'file does not exist')
        setattr(namespace, self.dest, values)


class SqliteDb(argparse.Action):  # pragma: no cover
    def __call__(self, parser, namespace, values, option_string=None):
        setattr(namespace, 'engine', create_engine('sqlite:///%s' % values[0]))


def index(rsc, req, solr, query_options=None, batch_size=1000):
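    """Send resources of type ``rsc`` to a solr index, in batches.

    :param rsc: model class of the resources to be indexed.
    :param req: the current request, passed to each object's ``__solr__`` method.
    :param solr: solr connection object providing an ``update`` method.
    :param query_options: optional sqlalchemy query options, e.g. eager joins.
    :param batch_size: number of documents sent to solr per update request.
    """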
    query = DBSession.query(rsc).order_by(rsc.pk)
    if query_options:
        query = query.options(*query_options)

    for i in range(0, query.count(), batch_size):
        res = solr.update(
            [p.__solr__(req) for p in query.limit(batch_size).offset(i)],
            'json',
            commit=True)
        if res.status != 200:
            print res.raw_content  # pragma: no cover


def parsed_args(*arg_specs, **kw):  # pragma: no cover
    """pass a truthy value as keyword parameter bootstrap to bootstrap the app.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "config_uri", action=ExistingConfig, help="ini file providing app config")
    parser.add_argument("--glottolog-dburi", default=None)
    parser.add_argument("--module", default=None)
    parser.add_argument(
        "--sqlite", nargs=1, action=SqliteDb, help="sqlite db file")
    for args, _kw in arg_specs:
        parser.add_argument(*args, **_kw)
    args = parser.parse_args(args=kw.pop('args', None))
    engine = getattr(args, 'engine', kw.get('engine', None))
    args.env = bootstrap(args.config_uri) if kw.get('bootstrap', False) else {}
    module = setup_session(args.config_uri, engine=engine)

    # make sure we create URLs in the correct domain
    if args.env:
        dataset = DBSession.query(common.Dataset).first()
        if dataset:
            args.env['request'].environ['HTTP_HOST'] = dataset.domain

    if module == 'tests':
        module = 'clld'
    args.module = __import__(args.module or module)
    args.log = logging.getLogger(args.module.__name__)
    if engine:
        args.log.info('using bind %s' % engine)
    args.data_file = partial(data_file, args.module)
    args.module_dir = path(args.module.__file__).dirname()
    args.migrations_dir = path(
        args.module.__file__).dirname().joinpath('..', 'migrations')
    return args


def initializedb(create=None, prime_cache=None, **kw):  # pragma: no cover
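    """Run the ``create`` and/or ``prime_cache`` callables within transactions.

    With the option ``--prime-cache-only`` the ``create`` step is skipped.
    """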
    args = parsed_args((("--prime-cache-only",), dict(action="store_true")), **kw)
    if not args.prime_cache_only:
        if create:
            with transaction.manager:
                create(args)
    if prime_cache:
        with transaction.manager:
            prime_cache(args)


def create_downloads(**kw):  # pragma: no cover
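    """Create all downloads registered for the IDownload interface."""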
    args = parsed_args(bootstrap=True)
    for name, download in args.env['registry'].getUtilitiesFor(IDownload):
        args.log.info('creating download %s' % name)
        download.create(args.env['request'])


def gbs(**kw):  # pragma: no cover
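    """Command line utility for Google Book Search integration.

    Expects one of the commands ``download``, ``verify`` or ``update`` (and an API
    key for ``download``), and delegates the work to :func:`gbs_func`.
    """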
    add_args = [
        (("command",), dict(help="download|verify|update")),
        (("--api-key",), dict(default=kw.get('key', os.environ.get('GBS_API_KEY')))),
    ]

    args = parsed_args(*add_args, **kw)
    if args.command == 'download' and not args.api_key:
        raise argparse.ArgumentError(None, 'no API key found for download')

    with transaction.manager:
        gbs_func(args.command, args, kw.get('sources'))


def gbs_func(command, args, sources=None):  # pragma: no cover
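    """Run Google Book Search functionality for an app's sources.

    - ``download``: queries the GBS API per source and caches the JSON responses.
    - ``verify``: interactively checks cached responses against source metadata.
    - ``update``: stores id and metadata of cached matches with the sources.
    """
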
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    i = -1  # make sure `i` is defined below even if `sources` turns out to be empty
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if not sources:
        sources = DBSession.query(common.Source)\
            .order_by(common.Source.id)\
            .options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(sources):
        filepath = args.data_file('gbs', 'source%s.json' % source.id)

        if command == 'update':
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ['verify', 'update']:
            if filepath.exists():
                with open(filepath) as fp:
                    try:
                        data = json.load(fp)
                    except ValueError:
                        log.warn('no JSON object found in: %s' % filepath)
                        continue
                if not data['totalItems']:
                    continue
                item = data['items'][0]
            else:
                continue

        if command == 'verify':
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item['volumeInfo'].get('publishedDate', '').split('-')[0]
            if not year or year != slug(source.year or unicode('')):
                needs_check = True
            twords = words(stitle)
            iwords = words(
                item['volumeInfo']['title'] + ' '
                + item['volumeInfo'].get('subtitle', ''))
            if twords == iwords \
                    or (len(iwords) > 2 and iwords.issubset(twords))\
                    or (len(twords) > 2 and twords.issubset(iwords)):
                needs_check = False
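            # hardcoded diagnostic output for a single source, apparently left
            # over from debugging: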
            if int(source.id) == 241:
                log.info('%s' % sorted(list(words(stitle))))
                log.info('%s' % sorted(list(iwords)))
            if needs_check:
                log.info('------- %s -> %s' % (
                    source.id, item['volumeInfo'].get('industryIdentifiers')))
                log.info('%s %s' % (
                    item['volumeInfo']['title'], item['volumeInfo'].get('subtitle', '')))
                log.info(stitle)
                log.info(item['volumeInfo'].get('publishedDate'))
                log.info(source.year)
                log.info(item['volumeInfo'].get('authors'))
                log.info(source.author)
                log.info(item['volumeInfo'].get('publisher'))
                log.info(source.publisher)
                if not confirm('Are the records the same?'):
                    log.warn('---- removing ----')
                    with open(filepath, 'w') as fp:
                        json.dump({"totalItems": 0}, fp)
        elif command == 'update':
            source.google_book_search_id = item['id']
            source.update_jsondata(gbs=item)
            count += 1
        elif command == 'download':
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    'inauthor:' + quote_plus(source.author.encode('utf8')),
                    'intitle:' + quote_plus(title.encode('utf8')),
                ]
                if source.publisher:
                    q.append('inpublisher:' + quote_plus(
                        source.publisher.encode('utf8')))
                url = api_url + 'q=%s&key=%s' % ('+'.join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={'accept': 'application/json'})
                log.info('%s - %s' % (r.status_code, url))
                if r.status_code == 200:
                    with open(filepath, 'w') as fp:
                        fp.write(r.text.encode('utf8'))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break
    if command == 'update':
        log.info('assigned gbs ids for %s out of %s sources' % (count, i + 1))
    elif command == 'download':
        log.info('queried gbs for %s sources' % count)


class Data(defaultdict):
    """Dictionary, serving to store references to new db objects during data imports.

    The values are dictionaries, keyed by the name of the mapper class used to create
    the new objects.

    >>> d = Data()
    >>> assert d['k'] == {}
    """
    def __init__(self, **kw):
        super(Data, self).__init__(dict)
        self.defaults = kw

    def add(self, model, key, **kw):
        if '.' in kw.get('id', ''):
            raise ValueError('Object id contains illegal character "."')
        if kw.keys() == ['_obj']:
            # if a single keyword parameter _obj is passed, we take it to be the object
            # which should be added to the session.
            new = kw['_obj']
        else:
            for k, v in self.defaults.items():
                kw.setdefault(k, v)
            new = model(**kw)
        self[model.mapper_name()][key] = new
        DBSession.add(new)
        return new
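
# A minimal usage sketch for Data.add (key and attribute values are
# hypothetical, for illustration only; we assume common.Language.mapper_name()
# returns 'Language', as for the other clld model classes):
#
#     data = Data()
#     lang = data.add(common.Language, 'l1', id='l1', name='Example Language')
#     assert data['Language']['l1'] is lang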