# coding: utf8
"""
.. seealso:: http://ocoins.info/
"""
# note: don't import unicode_literals here!
import re
from six import PY3
if PY3: # pragma: no cover
from urllib.parse import urlencode
else:
from urllib import urlencode
from clld.util import UnicodeMixin
def _valid_quarter(val):
    """Return *val* if it denotes one of the quarters 1-4, else ``None``.

    Values which cannot be converted to an integer (e.g. ``'Q1'`` or ``None``)
    are rejected by returning ``None`` instead of letting the conversion error
    propagate - consistent with how the other validators in ``FIELDS`` signal
    invalid values.
    """
    try:
        return val if int(val) in (1, 2, 3, 4) else None
    except (TypeError, ValueError, OverflowError):
        return None


# Known keys, grouped by metadata format ('book', 'journal', 'dissertation') plus the
# keys shared by all formats ('any').
# A value of None means the key takes arbitrary values; a callable is a validator which
# returns the value passed in if it is valid and None otherwise.
FIELDS = {
    'any': {
        'rft_id': None,
        'aulast': None,  # First author's family name. This may be more than one word.
        # In many citations, the author's family name is recorded first and is followed
        # by a comma, i.e. Smith, Fred James is recorded as "aulast=smith"
        'aufirst': None,  # First author's given name or names or initials. This data
        # element may contain multiple words and punctuation, i.e. "Fred F", "Fred James"
        'auinit': None,  # First author's first and middle initials.
        'auinit1': None,  # First author's first initial.
        'auinitm': None,  # First author's middle initial.
        'ausuffix': None,  # First author's name suffix. Qualifiers on an author's name
        # such as "Jr.", "III" are entered here. i.e. Smith, Fred Jr. is recorded as
        # "ausuffix=jr"
        'au': None,  # This data element contains the full name of a single author, i. e.
        # "Smith, Fred M", "Harry S. Truman". (au is repeatable)
        'aucorp': None,  # Organization or corporation that is the author or creator of
        # the book, i.e. "Mellon Foundation"
        'title': None,  # Book title. Provided for compatibility with version 0.1.
        # Prefer btitle.
        'place': None,  # Place of publication. "New York"
        'pub': None,  # Publisher name. "Harper and Row"
        'date': None,  # Date of publication. Book dates are assumed to be a single year.
    },
    'dissertation': {
        'inst': None,
        'degree': None,
    },
    'book': {
        'btitle': None,  # The title of the book. This can also be expressed as title, for
        # compatibility with version 0.1. "moby dick or the white whale"
        'isbn': None,  # International Standard Book Number (ISBN). The ISBN is usually
        # presented as 9 digits plus a final check digit (which may be "X"), i.e.
        # "057117678X" but it may contain hyphens, i.e. "1-878067-73-7"
        'atitle': None,  # Chapter title. Chapter title is included if it is a distinct
        # title, i.e. "The Push Westward."
        'edition': None,  # Statement of the edition of the book. This will usually be a
        # phrase, with or without numbers, but may be a single number. I.e.
        # "First edition", "4th ed."
        'tpages': None,  # Total pages. Total pages is the largest recorded number of
        # pages, if this can be determined. I.e., "ix, 392 p." would be recorded as "392"
        # in tpages. This data element is usually available only for monographs (books and
        # printed reports). In some cases, tpages may not be numeric, i.e. "F36"
        'series': None,  # The title of a series in which the book or document was issued.
        # There may also be an ISSN associated with the series.
        'spage': None,  # First page number of a start/end (spage-epage) pair.
        # Note that pages are not always numeric.
        'epage': None,  # Second (ending) page number of a start/end (spage-epage) pair.
        'pages': None,  # Start and end pages for parts of a book, i.e. "124-147". This
        # can also be used for an unstructured pagination statement when data relating to
        # pagination cannot be interpreted as a start-end pair, i.e. "A7, C4-9", "1-3,6".
        # This data element includes the OpenURL 0.1 definition of "pages".
        'issn': None,  # International Standard Serials Number (ISSN). The issn may
        # contain a hyphen, i.e. "1041-5653". An ISSN in the book format is often
        # associated with a series title.
        'genre': lambda val: val if val in [
            'book',  # a publication that is complete in one part or a designated finite
            # number of parts, often identified with an ISBN.
            'bookitem',  # a defined section of a book, usually with a separate title or
            # number.
            'proceeding',  # a conference paper or proceeding published in a conference
            # publication.
            'conference',  # a publication bundling the proceedings of a conference.
            'report',  # report or technical report is a published document that is
            # issued by an organization, agency or government body.
            'document',  # general document type to be used when available data elements
            # do not allow determination of a more specific document type, i.e. when one
            # has only author and title but no publication information.
            'unknown',  # use when the genre of the document is unknown.
        ] else None,
    },
    'journal': {
        'rft_id': None,
        'atitle': None,  # Article title.
        'jtitle': None,  # Journal title. Use the most complete title available.
        # Abbreviated titles, when known, are recorded in stitle. This can also be
        # expressed as title, for compatibility with version 0.1. "journal of the
        # american medical association"
        'stitle': None,  # Abbreviated or short journal title. This is used for journal
        # title abbreviations, where known, i.e. "J Am Med Assn"
        'volume': None,  # Volume designation. Volume is usually expressed as a number but
        # could be roman numerals or non-numeric, i.e. "124", or "VI".
        'issue': None,  # This is the designation of the published issue of a journal,
        # corresponding to the actual physical piece in most cases. While usually numeric,
        # it could be non-numeric. Note that some publications use chronology in the place
        # of enumeration, i.e. Spring, 1998.
        'spage': None,  # First page number of a start/end (spage-epage) pair. Note that
        # pages are not always numeric.
        'epage': None,  # Second (ending) page number of a start/end (spage-epage) pair.
        'pages': None,  # Start and end pages, i.e. "53-58". This can also be used for an
        # unstructured pagination statement when data relating to pagination cannot be
        # interpreted as a start-end pair, i.e. "A7, C4-9", "1-3,6". This data element
        # includes the OpenURL 0.1 definition of "pages".
        'artnum': None,  # Article number assigned by the publisher. Article numbers are
        # often generated for publications that do not have usable pagination, in
        # particular electronic journal articles, i.e. "unifi000000090". A URL may be the
        # only usable identifier for an online article, in which case the URL can be
        # treated as an identifier for the article (i.e.
        # "rft_id=http://www.firstmonday.org/ issues/issue6_2/odlyzko/ index.html").
        'issn': None,  # International Standard Serials Number (ISSN). The issn may
        # contain a hyphen, i.e. "1041-5653"
        'eissn': None,  # ISSN for electronic version of the journal. Although there is no
        # distinction by format in the assignment of ISSNs, some bibliographic services
        # now carry both the ISSN for the paper version and a separate ISSN for the
        # electronic version. This data element is included here to allow the OpenURL to
        # carry both ISSNs and distinguish them.
        'isbn': None,  # International Standard Book Number (ISBN). The ISBN is usually
        # presented as 9 digits plus a final check digit (which may be "X"), i.e.
        # "057117678X" but it may contain hyphens, i.e. "1-878067-73-7"
        'coden': None,  # CODEN
        'sici': None,  # Serial Item and Contribution Identifier (SICI)
        'genre': lambda val: val if val in [
            'issue',  # one instance of the serial publication.
            'article',  # a document published in a journal.
            'proceeding',  # a single conference presentation published in a journal or
            # serial publication
            'conference',  # a record of a conference that includes one or more conference
            # papers and that is published as an issue of a journal or serial publication
            'preprint',  # an individual paper or report published in paper or
            # electronically prior to its publication in a journal or serial.
            'unknown',  # use when the genre of the document is unknown.
        ] else None,
        'chron': None,  # Enumeration or chronology in not-normalized form, i.e.
        # "1st quarter". Where numeric dates are also available, place the numeric portion
        # in the "date" Key. So a recorded date of publication of "1st quarter 1992"
        # becomes date=1992&chron=1st quarter. Normalized indications of chronology can be
        # provided in the ssn and quarter Keys.
        'ssn': lambda val: val if val in ['spring', 'summer', 'fall', 'winter'] else None,
        # Season (chronology). Legitimate values are spring, summer, fall, winter
        'quarter': _valid_quarter,
        # Quarter (chronology). Legitimate values are 1, 2, 3, 4 (as number or numeric
        # string); anything else - including non-numeric input - is rejected.
        'part': None,  # Part can be a special subdivision of a volume or it can be the
        # highest level division of the journal. Parts are often designated with letters
        # or names, i.e. "B", "Supplement".
    },
}
def _encoded(value):
if not isinstance(value, basestring):
value = '%s' % value
if isinstance(value, unicode):
return value.encode('utf8')
try:
value.decode('utf8')
return value
except UnicodeDecodeError:
return value.decode('latin1').encode('utf8')
class ContextObject(list, UnicodeMixin):
    """
    List of ``(key, value)`` pairs describing a referent, which can be serialized as
    url-encoded query string with ``ctx_ver=Z39.88-2004`` and used as ``title``
    attribute of a ``span`` element with class ``Z3988``.

    >>> c = ContextObject('sid', 'journal', ('jtitle', '\xe2'))
    >>> assert '%C3%A2' in c.span_attrs()['title']
    >>> c = ContextObject('sid', 'journal', ('jtitle', u'\xe2'))
    >>> assert '%C3%A2' in c.span_attrs()['title']
    """
    # Maps BibTeX entry types to pairs (metadata format, genre).
    _BIBTEX_TYPES = {
        'article': ('journal', 'article'),
        'book': ('book', 'book'),
        'inbook': ('book', 'bookitem'),
        'incollection': ('book', 'bookitem'),
        'inproceedings': ('book', 'proceeding'),
        'conference': ('book', 'proceeding'),
        'mastersthesis': ('dissertation', None),
        'phdthesis': ('dissertation', None),
        'proceedings': ('book', 'conference'),
        'techreport': ('book', 'report'),
        'unpublished': ('book', 'document'),
        'misc': ('book', 'unknown'),
    }
    # Maps BibTeX field names to the keys they are copied to unchanged.
    _BIBTEX_FIELDS = {
        'address': 'place',
        'publisher': 'pub',
        'year': 'date',
        'volume': 'volume',
        'number': 'issue',
        'series': 'series',
        'edition': 'edition',
        'pages': 'pages',
    }

    def __init__(self, sid, mtx, *data):
        """
        :param sid: Identifier of the referrer, serialized as ``info:sid/<sid>``.
        :param mtx: Name of the metadata format; must be a key of ``FIELDS``.
        :param data: ``(key, value)`` pairs. Keys not starting with ``rft`` are \
        prefixed with ``rft.``. Pairs whose value is rejected by the validator \
        registered for the key are dropped.
        """
        self.sid = sid
        self.mtx = mtx
        list.__init__(self)
        for key, val in data:
            # Format specific validators take precedence over the generic ones; keys
            # without validator are just formatted as string.
            validator = FIELDS[self.mtx].get(key, FIELDS['any'].get(key)) \
                or (lambda v: '%s' % v)
            val = validator(val)
            if val is None:
                # Validators signal invalid values by returning None. Such values must
                # not end up as literal "None" in the serialized object.
                continue
            if not key.startswith('rft'):
                key = 'rft.' + key
            self.append((key, val))

    @classmethod
    def from_bibtex(cls, sid, rec):
        """Create a ContextObject from a BibTeX record.

        :param sid: Identifier of the referrer.
        :param rec: Record object providing an attribute ``genre`` (a string or an \
        object with attribute ``value``), dict-like access to the BibTeX fields \
        (``in``, ``[]``, ``get``) and a method ``getall`` returning all values of a field.
        :return: ``ContextObject`` instance.
        """
        bibtex_type = getattr(rec.genre, 'value', rec.genre)  # allow EnumSymbol as genre.
        mtx, genre = cls._BIBTEX_TYPES.get(
            bibtex_type, ('book', 'document' if rec.get('author') else 'unknown'))

        data = [('genre', genre)] if genre else []

        if mtx == 'journal':
            if 'title' in rec:
                data.append(('atitle', rec['title']))
            if 'journal' in rec:
                data.append(('jtitle', rec['journal']))
        elif mtx == 'book':
            if 'title' in rec:
                # For parts of a book, the title is the title of the part.
                data.append(('btitle' if genre == 'book' else 'atitle', rec['title']))
            if 'booktitle' in rec and genre != 'book':
                data.append(('btitle', rec['booktitle']))
        elif mtx == 'dissertation':
            if 'title' in rec:
                data.append(('title', rec['title']))
            data.append(('degree', 'phd' if bibtex_type == 'phdthesis' else 'masters'))
            data.append(('inst', rec.get('school', rec.get('institution', ''))))

        if 'url' in rec:
            data.append(('rft_id', rec['url']))

        for bibfield, openurlfield in cls._BIBTEX_FIELDS.items():
            if bibfield in rec:
                data.append((openurlfield, rec[bibfield]))

        for i, author in enumerate(rec.getall('author')):
            if i > 0:
                # Only the first author is split into name parts.
                data.append(('au', author))
                continue
            # Surrounding whitespace would otherwise yield empty name parts.
            name = author.strip()
            parts = re.split(r'\s*,\s*', name, 1)
            if len(parts) == 1:
                # No comma: assume "first [middle ...] last".
                parts = re.split(r'\s+', name)
                last = parts[-1]
                first = ' '.join(parts[:-1])
            else:
                last, first = parts
            data.append(('aulast', last))
            data.append(('aufirst', first))

        # The following BibTeX fields are not mapped:
        #
        # annote: An annotation for annotated bibliography styles (not typical)
        # chapter: The chapter number
        # crossref: The key of the cross-referenced entry
        # editor: The name(s) of the editor(s)
        # eprint: A specification of an electronic publication, often a preprint or a
        #   technical report
        # howpublished: How it was published, if the publishing method is nonstandard
        # institution: The institution that was involved in the publishing, but not
        #   necessarily the publisher
        # key: A hidden field used for specifying or overriding the alphabetical order of
        #   entries (when the "author" and "editor" fields are missing). Note that this
        #   is very different from the key (mentioned just after this list) that is used
        #   to cite or cross-reference the entry.
        # month: The month of publication (or, if unpublished, the month of creation)
        # note: Miscellaneous extra information
        # organization: The conference sponsor
        # type: The field overriding the default type of publication (e.g. "Research Note"
        #   for techreport, "{PhD} dissertation" for phdthesis, "Section" for
        #   inbook/incollection)
        return cls(sid, mtx, *data)

    def __unicode__(self):
        """Return the object serialized as url-encoded string of key-value pairs."""
        pairs = [
            ('ctx_ver', 'Z39.88-2004'),
            ('rft_val_fmt', 'info:ofi/fmt:kev:mtx:' + _encoded(self.mtx)),
            ('rfr_id', 'info:sid/' + _encoded(self.sid))]
        pairs.extend((_encoded(pair[0]), _encoded(pair[1])) for pair in self)
        return urlencode(pairs)

    def span_attrs(self):
        """Return the HTML attributes of a ``span`` element representing the object."""
        return {'class': 'Z3988', 'title': self.__unicode__()}