"""
functionality to gather information about iso-639-3 codes from sil.org
"""
#"""
#http://www.sil.org/iso639-3/download.asp
#
#http://www-01.sil.org/iso639-3/iso-639-3_20130531.tab
#CREATE TABLE [ISO_639-3] (
#Id char(3) NOT NULL, -- The three-letter 639-3 identifier
#Part2B char(3) NULL, -- Equivalent 639-2 identifier of the bibliographic
# -- applications code set, if there is one
#Part2T char(3) NULL, -- Equivalent 639-2 identifier of the terminology
# -- applications code set, if there is one
#Part1 char(2) NULL, -- Equivalent 639-1 identifier, if there is one
#Scope char(1) NOT NULL, -- I(ndividual), M(acrolanguage), S(pecial)
#Type char(1) NOT NULL, -- A(ncient), C(onstructed),
# -- E(xtinct), H(istorical), L(iving), S(pecial)
#Ref_Name varchar(150) NOT NULL, -- Reference language name
#Comment varchar(150) NULL) -- Comment relating to one or more of the columns
#
#
#http://www.sil.org/iso639-3/iso-639-3_Name_Index_20130520.tab
#CREATE TABLE [ISO_639-3_Names] (
#Id char(3) NOT NULL, -- The three-letter 639-3 identifier
#Print_Name varchar(75) NOT NULL, -- One of the names associated with this identifier
#Inverted_Name varchar(75) NOT NULL) -- The inverted form of this Print_Name form
#
#
#http://www-01.sil.org/iso639-3/iso-639-3-macrolanguages_20130314.tab
#CREATE TABLE [ISO_639-3_Macrolanguages] (
#M_Id char(3) NOT NULL, -- The identifier for a macrolanguage
#I_Id char(3) NOT NULL, -- The identifier for an individual language
# -- that is a member of the macrolanguage
#I_Status char(1) NOT NULL) -- A (active) or R (retired) indicating the
# -- status of the individual code element
#
#
#http://www-01.sil.org/iso639-3/iso-639-3_Retirements_20130531.tab
#CREATE TABLE [ISO_639-3_Retirements] (
#Id char(3) NOT NULL, -- The three-letter 639-3 identifier
#Ref_Name varchar(150) NOT NULL, -- reference name of language
#Ret_Reason char(1) NOT NULL, -- code for retirement: C (change), D (duplicate),
# -- N (non-existent), S (split), M (merge)
#Change_To char(3) NULL, -- in the cases of C, D, and M, the identifier
# -- to which all instances of this Id should be changed
#Ret_Remedy varchar(300) NULL, -- The instructions for updating an instance
# -- of the retired (split) identifier
#Effective date NOT NULL) -- The date the retirement became effective
#"""
import re
import requests
from bs4 import BeautifulSoup as bs
from clld.lib import dsv
# Matches the file names of the tab-separated download files, e.g.
# ``iso-639-3_20130531.tab`` or ``iso-639-3_Name_Index_20130520.tab``.
# The ``name`` group captures the optional table suffix (absent for the main
# code table) and the ``date`` group the optional 8-digit stamp.
# Raw strings are used so that ``\.`` reaches the regex engine as written
# instead of being an invalid string escape sequence.
TAB_NAME_PATTERN = re.compile(
    r'iso-639-3(?P<name>_Name_Index|-macrolanguages|_Retirements)'
    r'?(_(?P<date>[0-9]{8}))?\.tab$')
def get(path):
    """Retrieve a resource from the sil site and return its representation.

    :param path: location of the resource, relative to \
    ``http://www.sil.org/iso639-3/``.
    :return: the ``content`` attribute of the ``requests`` response.
    """
    url = "http://www.sil.org/iso639-3/" + path
    response = requests.get(url)
    return response.content
def get_taburls():
    """Look up the current (date-stamped) download file names on sil's download page.

    Every link on ``download.asp`` whose ``href`` matches ``TAB_NAME_PATTERN``
    is recorded under a short table name.

    :return: dict mapping ``'codes'``, ``'names'``, ``'macrolanguages'`` and \
    ``'retired'`` to the matching ``href`` values; a table without a matching \
    link is simply absent.
    """
    # Translate the regex's ``name`` group into the key used in the result;
    # ``None`` means the group did not take part in the match (main code table).
    table_names = {
        None: 'codes',
        '_Name_Index': 'names',
        '-macrolanguages': 'macrolanguages',
        '_Retirements': 'retired',
    }
    page = bs(get('download.asp'))
    urls = {}
    for link in page.find_all('a', href=True):
        href = link['href']
        found = TAB_NAME_PATTERN.match(href)
        if found is None:
            continue
        # A later link for the same table replaces an earlier one.
        urls[table_names.get(found.group('name'))] = href
    return urls
def get_tab(name):
    """Return a ``dsv.reader`` over the lines of the tab file specified by name.

    :param name: key into the dict returned by ``get_taburls``.
    :return: whatever ``dsv.reader`` returns when called with \
    ``namedtuples=True`` (presumably an iterator of namedtuple rows -- confirm \
    against ``clld.lib.dsv``).
    :raises KeyError: if the download page listed no file for ``name``.
    """
    tab_path = get_taburls()[name]
    # NOTE(review): get() hands back the response's ``content``; on Python 3
    # that is ``bytes`` and splitting it on a ``str`` separator would raise
    # TypeError -- confirm which Python version this module targets.
    lines = get(tab_path).split('\n')
    return dsv.reader(lines, namedtuples=True)
def _text(e):
    """Return the ``text`` of ``e`` with leading and trailing whitespace removed."""
    content = e.text
    return content.strip()
def get_documentation(code):
    """Scrape information about an ISO 639-3 code from its documentation page.

    The first table of ``documentation.asp?id=<code>`` is read row by row;
    each row must consist of exactly two cells, which become one key/value
    pair of the result.

    :param code: the identifier, appended verbatim to the query string.
    :return: dict mapping the stripped text of each row's first cell to the \
    stripped text of its second cell.
    :raises IndexError: if the page has no ``h1`` or no ``table`` element.
    :raises AssertionError: if ``code`` does not occur in the first heading, \
    or a table row does not have exactly two ``td`` cells.
    """
    page = bs(get('documentation.asp?id=' + code))

    # Sanity check: the first heading of the page must mention the requested code.
    heading = page.find_all('h1', limit=1)[0]
    assert code in heading.text

    first_table = page.find_all('table', limit=1)[0]
    info = {}
    for row in first_table.find_all('tr'):
        cells = row.find_all('td')
        assert len(cells) == 2
        label_cell, value_cell = cells
        info[_text(label_cell)] = _text(value_cell)
    return info