Source code for clld.lib.iso

"""Functionality to gather information about iso-639-3 codes from sil.org."""
# http://www-01.sil.org/iso639-3/iso-639-3_20130531.tab
#
# Id      char(3) NOT NULL,  -- The three-letter 639-3 identifier
# Part2B  char(3) NULL,      -- Equivalent 639-2 identifier of the bibliographic
#                               applications
#                            -- code set, if there is one
# Part2T  char(3) NULL,      -- Equivalent 639-2 identifier of the terminology
#                               applications code
#                            -- set, if there is one
# Part1   char(2) NULL,      -- Equivalent 639-1 identifier, if there is one
# Scope   char(1) NOT NULL,  -- I(ndividual), M(acrolanguage), S(pecial)
# Type    char(1) NOT NULL,  -- A(ncient), C(onstructed),
#                            -- E(xtinct), H(istorical), L(iving), S(pecial)
# Ref_Name   varchar(150) NOT NULL,   -- Reference language name
# Comment    varchar(150) NULL)       -- Comment relating to one or more of the columns
#
#
# http://www.sil.org/iso639-3/iso-639-3_Name_Index_20130520.tab
#
# Id             char(3)     NOT NULL,  -- The three-letter 639-3 identifier
# Print_Name     varchar(75) NOT NULL,  -- One of the names associated with this
#                                       -- identifier
# Inverted_Name  varchar(75) NOT NULL)  -- The inverted form of this Print_Name form
#
#
# http://www-01.sil.org/iso639-3/iso-639-3-macrolanguages_20130314.tab
#
# M_Id      char(3) NOT NULL,   -- The identifier for a macrolanguage
# I_Id      char(3) NOT NULL,   -- The identifier for an individual language
#                               -- that is a member of the macrolanguage
# I_Status  char(1) NOT NULL)   -- A (active) or R (retired) indicating the
#                               -- status of the individual code element
#
#
# http://www-01.sil.org/iso639-3/iso-639-3_Retirements_20130531.tab
#
# Id          char(3)      NOT NULL,     -- The three-letter 639-3 identifier
# Ref_Name    varchar(150) NOT NULL,     -- reference name of language
# Ret_Reason  char(1)      NOT NULL,     -- code for retirement: C (change),
#                                        -- D (duplicate), N (non-existent), S (split),
#                                        -- M (merge)
# Change_To   char(3)      NULL,         -- in the cases of C, D, and M, the identifier
#                                        -- to which all instances of this Id should be
#                                        -- changed
# Ret_Remedy  varchar(300) NULL,         -- The instructions for updating an instance
#                                        -- of the retired (split) identifier
# Effective   date         NOT NULL)     -- The date the retirement became effective
#
import re

import requests
from bs4 import BeautifulSoup as bs

from clldutils import dsv


TAB_NAME_PATTERN = re.compile(
    'iso-639-3(?P<name>_Name_Index|\-macrolanguages|_Retirements)'
    '?(_(?P<date>[0-9]{8}))?\.tab$')


[docs]def get(path): """Retrieve a resource from the sil site and return it's representation.""" return requests.get("http://www-01.sil.org/iso639-3/" + path).content
[docs]def get_taburls(): """Retrieve the current (date-stamped) file names for download files from sil.""" soup = bs(get('download.asp'), "html5lib") name_map = { None: 'codes', '_Name_Index': 'names', '-macrolanguages': 'macrolanguages', '_Retirements': 'retired', } res = {} for a in soup.find_all('a', href=True): match = TAB_NAME_PATTERN.match(a['href']) if match: res[name_map.get(match.group('name'))] = a['href'] return res
[docs]def get_tab(name): """Generator for entries in a tab file specified by name.""" return dsv.reader( get(get_taburls()[name]).split('\n'), namedtuples=True, delimiter='\t')
def _text(e): return e.text.strip()
[docs]def get_documentation(code): """Scrape information about a iso 639-3 code from the documentation page.""" soup = bs(get('documentation.asp?id=' + code), "html5lib") assert code in soup.find_all('h1', limit=1)[0].text info = {} for table in soup.find_all('table'): for tr in table.find_all('tr'): tds = tr.find_all('td') if len(tds) == 2: info[_text(tds[0]).replace(':', '')] = _text(tds[1]) return info