from langcodes.util import data_filename LIST_KEYS = {'Description', 'Prefix'} def parse_file(file): """ Take an open file containing the IANA subtag registry, and yield a dictionary of information for each subtag it describes. """ lines = [] for line in file: line = line.rstrip('\n') if line == '%%': # This is a separator between items. Parse the data we've # collected and yield the result. yield from parse_item(lines) lines.clear() elif line.startswith(' '): # This is a continuation line. Concatenate it to the previous # line, including one of the spaces. lines[-1] += line[1:] else: lines.append(line) yield from parse_item(lines) def parse_item(lines): """ Given the lines that form a subtag entry (after joining wrapped lines back together), parse the data they contain. Returns a generator that yields once if there was any data there (and an empty generator if this was just the header). """ info = {} for line in lines: key, value = line.split(': ', 1) if key in LIST_KEYS: info.setdefault(key, []).append(value) else: assert key not in info info[key] = value if 'Subtag' in info or 'Tag' in info: yield info def parse_registry(): """ Yield a sequence of dictionaries, containing the info in the included IANA subtag registry file. """ with open( data_filename('language-subtag-registry.txt'), encoding='utf-8' ) as data_file: # 'yield from' instead of returning, so that we only close the file # when finished. yield from parse_file(data_file)