SickGear/lib/langcodes/registry_parser.py

60 lines
1.7 KiB
Python
Raw Normal View History

from langcodes.util import data_filename
# Registry field names that may appear more than once within a single
# record. parse_item() gathers the values of these keys into a list
# instead of storing a single string.
LIST_KEYS = {
    'Description',
    'Prefix',
}
def parse_file(file):
    """
    Read the IANA subtag registry from an open text file and yield one
    dictionary per subtag record found in it.

    The file is consumed line by line. Lines are accumulated until a '%%'
    separator (or the end of the file) is reached, at which point the
    accumulated record is handed to parse_item() and its results are
    yielded.
    """
    pending = []
    for raw in file:
        text = raw.rstrip('\n')

        if text == '%%':
            # Record separator: emit whatever the finished record
            # produces, then start collecting the next one.
            yield from parse_item(pending)
            pending = []
            continue

        if text.startswith(' '):
            # Wrapped continuation of the previous line. Only the first
            # leading space is dropped, so one space still separates the
            # joined pieces.
            pending[-1] += text[1:]
            continue

        pending.append(text)

    # The final record is not followed by a separator.
    yield from parse_item(pending)
def parse_item(lines):
    """
    Parse the lines of one registry record into a dictionary.

    `lines` holds the record's 'Key: value' lines, with wrapped
    continuation lines already joined back together. Keys listed in
    LIST_KEYS may repeat; their values are collected into a list, in the
    order they appear. Every other key maps to a single string.

    This is a generator: it yields the dictionary once if the record has
    a 'Subtag' or 'Tag' key, and yields nothing otherwise (for example,
    when the record is just the registry's header).

    Raises ValueError if a line does not contain ': ', and AssertionError
    if a key outside LIST_KEYS occurs more than once in the record.
    """
    info = {}
    for line in lines:
        key, value = line.split(': ', 1)
        if key in LIST_KEYS:
            info.setdefault(key, []).append(value)
        elif key in info:
            # Raised explicitly instead of via an `assert` statement so
            # that the check still runs under `python -O`; otherwise the
            # earlier value would be silently overwritten. The exception
            # type is the one `assert` would have raised.
            raise AssertionError(
                'duplicate key %r in registry entry' % (key,)
            )
        else:
            info[key] = value

    if 'Subtag' in info or 'Tag' in info:
        yield info
def parse_registry():
    """
    Yield one dictionary per subtag record in the bundled IANA subtag
    registry file ('language-subtag-registry.txt').
    """
    registry_path = data_filename('language-subtag-registry.txt')
    with open(registry_path, encoding='utf-8') as registry:
        # Delegating with 'yield from' (rather than returning the inner
        # generator) keeps this frame, and therefore the open file, alive
        # until the caller has finished iterating.
        yield from parse_file(registry)