From 42a0999b5f7d2248b5bb04a9be6f70dff9efcb9d Mon Sep 17 00:00:00 2001 From: JackDandy Date: Wed, 1 Feb 2017 13:29:38 +0000 Subject: [PATCH] Add webencodings 0.5 (3970651) to assist parsing legacy web content. --- CHANGES.md | 1 + lib/webencodings/__init__.py | 342 +++++++++++++++++++++++++++++ lib/webencodings/labels.py | 231 +++++++++++++++++++ lib/webencodings/mklabels.py | 59 +++++ lib/webencodings/tests.py | 153 +++++++++++++ lib/webencodings/x_user_defined.py | 325 +++++++++++++++++++++++++++ 6 files changed, 1111 insertions(+) create mode 100644 lib/webencodings/__init__.py create mode 100644 lib/webencodings/labels.py create mode 100644 lib/webencodings/mklabels.py create mode 100644 lib/webencodings/tests.py create mode 100644 lib/webencodings/x_user_defined.py diff --git a/CHANGES.md b/CHANGES.md index e3aa341d..e5916618 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,6 +6,7 @@ * Change improve page load time when loading images * Update isotope library 2.2.2 to 3.0.1 * Add lazyload package 3.0.0 (2e318b1) +* Add webencodings 0.5 (3970651) to assist parsing legacy web content * Change improve add show search results by comparing search term to an additional unidecoded result set * Change webserver startup to correctly use xheaders in reverse proxy or load balance set-ups * Update backports_abc 0.4 to 0.5 diff --git a/lib/webencodings/__init__.py b/lib/webencodings/__init__.py new file mode 100644 index 00000000..03d5d357 --- /dev/null +++ b/lib/webencodings/__init__.py @@ -0,0 +1,342 @@ +# coding: utf8 +""" + + webencodings + ~~~~~~~~~~~~ + + This is a Python implementation of the `WHATWG Encoding standard + `. See README for details. + + :copyright: Copyright 2012 by Simon Sapin + :license: BSD, see LICENSE for details. + +""" + +from __future__ import unicode_literals + +import codecs + +from .labels import LABELS + + +VERSION = '0.5' + + +# Some names in Encoding are not valid Python aliases. Remap these. +PYTHON_NAMES = { + 'iso-8859-8-i': 'iso-8859-8', + 'x-mac-cyrillic': 'mac-cyrillic', + 'macintosh': 'mac-roman', + 'windows-874': 'cp874'} + +CACHE = {} + + +def ascii_lower(string): + r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z. + + :param string: An Unicode string. + :returns: A new Unicode string. + + This is used for `ASCII case-insensitive + `_ + matching of encoding labels. + The same matching is also used, among other things, + for `CSS keywords `_. + + This is different from the :meth:`~py:str.lower` method of Unicode strings + which also affect non-ASCII characters, + sometimes mapping them into the ASCII range: + + >>> keyword = u'Bac\N{KELVIN SIGN}ground' + >>> assert keyword.lower() == u'background' + >>> assert ascii_lower(keyword) != keyword.lower() + >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground' + + """ + # This turns out to be faster than unicode.translate() + return string.encode('utf8').lower().decode('utf8') + + +def lookup(label): + """ + Look for an encoding by its label. + This is the spec’s `get an encoding + `_ algorithm. + Supported labels are listed there. + + :param label: A string. + :returns: + An :class:`Encoding` object, or :obj:`None` for an unknown label. + + """ + # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020. + label = ascii_lower(label.strip('\t\n\f\r ')) + name = LABELS.get(label) + if name is None: + return None + encoding = CACHE.get(name) + if encoding is None: + if name == 'x-user-defined': + from .x_user_defined import codec_info + else: + python_name = PYTHON_NAMES.get(name, name) + # Any python_name value that gets to here should be valid. + codec_info = codecs.lookup(python_name) + encoding = Encoding(name, codec_info) + CACHE[name] = encoding + return encoding + + +def _get_encoding(encoding_or_label): + """ + Accept either an encoding object or label. + + :param encoding: An :class:`Encoding` object or a label string. + :returns: An :class:`Encoding` object. + :raises: :exc:`~exceptions.LookupError` for an unknown label. + + """ + if hasattr(encoding_or_label, 'codec_info'): + return encoding_or_label + + encoding = lookup(encoding_or_label) + if encoding is None: + raise LookupError('Unknown encoding label: %r' % encoding_or_label) + return encoding + + +class Encoding(object): + """Reresents a character encoding such as UTF-8, + that can be used for decoding or encoding. + + .. attribute:: name + + Canonical name of the encoding + + .. attribute:: codec_info + + The actual implementation of the encoding, + a stdlib :class:`~codecs.CodecInfo` object. + See :func:`codecs.register`. + + """ + def __init__(self, name, codec_info): + self.name = name + self.codec_info = codec_info + + def __repr__(self): + return '' % self.name + + +#: The UTF-8 encoding. Should be used for new content and formats. +UTF8 = lookup('utf-8') + +_UTF16LE = lookup('utf-16le') +_UTF16BE = lookup('utf-16be') + + +def decode(input, fallback_encoding, errors='replace'): + """ + Decode a single string. + + :param input: A byte string + :param fallback_encoding: + An :class:`Encoding` object or a label string. + The encoding to use if :obj:`input` does note have a BOM. + :param errors: Type of error handling. See :func:`codecs.register`. + :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. + :return: + A ``(output, encoding)`` tuple of an Unicode string + and an :obj:`Encoding`. + + """ + # Fail early if `encoding` is an invalid label. + fallback_encoding = _get_encoding(fallback_encoding) + bom_encoding, input = _detect_bom(input) + encoding = bom_encoding or fallback_encoding + return encoding.codec_info.decode(input, errors)[0], encoding + + +def _detect_bom(input): + """Return (bom_encoding, input), with any BOM removed from the input.""" + if input.startswith(b'\xFF\xFE'): + return _UTF16LE, input[2:] + if input.startswith(b'\xFE\xFF'): + return _UTF16BE, input[2:] + if input.startswith(b'\xEF\xBB\xBF'): + return UTF8, input[3:] + return None, input + + +def encode(input, encoding=UTF8, errors='strict'): + """ + Encode a single string. + + :param input: An Unicode string. + :param encoding: An :class:`Encoding` object or a label string. + :param errors: Type of error handling. See :func:`codecs.register`. + :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. + :return: A byte string. + + """ + return _get_encoding(encoding).codec_info.encode(input, errors)[0] + + +def iter_decode(input, fallback_encoding, errors='replace'): + """ + "Pull"-based decoder. + + :param input: + An iterable of byte strings. + + The input is first consumed just enough to determine the encoding + based on the precense of a BOM, + then consumed on demand when the return value is. + :param fallback_encoding: + An :class:`Encoding` object or a label string. + The encoding to use if :obj:`input` does note have a BOM. + :param errors: Type of error handling. See :func:`codecs.register`. + :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. + :returns: + An ``(output, encoding)`` tuple. + :obj:`output` is an iterable of Unicode strings, + :obj:`encoding` is the :obj:`Encoding` that is being used. + + """ + + decoder = IncrementalDecoder(fallback_encoding, errors) + generator = _iter_decode_generator(input, decoder) + encoding = next(generator) + return generator, encoding + + +def _iter_decode_generator(input, decoder): + """Return a generator that first yields the :obj:`Encoding`, + then yields output chukns as Unicode strings. + + """ + decode = decoder.decode + input = iter(input) + for chunck in input: + output = decode(chunck) + if output: + assert decoder.encoding is not None + yield decoder.encoding + yield output + break + else: + # Input exhausted without determining the encoding + output = decode(b'', final=True) + assert decoder.encoding is not None + yield decoder.encoding + if output: + yield output + return + + for chunck in input: + output = decode(chunck) + if output: + yield output + output = decode(b'', final=True) + if output: + yield output + + +def iter_encode(input, encoding=UTF8, errors='strict'): + """ + “Pull”-based encoder. + + :param input: An iterable of Unicode strings. + :param encoding: An :class:`Encoding` object or a label string. + :param errors: Type of error handling. See :func:`codecs.register`. + :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. + :returns: An iterable of byte strings. + + """ + # Fail early if `encoding` is an invalid label. + encode = IncrementalEncoder(encoding, errors).encode + return _iter_encode_generator(input, encode) + + +def _iter_encode_generator(input, encode): + for chunck in input: + output = encode(chunck) + if output: + yield output + output = encode('', final=True) + if output: + yield output + + +class IncrementalDecoder(object): + """ + “Push”-based decoder. + + :param fallback_encoding: + An :class:`Encoding` object or a label string. + The encoding to use if :obj:`input` does note have a BOM. + :param errors: Type of error handling. See :func:`codecs.register`. + :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. + + """ + def __init__(self, fallback_encoding, errors='replace'): + # Fail early if `encoding` is an invalid label. + self._fallback_encoding = _get_encoding(fallback_encoding) + self._errors = errors + self._buffer = b'' + self._decoder = None + #: The actual :class:`Encoding` that is being used, + #: or :obj:`None` if that is not determined yet. + #: (Ie. if there is not enough input yet to determine + #: if there is a BOM.) + self.encoding = None # Not known yet. + + def decode(self, input, final=False): + """Decode one chunk of the input. + + :param input: A byte string. + :param final: + Indicate that no more input is available. + Must be :obj:`True` if this is the last call. + :returns: An Unicode string. + + """ + decoder = self._decoder + if decoder is not None: + return decoder(input, final) + + input = self._buffer + input + encoding, input = _detect_bom(input) + if encoding is None: + if len(input) < 3 and not final: # Not enough data yet. + self._buffer = input + return '' + else: # No BOM + encoding = self._fallback_encoding + decoder = encoding.codec_info.incrementaldecoder(self._errors).decode + self._decoder = decoder + self.encoding = encoding + return decoder(input, final) + + +class IncrementalEncoder(object): + """ + “Push”-based encoder. + + :param encoding: An :class:`Encoding` object or a label string. + :param errors: Type of error handling. See :func:`codecs.register`. + :raises: :exc:`~exceptions.LookupError` for an unknown encoding label. + + .. method:: encode(input, final=False) + + :param input: An Unicode string. + :param final: + Indicate that no more input is available. + Must be :obj:`True` if this is the last call. + :returns: A byte string. + + """ + def __init__(self, encoding=UTF8, errors='strict'): + encoding = _get_encoding(encoding) + self.encode = encoding.codec_info.incrementalencoder(errors).encode diff --git a/lib/webencodings/labels.py b/lib/webencodings/labels.py new file mode 100644 index 00000000..29cbf91e --- /dev/null +++ b/lib/webencodings/labels.py @@ -0,0 +1,231 @@ +""" + + webencodings.labels + ~~~~~~~~~~~~~~~~~~~ + + Map encoding labels to their name. + + :copyright: Copyright 2012 by Simon Sapin + :license: BSD, see LICENSE for details. + +""" + +# XXX Do not edit! +# This file is automatically generated by mklabels.py + +LABELS = { + 'unicode-1-1-utf-8': 'utf-8', + 'utf-8': 'utf-8', + 'utf8': 'utf-8', + '866': 'ibm866', + 'cp866': 'ibm866', + 'csibm866': 'ibm866', + 'ibm866': 'ibm866', + 'csisolatin2': 'iso-8859-2', + 'iso-8859-2': 'iso-8859-2', + 'iso-ir-101': 'iso-8859-2', + 'iso8859-2': 'iso-8859-2', + 'iso88592': 'iso-8859-2', + 'iso_8859-2': 'iso-8859-2', + 'iso_8859-2:1987': 'iso-8859-2', + 'l2': 'iso-8859-2', + 'latin2': 'iso-8859-2', + 'csisolatin3': 'iso-8859-3', + 'iso-8859-3': 'iso-8859-3', + 'iso-ir-109': 'iso-8859-3', + 'iso8859-3': 'iso-8859-3', + 'iso88593': 'iso-8859-3', + 'iso_8859-3': 'iso-8859-3', + 'iso_8859-3:1988': 'iso-8859-3', + 'l3': 'iso-8859-3', + 'latin3': 'iso-8859-3', + 'csisolatin4': 'iso-8859-4', + 'iso-8859-4': 'iso-8859-4', + 'iso-ir-110': 'iso-8859-4', + 'iso8859-4': 'iso-8859-4', + 'iso88594': 'iso-8859-4', + 'iso_8859-4': 'iso-8859-4', + 'iso_8859-4:1988': 'iso-8859-4', + 'l4': 'iso-8859-4', + 'latin4': 'iso-8859-4', + 'csisolatincyrillic': 'iso-8859-5', + 'cyrillic': 'iso-8859-5', + 'iso-8859-5': 'iso-8859-5', + 'iso-ir-144': 'iso-8859-5', + 'iso8859-5': 'iso-8859-5', + 'iso88595': 'iso-8859-5', + 'iso_8859-5': 'iso-8859-5', + 'iso_8859-5:1988': 'iso-8859-5', + 'arabic': 'iso-8859-6', + 'asmo-708': 'iso-8859-6', + 'csiso88596e': 'iso-8859-6', + 'csiso88596i': 'iso-8859-6', + 'csisolatinarabic': 'iso-8859-6', + 'ecma-114': 'iso-8859-6', + 'iso-8859-6': 'iso-8859-6', + 'iso-8859-6-e': 'iso-8859-6', + 'iso-8859-6-i': 'iso-8859-6', + 'iso-ir-127': 'iso-8859-6', + 'iso8859-6': 'iso-8859-6', + 'iso88596': 'iso-8859-6', + 'iso_8859-6': 'iso-8859-6', + 'iso_8859-6:1987': 'iso-8859-6', + 'csisolatingreek': 'iso-8859-7', + 'ecma-118': 'iso-8859-7', + 'elot_928': 'iso-8859-7', + 'greek': 'iso-8859-7', + 'greek8': 'iso-8859-7', + 'iso-8859-7': 'iso-8859-7', + 'iso-ir-126': 'iso-8859-7', + 'iso8859-7': 'iso-8859-7', + 'iso88597': 'iso-8859-7', + 'iso_8859-7': 'iso-8859-7', + 'iso_8859-7:1987': 'iso-8859-7', + 'sun_eu_greek': 'iso-8859-7', + 'csiso88598e': 'iso-8859-8', + 'csisolatinhebrew': 'iso-8859-8', + 'hebrew': 'iso-8859-8', + 'iso-8859-8': 'iso-8859-8', + 'iso-8859-8-e': 'iso-8859-8', + 'iso-ir-138': 'iso-8859-8', + 'iso8859-8': 'iso-8859-8', + 'iso88598': 'iso-8859-8', + 'iso_8859-8': 'iso-8859-8', + 'iso_8859-8:1988': 'iso-8859-8', + 'visual': 'iso-8859-8', + 'csiso88598i': 'iso-8859-8-i', + 'iso-8859-8-i': 'iso-8859-8-i', + 'logical': 'iso-8859-8-i', + 'csisolatin6': 'iso-8859-10', + 'iso-8859-10': 'iso-8859-10', + 'iso-ir-157': 'iso-8859-10', + 'iso8859-10': 'iso-8859-10', + 'iso885910': 'iso-8859-10', + 'l6': 'iso-8859-10', + 'latin6': 'iso-8859-10', + 'iso-8859-13': 'iso-8859-13', + 'iso8859-13': 'iso-8859-13', + 'iso885913': 'iso-8859-13', + 'iso-8859-14': 'iso-8859-14', + 'iso8859-14': 'iso-8859-14', + 'iso885914': 'iso-8859-14', + 'csisolatin9': 'iso-8859-15', + 'iso-8859-15': 'iso-8859-15', + 'iso8859-15': 'iso-8859-15', + 'iso885915': 'iso-8859-15', + 'iso_8859-15': 'iso-8859-15', + 'l9': 'iso-8859-15', + 'iso-8859-16': 'iso-8859-16', + 'cskoi8r': 'koi8-r', + 'koi': 'koi8-r', + 'koi8': 'koi8-r', + 'koi8-r': 'koi8-r', + 'koi8_r': 'koi8-r', + 'koi8-u': 'koi8-u', + 'csmacintosh': 'macintosh', + 'mac': 'macintosh', + 'macintosh': 'macintosh', + 'x-mac-roman': 'macintosh', + 'dos-874': 'windows-874', + 'iso-8859-11': 'windows-874', + 'iso8859-11': 'windows-874', + 'iso885911': 'windows-874', + 'tis-620': 'windows-874', + 'windows-874': 'windows-874', + 'cp1250': 'windows-1250', + 'windows-1250': 'windows-1250', + 'x-cp1250': 'windows-1250', + 'cp1251': 'windows-1251', + 'windows-1251': 'windows-1251', + 'x-cp1251': 'windows-1251', + 'ansi_x3.4-1968': 'windows-1252', + 'ascii': 'windows-1252', + 'cp1252': 'windows-1252', + 'cp819': 'windows-1252', + 'csisolatin1': 'windows-1252', + 'ibm819': 'windows-1252', + 'iso-8859-1': 'windows-1252', + 'iso-ir-100': 'windows-1252', + 'iso8859-1': 'windows-1252', + 'iso88591': 'windows-1252', + 'iso_8859-1': 'windows-1252', + 'iso_8859-1:1987': 'windows-1252', + 'l1': 'windows-1252', + 'latin1': 'windows-1252', + 'us-ascii': 'windows-1252', + 'windows-1252': 'windows-1252', + 'x-cp1252': 'windows-1252', + 'cp1253': 'windows-1253', + 'windows-1253': 'windows-1253', + 'x-cp1253': 'windows-1253', + 'cp1254': 'windows-1254', + 'csisolatin5': 'windows-1254', + 'iso-8859-9': 'windows-1254', + 'iso-ir-148': 'windows-1254', + 'iso8859-9': 'windows-1254', + 'iso88599': 'windows-1254', + 'iso_8859-9': 'windows-1254', + 'iso_8859-9:1989': 'windows-1254', + 'l5': 'windows-1254', + 'latin5': 'windows-1254', + 'windows-1254': 'windows-1254', + 'x-cp1254': 'windows-1254', + 'cp1255': 'windows-1255', + 'windows-1255': 'windows-1255', + 'x-cp1255': 'windows-1255', + 'cp1256': 'windows-1256', + 'windows-1256': 'windows-1256', + 'x-cp1256': 'windows-1256', + 'cp1257': 'windows-1257', + 'windows-1257': 'windows-1257', + 'x-cp1257': 'windows-1257', + 'cp1258': 'windows-1258', + 'windows-1258': 'windows-1258', + 'x-cp1258': 'windows-1258', + 'x-mac-cyrillic': 'x-mac-cyrillic', + 'x-mac-ukrainian': 'x-mac-cyrillic', + 'chinese': 'gbk', + 'csgb2312': 'gbk', + 'csiso58gb231280': 'gbk', + 'gb2312': 'gbk', + 'gb_2312': 'gbk', + 'gb_2312-80': 'gbk', + 'gbk': 'gbk', + 'iso-ir-58': 'gbk', + 'x-gbk': 'gbk', + 'gb18030': 'gb18030', + 'hz-gb-2312': 'hz-gb-2312', + 'big5': 'big5', + 'big5-hkscs': 'big5', + 'cn-big5': 'big5', + 'csbig5': 'big5', + 'x-x-big5': 'big5', + 'cseucpkdfmtjapanese': 'euc-jp', + 'euc-jp': 'euc-jp', + 'x-euc-jp': 'euc-jp', + 'csiso2022jp': 'iso-2022-jp', + 'iso-2022-jp': 'iso-2022-jp', + 'csshiftjis': 'shift_jis', + 'ms_kanji': 'shift_jis', + 'shift-jis': 'shift_jis', + 'shift_jis': 'shift_jis', + 'sjis': 'shift_jis', + 'windows-31j': 'shift_jis', + 'x-sjis': 'shift_jis', + 'cseuckr': 'euc-kr', + 'csksc56011987': 'euc-kr', + 'euc-kr': 'euc-kr', + 'iso-ir-149': 'euc-kr', + 'korean': 'euc-kr', + 'ks_c_5601-1987': 'euc-kr', + 'ks_c_5601-1989': 'euc-kr', + 'ksc5601': 'euc-kr', + 'ksc_5601': 'euc-kr', + 'windows-949': 'euc-kr', + 'csiso2022kr': 'iso-2022-kr', + 'iso-2022-kr': 'iso-2022-kr', + 'utf-16be': 'utf-16be', + 'utf-16': 'utf-16le', + 'utf-16le': 'utf-16le', + 'x-user-defined': 'x-user-defined', +} diff --git a/lib/webencodings/mklabels.py b/lib/webencodings/mklabels.py new file mode 100644 index 00000000..295dc928 --- /dev/null +++ b/lib/webencodings/mklabels.py @@ -0,0 +1,59 @@ +""" + + webencodings.mklabels + ~~~~~~~~~~~~~~~~~~~~~ + + Regenarate the webencodings.labels module. + + :copyright: Copyright 2012 by Simon Sapin + :license: BSD, see LICENSE for details. + +""" + +import json +try: + from urllib import urlopen +except ImportError: + from urllib.request import urlopen + + +def assert_lower(string): + assert string == string.lower() + return string + + +def generate(url): + parts = ['''\ +""" + + webencodings.labels + ~~~~~~~~~~~~~~~~~~~ + + Map encoding labels to their name. + + :copyright: Copyright 2012 by Simon Sapin + :license: BSD, see LICENSE for details. + +""" + +# XXX Do not edit! +# This file is automatically generated by mklabels.py + +LABELS = { +'''] + labels = [ + (repr(assert_lower(label)).lstrip('u'), + repr(encoding['name']).lstrip('u')) + for category in json.loads(urlopen(url).read().decode('ascii')) + for encoding in category['encodings'] + for label in encoding['labels']] + max_len = max(len(label) for label, name in labels) + parts.extend( + ' %s:%s %s,\n' % (label, ' ' * (max_len - len(label)), name) + for label, name in labels) + parts.append('}') + return ''.join(parts) + + +if __name__ == '__main__': + print(generate('http://encoding.spec.whatwg.org/encodings.json')) diff --git a/lib/webencodings/tests.py b/lib/webencodings/tests.py new file mode 100644 index 00000000..b8c5653e --- /dev/null +++ b/lib/webencodings/tests.py @@ -0,0 +1,153 @@ +# coding: utf8 +""" + + webencodings.tests + ~~~~~~~~~~~~~~~~~~ + + A basic test suite for Encoding. + + :copyright: Copyright 2012 by Simon Sapin + :license: BSD, see LICENSE for details. + +""" + +from __future__ import unicode_literals + +from . import (lookup, LABELS, decode, encode, iter_decode, iter_encode, + IncrementalDecoder, IncrementalEncoder, UTF8) + + +def assert_raises(exception, function, *args, **kwargs): + try: + function(*args, **kwargs) + except exception: + return + else: # pragma: no cover + raise AssertionError('Did not raise %s.' % exception) + + +def test_labels(): + assert lookup('utf-8').name == 'utf-8' + assert lookup('Utf-8').name == 'utf-8' + assert lookup('UTF-8').name == 'utf-8' + assert lookup('utf8').name == 'utf-8' + assert lookup('utf8').name == 'utf-8' + assert lookup('utf8 ').name == 'utf-8' + assert lookup(' \r\nutf8\t').name == 'utf-8' + assert lookup('u8') is None # Python label. + assert lookup('utf-8 ') is None # Non-ASCII white space. + + assert lookup('US-ASCII').name == 'windows-1252' + assert lookup('iso-8859-1').name == 'windows-1252' + assert lookup('latin1').name == 'windows-1252' + assert lookup('LATIN1').name == 'windows-1252' + assert lookup('latin-1') is None + assert lookup('LATİN1') is None # ASCII-only case insensitivity. + + +def test_all_labels(): + for label in LABELS: + assert decode(b'', label) == ('', lookup(label)) + assert encode('', label) == b'' + for repeat in [0, 1, 12]: + output, _ = iter_decode([b''] * repeat, label) + assert list(output) == [] + assert list(iter_encode([''] * repeat, label)) == [] + decoder = IncrementalDecoder(label) + assert decoder.decode(b'') == '' + assert decoder.decode(b'', final=True) == '' + encoder = IncrementalEncoder(label) + assert encoder.encode('') == b'' + assert encoder.encode('', final=True) == b'' + # All encoding names are valid labels too: + for name in set(LABELS.values()): + assert lookup(name).name == name + + +def test_invalid_label(): + assert_raises(LookupError, decode, b'\xEF\xBB\xBF\xc3\xa9', 'invalid') + assert_raises(LookupError, encode, 'é', 'invalid') + assert_raises(LookupError, iter_decode, [], 'invalid') + assert_raises(LookupError, iter_encode, [], 'invalid') + assert_raises(LookupError, IncrementalDecoder, 'invalid') + assert_raises(LookupError, IncrementalEncoder, 'invalid') + + +def test_decode(): + assert decode(b'\x80', 'latin1') == ('€', lookup('latin1')) + assert decode(b'\x80', lookup('latin1')) == ('€', lookup('latin1')) + assert decode(b'\xc3\xa9', 'utf8') == ('é', lookup('utf8')) + assert decode(b'\xc3\xa9', UTF8) == ('é', lookup('utf8')) + assert decode(b'\xc3\xa9', 'ascii') == ('é', lookup('ascii')) + assert decode(b'\xEF\xBB\xBF\xc3\xa9', 'ascii') == ('é', lookup('utf8')) # UTF-8 with BOM + + assert decode(b'\xFE\xFF\x00\xe9', 'ascii') == ('é', lookup('utf-16be')) # UTF-16-BE with BOM + assert decode(b'\xFF\xFE\xe9\x00', 'ascii') == ('é', lookup('utf-16le')) # UTF-16-LE with BOM + assert decode(b'\xFE\xFF\xe9\x00', 'ascii') == ('\ue900', lookup('utf-16be')) + assert decode(b'\xFF\xFE\x00\xe9', 'ascii') == ('\ue900', lookup('utf-16le')) + + assert decode(b'\x00\xe9', 'UTF-16BE') == ('é', lookup('utf-16be')) + assert decode(b'\xe9\x00', 'UTF-16LE') == ('é', lookup('utf-16le')) + assert decode(b'\xe9\x00', 'UTF-16') == ('é', lookup('utf-16le')) + + assert decode(b'\xe9\x00', 'UTF-16BE') == ('\ue900', lookup('utf-16be')) + assert decode(b'\x00\xe9', 'UTF-16LE') == ('\ue900', lookup('utf-16le')) + assert decode(b'\x00\xe9', 'UTF-16') == ('\ue900', lookup('utf-16le')) + + +def test_encode(): + assert encode('é', 'latin1') == b'\xe9' + assert encode('é', 'utf8') == b'\xc3\xa9' + assert encode('é', 'utf8') == b'\xc3\xa9' + assert encode('é', 'utf-16') == b'\xe9\x00' + assert encode('é', 'utf-16le') == b'\xe9\x00' + assert encode('é', 'utf-16be') == b'\x00\xe9' + + +def test_iter_decode(): + def iter_decode_to_string(input, fallback_encoding): + output, _encoding = iter_decode(input, fallback_encoding) + return ''.join(output) + assert iter_decode_to_string([], 'latin1') == '' + assert iter_decode_to_string([b''], 'latin1') == '' + assert iter_decode_to_string([b'\xe9'], 'latin1') == 'é' + assert iter_decode_to_string([b'hello'], 'latin1') == 'hello' + assert iter_decode_to_string([b'he', b'llo'], 'latin1') == 'hello' + assert iter_decode_to_string([b'hell', b'o'], 'latin1') == 'hello' + assert iter_decode_to_string([b'\xc3\xa9'], 'latin1') == 'é' + assert iter_decode_to_string([b'\xEF\xBB\xBF\xc3\xa9'], 'latin1') == 'é' + assert iter_decode_to_string([ + b'\xEF\xBB\xBF', b'\xc3', b'\xa9'], 'latin1') == 'é' + assert iter_decode_to_string([ + b'\xEF\xBB\xBF', b'a', b'\xc3'], 'latin1') == 'a\uFFFD' + assert iter_decode_to_string([ + b'', b'\xEF', b'', b'', b'\xBB\xBF\xc3', b'\xa9'], 'latin1') == 'é' + assert iter_decode_to_string([b'\xEF\xBB\xBF'], 'latin1') == '' + assert iter_decode_to_string([b'\xEF\xBB'], 'latin1') == 'ï»' + assert iter_decode_to_string([b'\xFE\xFF\x00\xe9'], 'latin1') == 'é' + assert iter_decode_to_string([b'\xFF\xFE\xe9\x00'], 'latin1') == 'é' + assert iter_decode_to_string([ + b'', b'\xFF', b'', b'', b'\xFE\xe9', b'\x00'], 'latin1') == 'é' + assert iter_decode_to_string([ + b'', b'h\xe9', b'llo'], 'x-user-defined') == 'h\uF7E9llo' + + +def test_iter_encode(): + assert b''.join(iter_encode([], 'latin1')) == b'' + assert b''.join(iter_encode([''], 'latin1')) == b'' + assert b''.join(iter_encode(['é'], 'latin1')) == b'\xe9' + assert b''.join(iter_encode(['', 'é', '', ''], 'latin1')) == b'\xe9' + assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16')) == b'\xe9\x00' + assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16le')) == b'\xe9\x00' + assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16be')) == b'\x00\xe9' + assert b''.join(iter_encode([ + '', 'h\uF7E9', '', 'llo'], 'x-user-defined')) == b'h\xe9llo' + + +def test_x_user_defined(): + encoded = b'2,\x0c\x0b\x1aO\xd9#\xcb\x0f\xc9\xbbt\xcf\xa8\xca' + decoded = '2,\x0c\x0b\x1aO\uf7d9#\uf7cb\x0f\uf7c9\uf7bbt\uf7cf\uf7a8\uf7ca' + encoded = b'aa' + decoded = 'aa' + assert decode(encoded, 'x-user-defined') == (decoded, lookup('x-user-defined')) + assert encode(decoded, 'x-user-defined') == encoded diff --git a/lib/webencodings/x_user_defined.py b/lib/webencodings/x_user_defined.py new file mode 100644 index 00000000..f0daa11a --- /dev/null +++ b/lib/webencodings/x_user_defined.py @@ -0,0 +1,325 @@ +# coding: utf8 +""" + + webencodings.x_user_defined + ~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + An implementation of the x-user-defined encoding. + + :copyright: Copyright 2012 by Simon Sapin + :license: BSD, see LICENSE for details. + +""" + +from __future__ import unicode_literals + +import codecs + + +### Codec APIs + +class Codec(codecs.Codec): + + def encode(self, input, errors='strict'): + return codecs.charmap_encode(input, errors, encoding_table) + + def decode(self, input, errors='strict'): + return codecs.charmap_decode(input, errors, decoding_table) + + +class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + return codecs.charmap_encode(input, self.errors, encoding_table)[0] + + +class IncrementalDecoder(codecs.IncrementalDecoder): + def decode(self, input, final=False): + return codecs.charmap_decode(input, self.errors, decoding_table)[0] + + +class StreamWriter(Codec, codecs.StreamWriter): + pass + + +class StreamReader(Codec, codecs.StreamReader): + pass + + +### encodings module API + +codec_info = codecs.CodecInfo( + name='x-user-defined', + encode=Codec().encode, + decode=Codec().decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamreader=StreamReader, + streamwriter=StreamWriter, +) + + +### Decoding Table + +# Python 3: +# for c in range(256): print(' %r' % chr(c if c < 128 else c + 0xF700)) +decoding_table = ( + '\x00' + '\x01' + '\x02' + '\x03' + '\x04' + '\x05' + '\x06' + '\x07' + '\x08' + '\t' + '\n' + '\x0b' + '\x0c' + '\r' + '\x0e' + '\x0f' + '\x10' + '\x11' + '\x12' + '\x13' + '\x14' + '\x15' + '\x16' + '\x17' + '\x18' + '\x19' + '\x1a' + '\x1b' + '\x1c' + '\x1d' + '\x1e' + '\x1f' + ' ' + '!' + '"' + '#' + '$' + '%' + '&' + "'" + '(' + ')' + '*' + '+' + ',' + '-' + '.' + '/' + '0' + '1' + '2' + '3' + '4' + '5' + '6' + '7' + '8' + '9' + ':' + ';' + '<' + '=' + '>' + '?' + '@' + 'A' + 'B' + 'C' + 'D' + 'E' + 'F' + 'G' + 'H' + 'I' + 'J' + 'K' + 'L' + 'M' + 'N' + 'O' + 'P' + 'Q' + 'R' + 'S' + 'T' + 'U' + 'V' + 'W' + 'X' + 'Y' + 'Z' + '[' + '\\' + ']' + '^' + '_' + '`' + 'a' + 'b' + 'c' + 'd' + 'e' + 'f' + 'g' + 'h' + 'i' + 'j' + 'k' + 'l' + 'm' + 'n' + 'o' + 'p' + 'q' + 'r' + 's' + 't' + 'u' + 'v' + 'w' + 'x' + 'y' + 'z' + '{' + '|' + '}' + '~' + '\x7f' + '\uf780' + '\uf781' + '\uf782' + '\uf783' + '\uf784' + '\uf785' + '\uf786' + '\uf787' + '\uf788' + '\uf789' + '\uf78a' + '\uf78b' + '\uf78c' + '\uf78d' + '\uf78e' + '\uf78f' + '\uf790' + '\uf791' + '\uf792' + '\uf793' + '\uf794' + '\uf795' + '\uf796' + '\uf797' + '\uf798' + '\uf799' + '\uf79a' + '\uf79b' + '\uf79c' + '\uf79d' + '\uf79e' + '\uf79f' + '\uf7a0' + '\uf7a1' + '\uf7a2' + '\uf7a3' + '\uf7a4' + '\uf7a5' + '\uf7a6' + '\uf7a7' + '\uf7a8' + '\uf7a9' + '\uf7aa' + '\uf7ab' + '\uf7ac' + '\uf7ad' + '\uf7ae' + '\uf7af' + '\uf7b0' + '\uf7b1' + '\uf7b2' + '\uf7b3' + '\uf7b4' + '\uf7b5' + '\uf7b6' + '\uf7b7' + '\uf7b8' + '\uf7b9' + '\uf7ba' + '\uf7bb' + '\uf7bc' + '\uf7bd' + '\uf7be' + '\uf7bf' + '\uf7c0' + '\uf7c1' + '\uf7c2' + '\uf7c3' + '\uf7c4' + '\uf7c5' + '\uf7c6' + '\uf7c7' + '\uf7c8' + '\uf7c9' + '\uf7ca' + '\uf7cb' + '\uf7cc' + '\uf7cd' + '\uf7ce' + '\uf7cf' + '\uf7d0' + '\uf7d1' + '\uf7d2' + '\uf7d3' + '\uf7d4' + '\uf7d5' + '\uf7d6' + '\uf7d7' + '\uf7d8' + '\uf7d9' + '\uf7da' + '\uf7db' + '\uf7dc' + '\uf7dd' + '\uf7de' + '\uf7df' + '\uf7e0' + '\uf7e1' + '\uf7e2' + '\uf7e3' + '\uf7e4' + '\uf7e5' + '\uf7e6' + '\uf7e7' + '\uf7e8' + '\uf7e9' + '\uf7ea' + '\uf7eb' + '\uf7ec' + '\uf7ed' + '\uf7ee' + '\uf7ef' + '\uf7f0' + '\uf7f1' + '\uf7f2' + '\uf7f3' + '\uf7f4' + '\uf7f5' + '\uf7f6' + '\uf7f7' + '\uf7f8' + '\uf7f9' + '\uf7fa' + '\uf7fb' + '\uf7fc' + '\uf7fd' + '\uf7fe' + '\uf7ff' +) + +### Encoding table +encoding_table = codecs.charmap_build(decoding_table)