Merge pull request #899 from JackDandy/feature/AddWebencodings

Add webencodings 0.5 (3970651) to assist parsing legacy web content.
2025-01-05 17:43:37 +00:00 · 2017-02-01 13:32:38 +00:00 · 2017-02-01 13:32:38 +00:00 · e75a5d7c34
commit e75a5d7c34
parent 593bc098e3 42a0999b5f
6 changed files with 1111 additions and 0 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@ -6,6 +6,7 @@
 * Change improve page load time when loading images
 * Update isotope library 2.2.2 to 3.0.1
 * Add lazyload package 3.0.0 (2e318b1)
+* Add webencodings 0.5 (3970651) to assist parsing legacy web content
 * Change improve add show search results by comparing search term to an additional unidecoded result set
 * Change webserver startup to correctly use xheaders in reverse proxy or load balance set-ups
 * Update backports_abc 0.4 to 0.5
--- a/lib/webencodings/init.py
+++ b/lib/webencodings/init.py
@ -0,0 +1,342 @@
+# coding: utf8
+"""
+
+    webencodings
+    ~~~~~~~~~~~~
+
+    This is a Python implementation of the `WHATWG Encoding standard
+    <http://encoding.spec.whatwg.org/>`. See README for details.
+
+    :copyright: Copyright 2012 by Simon Sapin
+    :license: BSD, see LICENSE for details.
+
+"""
+
+from __future__ import unicode_literals
+
+import codecs
+
+from .labels import LABELS
+
+
+VERSION = '0.5'
+
+
+# Some names in Encoding are not valid Python aliases. Remap these.
+PYTHON_NAMES = {
+    'iso-8859-8-i': 'iso-8859-8',
+    'x-mac-cyrillic': 'mac-cyrillic',
+    'macintosh': 'mac-roman',
+    'windows-874': 'cp874'}
+
+CACHE = {}
+
+
+def ascii_lower(string):
+    r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.
+
+    :param string: An Unicode string.
+    :returns: A new Unicode string.
+
+    This is used for `ASCII case-insensitive
+    <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
+    matching of encoding labels.
+    The same matching is also used, among other things,
+    for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.
+
+    This is different from the :meth:`~py:str.lower` method of Unicode strings
+    which also affect non-ASCII characters,
+    sometimes mapping them into the ASCII range:
+
+        >>> keyword = u'Bac\N{KELVIN SIGN}ground'
+        >>> assert keyword.lower() == u'background'
+        >>> assert ascii_lower(keyword) != keyword.lower()
+        >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'
+
+    """
+    # This turns out to be faster than unicode.translate()
+    return string.encode('utf8').lower().decode('utf8')
+
+
+def lookup(label):
+    """
+    Look for an encoding by its label.
+    This is the spec’s `get an encoding
+    <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
+    Supported labels are listed there.
+
+    :param label: A string.
+    :returns:
+        An :class:`Encoding` object, or :obj:`None` for an unknown label.
+
+    """
+    # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020.
+    label = ascii_lower(label.strip('\t\n\f\r '))
+    name = LABELS.get(label)
+    if name is None:
+        return None
+    encoding = CACHE.get(name)
+    if encoding is None:
+        if name == 'x-user-defined':
+            from .x_user_defined import codec_info
+        else:
+            python_name = PYTHON_NAMES.get(name, name)
+            # Any python_name value that gets to here should be valid.
+            codec_info = codecs.lookup(python_name)
+        encoding = Encoding(name, codec_info)
+        CACHE[name] = encoding
+    return encoding
+
+
+def _get_encoding(encoding_or_label):
+    """
+    Accept either an encoding object or label.
+
+    :param encoding: An :class:`Encoding` object or a label string.
+    :returns: An :class:`Encoding` object.
+    :raises: :exc:`~exceptions.LookupError` for an unknown label.
+
+    """
+    if hasattr(encoding_or_label, 'codec_info'):
+        return encoding_or_label
+
+    encoding = lookup(encoding_or_label)
+    if encoding is None:
+        raise LookupError('Unknown encoding label: %r' % encoding_or_label)
+    return encoding
+
+
+class Encoding(object):
+    """Reresents a character encoding such as UTF-8,
+    that can be used for decoding or encoding.
+
+    .. attribute:: name
+
+        Canonical name of the encoding
+
+    .. attribute:: codec_info
+
+        The actual implementation of the encoding,
+        a stdlib :class:`~codecs.CodecInfo` object.
+        See :func:`codecs.register`.
+
+    """
+    def __init__(self, name, codec_info):
+        self.name = name
+        self.codec_info = codec_info
+
+    def __repr__(self):
+        return '<Encoding %s>' % self.name
+
+
+#: The UTF-8 encoding. Should be used for new content and formats.
+UTF8 = lookup('utf-8')
+
+_UTF16LE = lookup('utf-16le')
+_UTF16BE = lookup('utf-16be')
+
+
+def decode(input, fallback_encoding, errors='replace'):
+    """
+    Decode a single string.
+
+    :param input: A byte string
+    :param fallback_encoding:
+        An :class:`Encoding` object or a label string.
+        The encoding to use if :obj:`input` does note have a BOM.
+    :param errors: Type of error handling. See :func:`codecs.register`.
+    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+    :return:
+        A ``(output, encoding)`` tuple of an Unicode string
+        and an :obj:`Encoding`.
+
+    """
+    # Fail early if `encoding` is an invalid label.
+    fallback_encoding = _get_encoding(fallback_encoding)
+    bom_encoding, input = _detect_bom(input)
+    encoding = bom_encoding or fallback_encoding
+    return encoding.codec_info.decode(input, errors)[0], encoding
+
+
+def _detect_bom(input):
+    """Return (bom_encoding, input), with any BOM removed from the input."""
+    if input.startswith(b'\xFF\xFE'):
+        return _UTF16LE, input[2:]
+    if input.startswith(b'\xFE\xFF'):
+        return _UTF16BE, input[2:]
+    if input.startswith(b'\xEF\xBB\xBF'):
+        return UTF8, input[3:]
+    return None, input
+
+
+def encode(input, encoding=UTF8, errors='strict'):
+    """
+    Encode a single string.
+
+    :param input: An Unicode string.
+    :param encoding: An :class:`Encoding` object or a label string.
+    :param errors: Type of error handling. See :func:`codecs.register`.
+    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+    :return: A byte string.
+
+    """
+    return _get_encoding(encoding).codec_info.encode(input, errors)[0]
+
+
+def iter_decode(input, fallback_encoding, errors='replace'):
+    """
+    "Pull"-based decoder.
+
+    :param input:
+        An iterable of byte strings.
+
+        The input is first consumed just enough to determine the encoding
+        based on the precense of a BOM,
+        then consumed on demand when the return value is.
+    :param fallback_encoding:
+        An :class:`Encoding` object or a label string.
+        The encoding to use if :obj:`input` does note have a BOM.
+    :param errors: Type of error handling. See :func:`codecs.register`.
+    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+    :returns:
+        An ``(output, encoding)`` tuple.
+        :obj:`output` is an iterable of Unicode strings,
+        :obj:`encoding` is the :obj:`Encoding` that is being used.
+
+    """
+
+    decoder = IncrementalDecoder(fallback_encoding, errors)
+    generator = _iter_decode_generator(input, decoder)
+    encoding = next(generator)
+    return generator, encoding
+
+
+def _iter_decode_generator(input, decoder):
+    """Return a generator that first yields the :obj:`Encoding`,
+    then yields output chukns as Unicode strings.
+
+    """
+    decode = decoder.decode
+    input = iter(input)
+    for chunck in input:
+        output = decode(chunck)
+        if output:
+            assert decoder.encoding is not None
+            yield decoder.encoding
+            yield output
+            break
+    else:
+        # Input exhausted without determining the encoding
+        output = decode(b'', final=True)
+        assert decoder.encoding is not None
+        yield decoder.encoding
+        if output:
+            yield output
+        return
+
+    for chunck in input:
+        output = decode(chunck)
+        if output:
+            yield output
+    output = decode(b'', final=True)
+    if output:
+        yield output
+
+
+def iter_encode(input, encoding=UTF8, errors='strict'):
+    """
+    “Pull”-based encoder.
+
+    :param input: An iterable of Unicode strings.
+    :param encoding: An :class:`Encoding` object or a label string.
+    :param errors: Type of error handling. See :func:`codecs.register`.
+    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+    :returns: An iterable of byte strings.
+
+    """
+    # Fail early if `encoding` is an invalid label.
+    encode = IncrementalEncoder(encoding, errors).encode
+    return _iter_encode_generator(input, encode)
+
+
+def _iter_encode_generator(input, encode):
+    for chunck in input:
+        output = encode(chunck)
+        if output:
+            yield output
+    output = encode('', final=True)
+    if output:
+        yield output
+
+
+class IncrementalDecoder(object):
+    """
+    “Push”-based decoder.
+
+    :param fallback_encoding:
+        An :class:`Encoding` object or a label string.
+        The encoding to use if :obj:`input` does note have a BOM.
+    :param errors: Type of error handling. See :func:`codecs.register`.
+    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+
+    """
+    def __init__(self, fallback_encoding, errors='replace'):
+        # Fail early if `encoding` is an invalid label.
+        self._fallback_encoding = _get_encoding(fallback_encoding)
+        self._errors = errors
+        self._buffer = b''
+        self._decoder = None
+        #: The actual :class:`Encoding` that is being used,
+        #: or :obj:`None` if that is not determined yet.
+        #: (Ie. if there is not enough input yet to determine
+        #: if there is a BOM.)
+        self.encoding = None  # Not known yet.
+
+    def decode(self, input, final=False):
+        """Decode one chunk of the input.
+
+        :param input: A byte string.
+        :param final:
+            Indicate that no more input is available.
+            Must be :obj:`True` if this is the last call.
+        :returns: An Unicode string.
+
+        """
+        decoder = self._decoder
+        if decoder is not None:
+            return decoder(input, final)
+
+        input = self._buffer + input
+        encoding, input = _detect_bom(input)
+        if encoding is None:
+            if len(input) < 3 and not final:  # Not enough data yet.
+                self._buffer = input
+                return ''
+            else:  # No BOM
+                encoding = self._fallback_encoding
+        decoder = encoding.codec_info.incrementaldecoder(self._errors).decode
+        self._decoder = decoder
+        self.encoding = encoding
+        return decoder(input, final)
+
+
+class IncrementalEncoder(object):
+    """
+    “Push”-based encoder.
+
+    :param encoding: An :class:`Encoding` object or a label string.
+    :param errors: Type of error handling. See :func:`codecs.register`.
+    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+
+    .. method:: encode(input, final=False)
+
+        :param input: An Unicode string.
+        :param final:
+            Indicate that no more input is available.
+            Must be :obj:`True` if this is the last call.
+        :returns: A byte string.
+
+    """
+    def __init__(self, encoding=UTF8, errors='strict'):
+        encoding = _get_encoding(encoding)
+        self.encode = encoding.codec_info.incrementalencoder(errors).encode
--- a/lib/webencodings/labels.py
+++ b/lib/webencodings/labels.py
@ -0,0 +1,231 @@
+"""
+
+    webencodings.labels
+    ~~~~~~~~~~~~~~~~~~~
+
+    Map encoding labels to their name.
+
+    :copyright: Copyright 2012 by Simon Sapin
+    :license: BSD, see LICENSE for details.
+
+"""
+
+# XXX Do not edit!
+# This file is automatically generated by mklabels.py
+
+LABELS = {
+    'unicode-1-1-utf-8':   'utf-8',
+    'utf-8':               'utf-8',
+    'utf8':                'utf-8',
+    '866':                 'ibm866',
+    'cp866':               'ibm866',
+    'csibm866':            'ibm866',
+    'ibm866':              'ibm866',
+    'csisolatin2':         'iso-8859-2',
+    'iso-8859-2':          'iso-8859-2',
+    'iso-ir-101':          'iso-8859-2',
+    'iso8859-2':           'iso-8859-2',
+    'iso88592':            'iso-8859-2',
+    'iso_8859-2':          'iso-8859-2',
+    'iso_8859-2:1987':     'iso-8859-2',
+    'l2':                  'iso-8859-2',
+    'latin2':              'iso-8859-2',
+    'csisolatin3':         'iso-8859-3',
+    'iso-8859-3':          'iso-8859-3',
+    'iso-ir-109':          'iso-8859-3',
+    'iso8859-3':           'iso-8859-3',
+    'iso88593':            'iso-8859-3',
+    'iso_8859-3':          'iso-8859-3',
+    'iso_8859-3:1988':     'iso-8859-3',
+    'l3':                  'iso-8859-3',
+    'latin3':              'iso-8859-3',
+    'csisolatin4':         'iso-8859-4',
+    'iso-8859-4':          'iso-8859-4',
+    'iso-ir-110':          'iso-8859-4',
+    'iso8859-4':           'iso-8859-4',
+    'iso88594':            'iso-8859-4',
+    'iso_8859-4':          'iso-8859-4',
+    'iso_8859-4:1988':     'iso-8859-4',
+    'l4':                  'iso-8859-4',
+    'latin4':              'iso-8859-4',
+    'csisolatincyrillic':  'iso-8859-5',
+    'cyrillic':            'iso-8859-5',
+    'iso-8859-5':          'iso-8859-5',
+    'iso-ir-144':          'iso-8859-5',
+    'iso8859-5':           'iso-8859-5',
+    'iso88595':            'iso-8859-5',
+    'iso_8859-5':          'iso-8859-5',
+    'iso_8859-5:1988':     'iso-8859-5',
+    'arabic':              'iso-8859-6',
+    'asmo-708':            'iso-8859-6',
+    'csiso88596e':         'iso-8859-6',
+    'csiso88596i':         'iso-8859-6',
+    'csisolatinarabic':    'iso-8859-6',
+    'ecma-114':            'iso-8859-6',
+    'iso-8859-6':          'iso-8859-6',
+    'iso-8859-6-e':        'iso-8859-6',
+    'iso-8859-6-i':        'iso-8859-6',
+    'iso-ir-127':          'iso-8859-6',
+    'iso8859-6':           'iso-8859-6',
+    'iso88596':            'iso-8859-6',
+    'iso_8859-6':          'iso-8859-6',
+    'iso_8859-6:1987':     'iso-8859-6',
+    'csisolatingreek':     'iso-8859-7',
+    'ecma-118':            'iso-8859-7',
+    'elot_928':            'iso-8859-7',
+    'greek':               'iso-8859-7',
+    'greek8':              'iso-8859-7',
+    'iso-8859-7':          'iso-8859-7',
+    'iso-ir-126':          'iso-8859-7',
+    'iso8859-7':           'iso-8859-7',
+    'iso88597':            'iso-8859-7',
+    'iso_8859-7':          'iso-8859-7',
+    'iso_8859-7:1987':     'iso-8859-7',
+    'sun_eu_greek':        'iso-8859-7',
+    'csiso88598e':         'iso-8859-8',
+    'csisolatinhebrew':    'iso-8859-8',
+    'hebrew':              'iso-8859-8',
+    'iso-8859-8':          'iso-8859-8',
+    'iso-8859-8-e':        'iso-8859-8',
+    'iso-ir-138':          'iso-8859-8',
+    'iso8859-8':           'iso-8859-8',
+    'iso88598':            'iso-8859-8',
+    'iso_8859-8':          'iso-8859-8',
+    'iso_8859-8:1988':     'iso-8859-8',
+    'visual':              'iso-8859-8',
+    'csiso88598i':         'iso-8859-8-i',
+    'iso-8859-8-i':        'iso-8859-8-i',
+    'logical':             'iso-8859-8-i',
+    'csisolatin6':         'iso-8859-10',
+    'iso-8859-10':         'iso-8859-10',
+    'iso-ir-157':          'iso-8859-10',
+    'iso8859-10':          'iso-8859-10',
+    'iso885910':           'iso-8859-10',
+    'l6':                  'iso-8859-10',
+    'latin6':              'iso-8859-10',
+    'iso-8859-13':         'iso-8859-13',
+    'iso8859-13':          'iso-8859-13',
+    'iso885913':           'iso-8859-13',
+    'iso-8859-14':         'iso-8859-14',
+    'iso8859-14':          'iso-8859-14',
+    'iso885914':           'iso-8859-14',
+    'csisolatin9':         'iso-8859-15',
+    'iso-8859-15':         'iso-8859-15',
+    'iso8859-15':          'iso-8859-15',
+    'iso885915':           'iso-8859-15',
+    'iso_8859-15':         'iso-8859-15',
+    'l9':                  'iso-8859-15',
+    'iso-8859-16':         'iso-8859-16',
+    'cskoi8r':             'koi8-r',
+    'koi':                 'koi8-r',
+    'koi8':                'koi8-r',
+    'koi8-r':              'koi8-r',
+    'koi8_r':              'koi8-r',
+    'koi8-u':              'koi8-u',
+    'csmacintosh':         'macintosh',
+    'mac':                 'macintosh',
+    'macintosh':           'macintosh',
+    'x-mac-roman':         'macintosh',
+    'dos-874':             'windows-874',
+    'iso-8859-11':         'windows-874',
+    'iso8859-11':          'windows-874',
+    'iso885911':           'windows-874',
+    'tis-620':             'windows-874',
+    'windows-874':         'windows-874',
+    'cp1250':              'windows-1250',
+    'windows-1250':        'windows-1250',
+    'x-cp1250':            'windows-1250',
+    'cp1251':              'windows-1251',
+    'windows-1251':        'windows-1251',
+    'x-cp1251':            'windows-1251',
+    'ansi_x3.4-1968':      'windows-1252',
+    'ascii':               'windows-1252',
+    'cp1252':              'windows-1252',
+    'cp819':               'windows-1252',
+    'csisolatin1':         'windows-1252',
+    'ibm819':              'windows-1252',
+    'iso-8859-1':          'windows-1252',
+    'iso-ir-100':          'windows-1252',
+    'iso8859-1':           'windows-1252',
+    'iso88591':            'windows-1252',
+    'iso_8859-1':          'windows-1252',
+    'iso_8859-1:1987':     'windows-1252',
+    'l1':                  'windows-1252',
+    'latin1':              'windows-1252',
+    'us-ascii':            'windows-1252',
+    'windows-1252':        'windows-1252',
+    'x-cp1252':            'windows-1252',
+    'cp1253':              'windows-1253',
+    'windows-1253':        'windows-1253',
+    'x-cp1253':            'windows-1253',
+    'cp1254':              'windows-1254',
+    'csisolatin5':         'windows-1254',
+    'iso-8859-9':          'windows-1254',
+    'iso-ir-148':          'windows-1254',
+    'iso8859-9':           'windows-1254',
+    'iso88599':            'windows-1254',
+    'iso_8859-9':          'windows-1254',
+    'iso_8859-9:1989':     'windows-1254',
+    'l5':                  'windows-1254',
+    'latin5':              'windows-1254',
+    'windows-1254':        'windows-1254',
+    'x-cp1254':            'windows-1254',
+    'cp1255':              'windows-1255',
+    'windows-1255':        'windows-1255',
+    'x-cp1255':            'windows-1255',
+    'cp1256':              'windows-1256',
+    'windows-1256':        'windows-1256',
+    'x-cp1256':            'windows-1256',
+    'cp1257':              'windows-1257',
+    'windows-1257':        'windows-1257',
+    'x-cp1257':            'windows-1257',
+    'cp1258':              'windows-1258',
+    'windows-1258':        'windows-1258',
+    'x-cp1258':            'windows-1258',
+    'x-mac-cyrillic':      'x-mac-cyrillic',
+    'x-mac-ukrainian':     'x-mac-cyrillic',
+    'chinese':             'gbk',
+    'csgb2312':            'gbk',
+    'csiso58gb231280':     'gbk',
+    'gb2312':              'gbk',
+    'gb_2312':             'gbk',
+    'gb_2312-80':          'gbk',
+    'gbk':                 'gbk',
+    'iso-ir-58':           'gbk',
+    'x-gbk':               'gbk',
+    'gb18030':             'gb18030',
+    'hz-gb-2312':          'hz-gb-2312',
+    'big5':                'big5',
+    'big5-hkscs':          'big5',
+    'cn-big5':             'big5',
+    'csbig5':              'big5',
+    'x-x-big5':            'big5',
+    'cseucpkdfmtjapanese': 'euc-jp',
+    'euc-jp':              'euc-jp',
+    'x-euc-jp':            'euc-jp',
+    'csiso2022jp':         'iso-2022-jp',
+    'iso-2022-jp':         'iso-2022-jp',
+    'csshiftjis':          'shift_jis',
+    'ms_kanji':            'shift_jis',
+    'shift-jis':           'shift_jis',
+    'shift_jis':           'shift_jis',
+    'sjis':                'shift_jis',
+    'windows-31j':         'shift_jis',
+    'x-sjis':              'shift_jis',
+    'cseuckr':             'euc-kr',
+    'csksc56011987':       'euc-kr',
+    'euc-kr':              'euc-kr',
+    'iso-ir-149':          'euc-kr',
+    'korean':              'euc-kr',
+    'ks_c_5601-1987':      'euc-kr',
+    'ks_c_5601-1989':      'euc-kr',
+    'ksc5601':             'euc-kr',
+    'ksc_5601':            'euc-kr',
+    'windows-949':         'euc-kr',
+    'csiso2022kr':         'iso-2022-kr',
+    'iso-2022-kr':         'iso-2022-kr',
+    'utf-16be':            'utf-16be',
+    'utf-16':              'utf-16le',
+    'utf-16le':            'utf-16le',
+    'x-user-defined':      'x-user-defined',
+}
--- a/lib/webencodings/mklabels.py
+++ b/lib/webencodings/mklabels.py
@ -0,0 +1,59 @@
+"""
+
+    webencodings.mklabels
+    ~~~~~~~~~~~~~~~~~~~~~
+
+    Regenarate the webencodings.labels module.
+
+    :copyright: Copyright 2012 by Simon Sapin
+    :license: BSD, see LICENSE for details.
+
+"""
+
+import json
+try:
+    from urllib import urlopen
+except ImportError:
+    from urllib.request import urlopen
+
+
+def assert_lower(string):
+    assert string == string.lower()
+    return string
+
+
+def generate(url):
+    parts = ['''\
+"""
+
+    webencodings.labels
+    ~~~~~~~~~~~~~~~~~~~
+
+    Map encoding labels to their name.
+
+    :copyright: Copyright 2012 by Simon Sapin
+    :license: BSD, see LICENSE for details.
+
+"""
+
+# XXX Do not edit!
+# This file is automatically generated by mklabels.py
+
+LABELS = {
+''']
+    labels = [
+        (repr(assert_lower(label)).lstrip('u'),
+         repr(encoding['name']).lstrip('u'))
+        for category in json.loads(urlopen(url).read().decode('ascii'))
+        for encoding in category['encodings']
+        for label in encoding['labels']]
+    max_len = max(len(label) for label, name in labels)
+    parts.extend(
+        '    %s:%s %s,\n' % (label, ' ' * (max_len - len(label)), name)
+        for label, name in labels)
+    parts.append('}')
+    return ''.join(parts)
+
+
+if __name__ == '__main__':
+    print(generate('http://encoding.spec.whatwg.org/encodings.json'))
--- a/lib/webencodings/tests.py
+++ b/lib/webencodings/tests.py
@ -0,0 +1,153 @@
+# coding: utf8
+"""
+
+    webencodings.tests
+    ~~~~~~~~~~~~~~~~~~
+
+    A basic test suite for Encoding.
+
+    :copyright: Copyright 2012 by Simon Sapin
+    :license: BSD, see LICENSE for details.
+
+"""
+
+from __future__ import unicode_literals
+
+from . import (lookup, LABELS, decode, encode, iter_decode, iter_encode,
+               IncrementalDecoder, IncrementalEncoder, UTF8)
+
+
+def assert_raises(exception, function, *args, **kwargs):
+    try:
+        function(*args, **kwargs)
+    except exception:
+        return
+    else:  # pragma: no cover
+        raise AssertionError('Did not raise %s.' % exception)
+
+
+def test_labels():
+    assert lookup('utf-8').name == 'utf-8'
+    assert lookup('Utf-8').name == 'utf-8'
+    assert lookup('UTF-8').name == 'utf-8'
+    assert lookup('utf8').name == 'utf-8'
+    assert lookup('utf8').name == 'utf-8'
+    assert lookup('utf8 ').name == 'utf-8'
+    assert lookup(' \r\nutf8\t').name == 'utf-8'
+    assert lookup('u8') is None  # Python label.
+    assert lookup('utf-8 ') is None  # Non-ASCII white space.
+
+    assert lookup('US-ASCII').name == 'windows-1252'
+    assert lookup('iso-8859-1').name == 'windows-1252'
+    assert lookup('latin1').name == 'windows-1252'
+    assert lookup('LATIN1').name == 'windows-1252'
+    assert lookup('latin-1') is None
+    assert lookup('LATİN1') is None  # ASCII-only case insensitivity.
+
+
+def test_all_labels():
+    for label in LABELS:
+        assert decode(b'', label) == ('', lookup(label))
+        assert encode('', label) == b''
+        for repeat in [0, 1, 12]:
+            output, _ = iter_decode([b''] * repeat, label)
+            assert list(output) == []
+            assert list(iter_encode([''] * repeat, label)) == []
+        decoder = IncrementalDecoder(label)
+        assert decoder.decode(b'') == ''
+        assert decoder.decode(b'', final=True) == ''
+        encoder = IncrementalEncoder(label)
+        assert encoder.encode('') == b''
+        assert encoder.encode('', final=True) == b''
+    # All encoding names are valid labels too:
+    for name in set(LABELS.values()):
+        assert lookup(name).name == name
+
+
+def test_invalid_label():
+    assert_raises(LookupError, decode, b'\xEF\xBB\xBF\xc3\xa9', 'invalid')
+    assert_raises(LookupError, encode, 'é', 'invalid')
+    assert_raises(LookupError, iter_decode, [], 'invalid')
+    assert_raises(LookupError, iter_encode, [], 'invalid')
+    assert_raises(LookupError, IncrementalDecoder, 'invalid')
+    assert_raises(LookupError, IncrementalEncoder, 'invalid')
+
+
+def test_decode():
+    assert decode(b'\x80', 'latin1') == ('€', lookup('latin1'))
+    assert decode(b'\x80', lookup('latin1')) == ('€', lookup('latin1'))
+    assert decode(b'\xc3\xa9', 'utf8') == ('é', lookup('utf8'))
+    assert decode(b'\xc3\xa9', UTF8) == ('é', lookup('utf8'))
+    assert decode(b'\xc3\xa9', 'ascii') == ('Ã©', lookup('ascii'))
+    assert decode(b'\xEF\xBB\xBF\xc3\xa9', 'ascii') == ('é', lookup('utf8'))  # UTF-8 with BOM
+
+    assert decode(b'\xFE\xFF\x00\xe9', 'ascii') == ('é', lookup('utf-16be'))  # UTF-16-BE with BOM
+    assert decode(b'\xFF\xFE\xe9\x00', 'ascii') == ('é', lookup('utf-16le'))  # UTF-16-LE with BOM
+    assert decode(b'\xFE\xFF\xe9\x00', 'ascii') == ('\ue900', lookup('utf-16be'))
+    assert decode(b'\xFF\xFE\x00\xe9', 'ascii') == ('\ue900', lookup('utf-16le'))
+
+    assert decode(b'\x00\xe9', 'UTF-16BE') == ('é', lookup('utf-16be'))
+    assert decode(b'\xe9\x00', 'UTF-16LE') == ('é', lookup('utf-16le'))
+    assert decode(b'\xe9\x00', 'UTF-16') == ('é', lookup('utf-16le'))
+
+    assert decode(b'\xe9\x00', 'UTF-16BE') == ('\ue900', lookup('utf-16be'))
+    assert decode(b'\x00\xe9', 'UTF-16LE') == ('\ue900', lookup('utf-16le'))
+    assert decode(b'\x00\xe9', 'UTF-16') == ('\ue900', lookup('utf-16le'))
+
+
+def test_encode():
+    assert encode('é', 'latin1') == b'\xe9'
+    assert encode('é', 'utf8') == b'\xc3\xa9'
+    assert encode('é', 'utf8') == b'\xc3\xa9'
+    assert encode('é', 'utf-16') == b'\xe9\x00'
+    assert encode('é', 'utf-16le') == b'\xe9\x00'
+    assert encode('é', 'utf-16be') == b'\x00\xe9'
+
+
+def test_iter_decode():
+    def iter_decode_to_string(input, fallback_encoding):
+        output, _encoding = iter_decode(input, fallback_encoding)
+        return ''.join(output)
+    assert iter_decode_to_string([], 'latin1') == ''
+    assert iter_decode_to_string([b''], 'latin1') == ''
+    assert iter_decode_to_string([b'\xe9'], 'latin1') == 'é'
+    assert iter_decode_to_string([b'hello'], 'latin1') == 'hello'
+    assert iter_decode_to_string([b'he', b'llo'], 'latin1') == 'hello'
+    assert iter_decode_to_string([b'hell', b'o'], 'latin1') == 'hello'
+    assert iter_decode_to_string([b'\xc3\xa9'], 'latin1') == 'Ã©'
+    assert iter_decode_to_string([b'\xEF\xBB\xBF\xc3\xa9'], 'latin1') == 'é'
+    assert iter_decode_to_string([
+        b'\xEF\xBB\xBF', b'\xc3', b'\xa9'], 'latin1') == 'é'
+    assert iter_decode_to_string([
+        b'\xEF\xBB\xBF', b'a', b'\xc3'], 'latin1') == 'a\uFFFD'
+    assert iter_decode_to_string([
+        b'', b'\xEF', b'', b'', b'\xBB\xBF\xc3', b'\xa9'], 'latin1') == 'é'
+    assert iter_decode_to_string([b'\xEF\xBB\xBF'], 'latin1') == ''
+    assert iter_decode_to_string([b'\xEF\xBB'], 'latin1') == 'ï»'
+    assert iter_decode_to_string([b'\xFE\xFF\x00\xe9'], 'latin1') == 'é'
+    assert iter_decode_to_string([b'\xFF\xFE\xe9\x00'], 'latin1') == 'é'
+    assert iter_decode_to_string([
+        b'', b'\xFF', b'', b'', b'\xFE\xe9', b'\x00'], 'latin1') == 'é'
+    assert iter_decode_to_string([
+        b'', b'h\xe9', b'llo'], 'x-user-defined') == 'h\uF7E9llo'
+
+
+def test_iter_encode():
+    assert b''.join(iter_encode([], 'latin1')) == b''
+    assert b''.join(iter_encode([''], 'latin1')) == b''
+    assert b''.join(iter_encode(['é'], 'latin1')) == b'\xe9'
+    assert b''.join(iter_encode(['', 'é', '', ''], 'latin1')) == b'\xe9'
+    assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16')) == b'\xe9\x00'
+    assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16le')) == b'\xe9\x00'
+    assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16be')) == b'\x00\xe9'
+    assert b''.join(iter_encode([
+        '', 'h\uF7E9', '', 'llo'], 'x-user-defined')) == b'h\xe9llo'
+
+
+def test_x_user_defined():
+    encoded = b'2,\x0c\x0b\x1aO\xd9#\xcb\x0f\xc9\xbbt\xcf\xa8\xca'
+    decoded = '2,\x0c\x0b\x1aO\uf7d9#\uf7cb\x0f\uf7c9\uf7bbt\uf7cf\uf7a8\uf7ca'
+    encoded = b'aa'
+    decoded = 'aa'
+    assert decode(encoded, 'x-user-defined') == (decoded, lookup('x-user-defined'))
+    assert encode(decoded, 'x-user-defined') == encoded
--- a/lib/webencodings/x_user_defined.py
+++ b/lib/webencodings/x_user_defined.py
@ -0,0 +1,325 @@
+# coding: utf8
+"""
+
+    webencodings.x_user_defined
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    An implementation of the x-user-defined encoding.
+
+    :copyright: Copyright 2012 by Simon Sapin
+    :license: BSD, see LICENSE for details.
+
+"""
+
+from __future__ import unicode_literals
+
+import codecs
+
+
+### Codec APIs
+
+class Codec(codecs.Codec):
+
+    def encode(self, input, errors='strict'):
+        return codecs.charmap_encode(input, errors, encoding_table)
+
+    def decode(self, input, errors='strict'):
+        return codecs.charmap_decode(input, errors, decoding_table)
+
+
+class IncrementalEncoder(codecs.IncrementalEncoder):
+    def encode(self, input, final=False):
+        return codecs.charmap_encode(input, self.errors, encoding_table)[0]
+
+
+class IncrementalDecoder(codecs.IncrementalDecoder):
+    def decode(self, input, final=False):
+        return codecs.charmap_decode(input, self.errors, decoding_table)[0]
+
+
+class StreamWriter(Codec, codecs.StreamWriter):
+    pass
+
+
+class StreamReader(Codec, codecs.StreamReader):
+    pass
+
+
+### encodings module API
+
+codec_info = codecs.CodecInfo(
+    name='x-user-defined',
+    encode=Codec().encode,
+    decode=Codec().decode,
+    incrementalencoder=IncrementalEncoder,
+    incrementaldecoder=IncrementalDecoder,
+    streamreader=StreamReader,
+    streamwriter=StreamWriter,
+)
+
+
+### Decoding Table
+
+# Python 3:
+# for c in range(256): print('    %r' % chr(c if c < 128 else c + 0xF700))
+decoding_table = (
+    '\x00'
+    '\x01'
+    '\x02'
+    '\x03'
+    '\x04'
+    '\x05'
+    '\x06'
+    '\x07'
+    '\x08'
+    '\t'
+    '\n'
+    '\x0b'
+    '\x0c'
+    '\r'
+    '\x0e'
+    '\x0f'
+    '\x10'
+    '\x11'
+    '\x12'
+    '\x13'
+    '\x14'
+    '\x15'
+    '\x16'
+    '\x17'
+    '\x18'
+    '\x19'
+    '\x1a'
+    '\x1b'
+    '\x1c'
+    '\x1d'
+    '\x1e'
+    '\x1f'
+    ' '
+    '!'
+    '"'
+    '#'
+    '$'
+    '%'
+    '&'
+    "'"
+    '('
+    ')'
+    '*'
+    '+'
+    ','
+    '-'
+    '.'
+    '/'
+    '0'
+    '1'
+    '2'
+    '3'
+    '4'
+    '5'
+    '6'
+    '7'
+    '8'
+    '9'
+    ':'
+    ';'
+    '<'
+    '='
+    '>'
+    '?'
+    '@'
+    'A'
+    'B'
+    'C'
+    'D'
+    'E'
+    'F'
+    'G'
+    'H'
+    'I'
+    'J'
+    'K'
+    'L'
+    'M'
+    'N'
+    'O'
+    'P'
+    'Q'
+    'R'
+    'S'
+    'T'
+    'U'
+    'V'
+    'W'
+    'X'
+    'Y'
+    'Z'
+    '['
+    '\\'
+    ']'
+    '^'
+    '_'
+    '`'
+    'a'
+    'b'
+    'c'
+    'd'
+    'e'
+    'f'
+    'g'
+    'h'
+    'i'
+    'j'
+    'k'
+    'l'
+    'm'
+    'n'
+    'o'
+    'p'
+    'q'
+    'r'
+    's'
+    't'
+    'u'
+    'v'
+    'w'
+    'x'
+    'y'
+    'z'
+    '{'
+    '|'
+    '}'
+    '~'
+    '\x7f'
+    '\uf780'
+    '\uf781'
+    '\uf782'
+    '\uf783'
+    '\uf784'
+    '\uf785'
+    '\uf786'
+    '\uf787'
+    '\uf788'
+    '\uf789'
+    '\uf78a'
+    '\uf78b'
+    '\uf78c'
+    '\uf78d'
+    '\uf78e'
+    '\uf78f'
+    '\uf790'
+    '\uf791'
+    '\uf792'
+    '\uf793'
+    '\uf794'
+    '\uf795'
+    '\uf796'
+    '\uf797'
+    '\uf798'
+    '\uf799'
+    '\uf79a'
+    '\uf79b'
+    '\uf79c'
+    '\uf79d'
+    '\uf79e'
+    '\uf79f'
+    '\uf7a0'
+    '\uf7a1'
+    '\uf7a2'
+    '\uf7a3'
+    '\uf7a4'
+    '\uf7a5'
+    '\uf7a6'
+    '\uf7a7'
+    '\uf7a8'
+    '\uf7a9'
+    '\uf7aa'
+    '\uf7ab'
+    '\uf7ac'
+    '\uf7ad'
+    '\uf7ae'
+    '\uf7af'
+    '\uf7b0'
+    '\uf7b1'
+    '\uf7b2'
+    '\uf7b3'
+    '\uf7b4'
+    '\uf7b5'
+    '\uf7b6'
+    '\uf7b7'
+    '\uf7b8'
+    '\uf7b9'
+    '\uf7ba'
+    '\uf7bb'
+    '\uf7bc'
+    '\uf7bd'
+    '\uf7be'
+    '\uf7bf'
+    '\uf7c0'
+    '\uf7c1'
+    '\uf7c2'
+    '\uf7c3'
+    '\uf7c4'
+    '\uf7c5'
+    '\uf7c6'
+    '\uf7c7'
+    '\uf7c8'
+    '\uf7c9'
+    '\uf7ca'
+    '\uf7cb'
+    '\uf7cc'
+    '\uf7cd'
+    '\uf7ce'
+    '\uf7cf'
+    '\uf7d0'
+    '\uf7d1'
+    '\uf7d2'
+    '\uf7d3'
+    '\uf7d4'
+    '\uf7d5'
+    '\uf7d6'
+    '\uf7d7'
+    '\uf7d8'
+    '\uf7d9'
+    '\uf7da'
+    '\uf7db'
+    '\uf7dc'
+    '\uf7dd'
+    '\uf7de'
+    '\uf7df'
+    '\uf7e0'
+    '\uf7e1'
+    '\uf7e2'
+    '\uf7e3'
+    '\uf7e4'
+    '\uf7e5'
+    '\uf7e6'
+    '\uf7e7'
+    '\uf7e8'
+    '\uf7e9'
+    '\uf7ea'
+    '\uf7eb'
+    '\uf7ec'
+    '\uf7ed'
+    '\uf7ee'
+    '\uf7ef'
+    '\uf7f0'
+    '\uf7f1'
+    '\uf7f2'
+    '\uf7f3'
+    '\uf7f4'
+    '\uf7f5'
+    '\uf7f6'
+    '\uf7f7'
+    '\uf7f8'
+    '\uf7f9'
+    '\uf7fa'
+    '\uf7fb'
+    '\uf7fc'
+    '\uf7fd'
+    '\uf7fe'
+    '\uf7ff'
+)
+
+### Encoding table
+encoding_table = codecs.charmap_build(decoding_table)