Merge pull request #899 from JackDandy/feature/AddWebencodings

Add webencodings 0.5 (3970651) to assist parsing legacy web content.
This commit is contained in:
JackDandy 2017-02-01 13:32:38 +00:00 committed by GitHub
commit e75a5d7c34
6 changed files with 1111 additions and 0 deletions

View file

@ -6,6 +6,7 @@
* Change improve page load time when loading images
* Update isotope library 2.2.2 to 3.0.1
* Add lazyload package 3.0.0 (2e318b1)
* Add webencodings 0.5 (3970651) to assist parsing legacy web content
* Change improve add show search results by comparing search term to an additional unidecoded result set
* Change webserver startup to correctly use xheaders in reverse proxy or load balance set-ups
* Update backports_abc 0.4 to 0.5

View file

@ -0,0 +1,342 @@
# coding: utf8
"""
webencodings
~~~~~~~~~~~~
This is a Python implementation of the `WHATWG Encoding standard
<http://encoding.spec.whatwg.org/>`. See README for details.
:copyright: Copyright 2012 by Simon Sapin
:license: BSD, see LICENSE for details.
"""
from __future__ import unicode_literals
import codecs
from .labels import LABELS
VERSION = '0.5'
# Some names in Encoding are not valid Python aliases. Remap these.
PYTHON_NAMES = {
'iso-8859-8-i': 'iso-8859-8',
'x-mac-cyrillic': 'mac-cyrillic',
'macintosh': 'mac-roman',
'windows-874': 'cp874'}
CACHE = {}
def ascii_lower(string):
r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.
:param string: An Unicode string.
:returns: A new Unicode string.
This is used for `ASCII case-insensitive
<http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
matching of encoding labels.
The same matching is also used, among other things,
for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.
This is different from the :meth:`~py:str.lower` method of Unicode strings
which also affect non-ASCII characters,
sometimes mapping them into the ASCII range:
>>> keyword = u'Bac\N{KELVIN SIGN}ground'
>>> assert keyword.lower() == u'background'
>>> assert ascii_lower(keyword) != keyword.lower()
>>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'
"""
# This turns out to be faster than unicode.translate()
return string.encode('utf8').lower().decode('utf8')
def lookup(label):
"""
Look for an encoding by its label.
This is the specs `get an encoding
<http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
Supported labels are listed there.
:param label: A string.
:returns:
An :class:`Encoding` object, or :obj:`None` for an unknown label.
"""
# Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020.
label = ascii_lower(label.strip('\t\n\f\r '))
name = LABELS.get(label)
if name is None:
return None
encoding = CACHE.get(name)
if encoding is None:
if name == 'x-user-defined':
from .x_user_defined import codec_info
else:
python_name = PYTHON_NAMES.get(name, name)
# Any python_name value that gets to here should be valid.
codec_info = codecs.lookup(python_name)
encoding = Encoding(name, codec_info)
CACHE[name] = encoding
return encoding
def _get_encoding(encoding_or_label):
"""
Accept either an encoding object or label.
:param encoding: An :class:`Encoding` object or a label string.
:returns: An :class:`Encoding` object.
:raises: :exc:`~exceptions.LookupError` for an unknown label.
"""
if hasattr(encoding_or_label, 'codec_info'):
return encoding_or_label
encoding = lookup(encoding_or_label)
if encoding is None:
raise LookupError('Unknown encoding label: %r' % encoding_or_label)
return encoding
class Encoding(object):
"""Reresents a character encoding such as UTF-8,
that can be used for decoding or encoding.
.. attribute:: name
Canonical name of the encoding
.. attribute:: codec_info
The actual implementation of the encoding,
a stdlib :class:`~codecs.CodecInfo` object.
See :func:`codecs.register`.
"""
def __init__(self, name, codec_info):
self.name = name
self.codec_info = codec_info
def __repr__(self):
return '<Encoding %s>' % self.name
#: The UTF-8 encoding. Should be used for new content and formats.
UTF8 = lookup('utf-8')
_UTF16LE = lookup('utf-16le')
_UTF16BE = lookup('utf-16be')
def decode(input, fallback_encoding, errors='replace'):
"""
Decode a single string.
:param input: A byte string
:param fallback_encoding:
An :class:`Encoding` object or a label string.
The encoding to use if :obj:`input` does note have a BOM.
:param errors: Type of error handling. See :func:`codecs.register`.
:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
:return:
A ``(output, encoding)`` tuple of an Unicode string
and an :obj:`Encoding`.
"""
# Fail early if `encoding` is an invalid label.
fallback_encoding = _get_encoding(fallback_encoding)
bom_encoding, input = _detect_bom(input)
encoding = bom_encoding or fallback_encoding
return encoding.codec_info.decode(input, errors)[0], encoding
def _detect_bom(input):
"""Return (bom_encoding, input), with any BOM removed from the input."""
if input.startswith(b'\xFF\xFE'):
return _UTF16LE, input[2:]
if input.startswith(b'\xFE\xFF'):
return _UTF16BE, input[2:]
if input.startswith(b'\xEF\xBB\xBF'):
return UTF8, input[3:]
return None, input
def encode(input, encoding=UTF8, errors='strict'):
"""
Encode a single string.
:param input: An Unicode string.
:param encoding: An :class:`Encoding` object or a label string.
:param errors: Type of error handling. See :func:`codecs.register`.
:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
:return: A byte string.
"""
return _get_encoding(encoding).codec_info.encode(input, errors)[0]
def iter_decode(input, fallback_encoding, errors='replace'):
"""
"Pull"-based decoder.
:param input:
An iterable of byte strings.
The input is first consumed just enough to determine the encoding
based on the precense of a BOM,
then consumed on demand when the return value is.
:param fallback_encoding:
An :class:`Encoding` object or a label string.
The encoding to use if :obj:`input` does note have a BOM.
:param errors: Type of error handling. See :func:`codecs.register`.
:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
:returns:
An ``(output, encoding)`` tuple.
:obj:`output` is an iterable of Unicode strings,
:obj:`encoding` is the :obj:`Encoding` that is being used.
"""
decoder = IncrementalDecoder(fallback_encoding, errors)
generator = _iter_decode_generator(input, decoder)
encoding = next(generator)
return generator, encoding
def _iter_decode_generator(input, decoder):
"""Return a generator that first yields the :obj:`Encoding`,
then yields output chukns as Unicode strings.
"""
decode = decoder.decode
input = iter(input)
for chunck in input:
output = decode(chunck)
if output:
assert decoder.encoding is not None
yield decoder.encoding
yield output
break
else:
# Input exhausted without determining the encoding
output = decode(b'', final=True)
assert decoder.encoding is not None
yield decoder.encoding
if output:
yield output
return
for chunck in input:
output = decode(chunck)
if output:
yield output
output = decode(b'', final=True)
if output:
yield output
def iter_encode(input, encoding=UTF8, errors='strict'):
"""
Pull-based encoder.
:param input: An iterable of Unicode strings.
:param encoding: An :class:`Encoding` object or a label string.
:param errors: Type of error handling. See :func:`codecs.register`.
:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
:returns: An iterable of byte strings.
"""
# Fail early if `encoding` is an invalid label.
encode = IncrementalEncoder(encoding, errors).encode
return _iter_encode_generator(input, encode)
def _iter_encode_generator(input, encode):
for chunck in input:
output = encode(chunck)
if output:
yield output
output = encode('', final=True)
if output:
yield output
class IncrementalDecoder(object):
"""
Push-based decoder.
:param fallback_encoding:
An :class:`Encoding` object or a label string.
The encoding to use if :obj:`input` does note have a BOM.
:param errors: Type of error handling. See :func:`codecs.register`.
:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
"""
def __init__(self, fallback_encoding, errors='replace'):
# Fail early if `encoding` is an invalid label.
self._fallback_encoding = _get_encoding(fallback_encoding)
self._errors = errors
self._buffer = b''
self._decoder = None
#: The actual :class:`Encoding` that is being used,
#: or :obj:`None` if that is not determined yet.
#: (Ie. if there is not enough input yet to determine
#: if there is a BOM.)
self.encoding = None # Not known yet.
def decode(self, input, final=False):
"""Decode one chunk of the input.
:param input: A byte string.
:param final:
Indicate that no more input is available.
Must be :obj:`True` if this is the last call.
:returns: An Unicode string.
"""
decoder = self._decoder
if decoder is not None:
return decoder(input, final)
input = self._buffer + input
encoding, input = _detect_bom(input)
if encoding is None:
if len(input) < 3 and not final: # Not enough data yet.
self._buffer = input
return ''
else: # No BOM
encoding = self._fallback_encoding
decoder = encoding.codec_info.incrementaldecoder(self._errors).decode
self._decoder = decoder
self.encoding = encoding
return decoder(input, final)
class IncrementalEncoder(object):
"""
Push-based encoder.
:param encoding: An :class:`Encoding` object or a label string.
:param errors: Type of error handling. See :func:`codecs.register`.
:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
.. method:: encode(input, final=False)
:param input: An Unicode string.
:param final:
Indicate that no more input is available.
Must be :obj:`True` if this is the last call.
:returns: A byte string.
"""
def __init__(self, encoding=UTF8, errors='strict'):
encoding = _get_encoding(encoding)
self.encode = encoding.codec_info.incrementalencoder(errors).encode

231
lib/webencodings/labels.py Normal file
View file

@ -0,0 +1,231 @@
"""
webencodings.labels
~~~~~~~~~~~~~~~~~~~
Map encoding labels to their name.
:copyright: Copyright 2012 by Simon Sapin
:license: BSD, see LICENSE for details.
"""
# XXX Do not edit!
# This file is automatically generated by mklabels.py
LABELS = {
'unicode-1-1-utf-8': 'utf-8',
'utf-8': 'utf-8',
'utf8': 'utf-8',
'866': 'ibm866',
'cp866': 'ibm866',
'csibm866': 'ibm866',
'ibm866': 'ibm866',
'csisolatin2': 'iso-8859-2',
'iso-8859-2': 'iso-8859-2',
'iso-ir-101': 'iso-8859-2',
'iso8859-2': 'iso-8859-2',
'iso88592': 'iso-8859-2',
'iso_8859-2': 'iso-8859-2',
'iso_8859-2:1987': 'iso-8859-2',
'l2': 'iso-8859-2',
'latin2': 'iso-8859-2',
'csisolatin3': 'iso-8859-3',
'iso-8859-3': 'iso-8859-3',
'iso-ir-109': 'iso-8859-3',
'iso8859-3': 'iso-8859-3',
'iso88593': 'iso-8859-3',
'iso_8859-3': 'iso-8859-3',
'iso_8859-3:1988': 'iso-8859-3',
'l3': 'iso-8859-3',
'latin3': 'iso-8859-3',
'csisolatin4': 'iso-8859-4',
'iso-8859-4': 'iso-8859-4',
'iso-ir-110': 'iso-8859-4',
'iso8859-4': 'iso-8859-4',
'iso88594': 'iso-8859-4',
'iso_8859-4': 'iso-8859-4',
'iso_8859-4:1988': 'iso-8859-4',
'l4': 'iso-8859-4',
'latin4': 'iso-8859-4',
'csisolatincyrillic': 'iso-8859-5',
'cyrillic': 'iso-8859-5',
'iso-8859-5': 'iso-8859-5',
'iso-ir-144': 'iso-8859-5',
'iso8859-5': 'iso-8859-5',
'iso88595': 'iso-8859-5',
'iso_8859-5': 'iso-8859-5',
'iso_8859-5:1988': 'iso-8859-5',
'arabic': 'iso-8859-6',
'asmo-708': 'iso-8859-6',
'csiso88596e': 'iso-8859-6',
'csiso88596i': 'iso-8859-6',
'csisolatinarabic': 'iso-8859-6',
'ecma-114': 'iso-8859-6',
'iso-8859-6': 'iso-8859-6',
'iso-8859-6-e': 'iso-8859-6',
'iso-8859-6-i': 'iso-8859-6',
'iso-ir-127': 'iso-8859-6',
'iso8859-6': 'iso-8859-6',
'iso88596': 'iso-8859-6',
'iso_8859-6': 'iso-8859-6',
'iso_8859-6:1987': 'iso-8859-6',
'csisolatingreek': 'iso-8859-7',
'ecma-118': 'iso-8859-7',
'elot_928': 'iso-8859-7',
'greek': 'iso-8859-7',
'greek8': 'iso-8859-7',
'iso-8859-7': 'iso-8859-7',
'iso-ir-126': 'iso-8859-7',
'iso8859-7': 'iso-8859-7',
'iso88597': 'iso-8859-7',
'iso_8859-7': 'iso-8859-7',
'iso_8859-7:1987': 'iso-8859-7',
'sun_eu_greek': 'iso-8859-7',
'csiso88598e': 'iso-8859-8',
'csisolatinhebrew': 'iso-8859-8',
'hebrew': 'iso-8859-8',
'iso-8859-8': 'iso-8859-8',
'iso-8859-8-e': 'iso-8859-8',
'iso-ir-138': 'iso-8859-8',
'iso8859-8': 'iso-8859-8',
'iso88598': 'iso-8859-8',
'iso_8859-8': 'iso-8859-8',
'iso_8859-8:1988': 'iso-8859-8',
'visual': 'iso-8859-8',
'csiso88598i': 'iso-8859-8-i',
'iso-8859-8-i': 'iso-8859-8-i',
'logical': 'iso-8859-8-i',
'csisolatin6': 'iso-8859-10',
'iso-8859-10': 'iso-8859-10',
'iso-ir-157': 'iso-8859-10',
'iso8859-10': 'iso-8859-10',
'iso885910': 'iso-8859-10',
'l6': 'iso-8859-10',
'latin6': 'iso-8859-10',
'iso-8859-13': 'iso-8859-13',
'iso8859-13': 'iso-8859-13',
'iso885913': 'iso-8859-13',
'iso-8859-14': 'iso-8859-14',
'iso8859-14': 'iso-8859-14',
'iso885914': 'iso-8859-14',
'csisolatin9': 'iso-8859-15',
'iso-8859-15': 'iso-8859-15',
'iso8859-15': 'iso-8859-15',
'iso885915': 'iso-8859-15',
'iso_8859-15': 'iso-8859-15',
'l9': 'iso-8859-15',
'iso-8859-16': 'iso-8859-16',
'cskoi8r': 'koi8-r',
'koi': 'koi8-r',
'koi8': 'koi8-r',
'koi8-r': 'koi8-r',
'koi8_r': 'koi8-r',
'koi8-u': 'koi8-u',
'csmacintosh': 'macintosh',
'mac': 'macintosh',
'macintosh': 'macintosh',
'x-mac-roman': 'macintosh',
'dos-874': 'windows-874',
'iso-8859-11': 'windows-874',
'iso8859-11': 'windows-874',
'iso885911': 'windows-874',
'tis-620': 'windows-874',
'windows-874': 'windows-874',
'cp1250': 'windows-1250',
'windows-1250': 'windows-1250',
'x-cp1250': 'windows-1250',
'cp1251': 'windows-1251',
'windows-1251': 'windows-1251',
'x-cp1251': 'windows-1251',
'ansi_x3.4-1968': 'windows-1252',
'ascii': 'windows-1252',
'cp1252': 'windows-1252',
'cp819': 'windows-1252',
'csisolatin1': 'windows-1252',
'ibm819': 'windows-1252',
'iso-8859-1': 'windows-1252',
'iso-ir-100': 'windows-1252',
'iso8859-1': 'windows-1252',
'iso88591': 'windows-1252',
'iso_8859-1': 'windows-1252',
'iso_8859-1:1987': 'windows-1252',
'l1': 'windows-1252',
'latin1': 'windows-1252',
'us-ascii': 'windows-1252',
'windows-1252': 'windows-1252',
'x-cp1252': 'windows-1252',
'cp1253': 'windows-1253',
'windows-1253': 'windows-1253',
'x-cp1253': 'windows-1253',
'cp1254': 'windows-1254',
'csisolatin5': 'windows-1254',
'iso-8859-9': 'windows-1254',
'iso-ir-148': 'windows-1254',
'iso8859-9': 'windows-1254',
'iso88599': 'windows-1254',
'iso_8859-9': 'windows-1254',
'iso_8859-9:1989': 'windows-1254',
'l5': 'windows-1254',
'latin5': 'windows-1254',
'windows-1254': 'windows-1254',
'x-cp1254': 'windows-1254',
'cp1255': 'windows-1255',
'windows-1255': 'windows-1255',
'x-cp1255': 'windows-1255',
'cp1256': 'windows-1256',
'windows-1256': 'windows-1256',
'x-cp1256': 'windows-1256',
'cp1257': 'windows-1257',
'windows-1257': 'windows-1257',
'x-cp1257': 'windows-1257',
'cp1258': 'windows-1258',
'windows-1258': 'windows-1258',
'x-cp1258': 'windows-1258',
'x-mac-cyrillic': 'x-mac-cyrillic',
'x-mac-ukrainian': 'x-mac-cyrillic',
'chinese': 'gbk',
'csgb2312': 'gbk',
'csiso58gb231280': 'gbk',
'gb2312': 'gbk',
'gb_2312': 'gbk',
'gb_2312-80': 'gbk',
'gbk': 'gbk',
'iso-ir-58': 'gbk',
'x-gbk': 'gbk',
'gb18030': 'gb18030',
'hz-gb-2312': 'hz-gb-2312',
'big5': 'big5',
'big5-hkscs': 'big5',
'cn-big5': 'big5',
'csbig5': 'big5',
'x-x-big5': 'big5',
'cseucpkdfmtjapanese': 'euc-jp',
'euc-jp': 'euc-jp',
'x-euc-jp': 'euc-jp',
'csiso2022jp': 'iso-2022-jp',
'iso-2022-jp': 'iso-2022-jp',
'csshiftjis': 'shift_jis',
'ms_kanji': 'shift_jis',
'shift-jis': 'shift_jis',
'shift_jis': 'shift_jis',
'sjis': 'shift_jis',
'windows-31j': 'shift_jis',
'x-sjis': 'shift_jis',
'cseuckr': 'euc-kr',
'csksc56011987': 'euc-kr',
'euc-kr': 'euc-kr',
'iso-ir-149': 'euc-kr',
'korean': 'euc-kr',
'ks_c_5601-1987': 'euc-kr',
'ks_c_5601-1989': 'euc-kr',
'ksc5601': 'euc-kr',
'ksc_5601': 'euc-kr',
'windows-949': 'euc-kr',
'csiso2022kr': 'iso-2022-kr',
'iso-2022-kr': 'iso-2022-kr',
'utf-16be': 'utf-16be',
'utf-16': 'utf-16le',
'utf-16le': 'utf-16le',
'x-user-defined': 'x-user-defined',
}

View file

@ -0,0 +1,59 @@
"""
webencodings.mklabels
~~~~~~~~~~~~~~~~~~~~~
Regenarate the webencodings.labels module.
:copyright: Copyright 2012 by Simon Sapin
:license: BSD, see LICENSE for details.
"""
import json
try:
from urllib import urlopen
except ImportError:
from urllib.request import urlopen
def assert_lower(string):
assert string == string.lower()
return string
def generate(url):
parts = ['''\
"""
webencodings.labels
~~~~~~~~~~~~~~~~~~~
Map encoding labels to their name.
:copyright: Copyright 2012 by Simon Sapin
:license: BSD, see LICENSE for details.
"""
# XXX Do not edit!
# This file is automatically generated by mklabels.py
LABELS = {
''']
labels = [
(repr(assert_lower(label)).lstrip('u'),
repr(encoding['name']).lstrip('u'))
for category in json.loads(urlopen(url).read().decode('ascii'))
for encoding in category['encodings']
for label in encoding['labels']]
max_len = max(len(label) for label, name in labels)
parts.extend(
' %s:%s %s,\n' % (label, ' ' * (max_len - len(label)), name)
for label, name in labels)
parts.append('}')
return ''.join(parts)
if __name__ == '__main__':
print(generate('http://encoding.spec.whatwg.org/encodings.json'))

153
lib/webencodings/tests.py Normal file
View file

@ -0,0 +1,153 @@
# coding: utf8
"""
webencodings.tests
~~~~~~~~~~~~~~~~~~
A basic test suite for Encoding.
:copyright: Copyright 2012 by Simon Sapin
:license: BSD, see LICENSE for details.
"""
from __future__ import unicode_literals
from . import (lookup, LABELS, decode, encode, iter_decode, iter_encode,
IncrementalDecoder, IncrementalEncoder, UTF8)
def assert_raises(exception, function, *args, **kwargs):
try:
function(*args, **kwargs)
except exception:
return
else: # pragma: no cover
raise AssertionError('Did not raise %s.' % exception)
def test_labels():
assert lookup('utf-8').name == 'utf-8'
assert lookup('Utf-8').name == 'utf-8'
assert lookup('UTF-8').name == 'utf-8'
assert lookup('utf8').name == 'utf-8'
assert lookup('utf8').name == 'utf-8'
assert lookup('utf8 ').name == 'utf-8'
assert lookup(' \r\nutf8\t').name == 'utf-8'
assert lookup('u8') is None # Python label.
assert lookup('utf-8 ') is None # Non-ASCII white space.
assert lookup('US-ASCII').name == 'windows-1252'
assert lookup('iso-8859-1').name == 'windows-1252'
assert lookup('latin1').name == 'windows-1252'
assert lookup('LATIN1').name == 'windows-1252'
assert lookup('latin-1') is None
assert lookup('LATİN1') is None # ASCII-only case insensitivity.
def test_all_labels():
for label in LABELS:
assert decode(b'', label) == ('', lookup(label))
assert encode('', label) == b''
for repeat in [0, 1, 12]:
output, _ = iter_decode([b''] * repeat, label)
assert list(output) == []
assert list(iter_encode([''] * repeat, label)) == []
decoder = IncrementalDecoder(label)
assert decoder.decode(b'') == ''
assert decoder.decode(b'', final=True) == ''
encoder = IncrementalEncoder(label)
assert encoder.encode('') == b''
assert encoder.encode('', final=True) == b''
# All encoding names are valid labels too:
for name in set(LABELS.values()):
assert lookup(name).name == name
def test_invalid_label():
assert_raises(LookupError, decode, b'\xEF\xBB\xBF\xc3\xa9', 'invalid')
assert_raises(LookupError, encode, 'é', 'invalid')
assert_raises(LookupError, iter_decode, [], 'invalid')
assert_raises(LookupError, iter_encode, [], 'invalid')
assert_raises(LookupError, IncrementalDecoder, 'invalid')
assert_raises(LookupError, IncrementalEncoder, 'invalid')
def test_decode():
assert decode(b'\x80', 'latin1') == ('', lookup('latin1'))
assert decode(b'\x80', lookup('latin1')) == ('', lookup('latin1'))
assert decode(b'\xc3\xa9', 'utf8') == ('é', lookup('utf8'))
assert decode(b'\xc3\xa9', UTF8) == ('é', lookup('utf8'))
assert decode(b'\xc3\xa9', 'ascii') == ('é', lookup('ascii'))
assert decode(b'\xEF\xBB\xBF\xc3\xa9', 'ascii') == ('é', lookup('utf8')) # UTF-8 with BOM
assert decode(b'\xFE\xFF\x00\xe9', 'ascii') == ('é', lookup('utf-16be')) # UTF-16-BE with BOM
assert decode(b'\xFF\xFE\xe9\x00', 'ascii') == ('é', lookup('utf-16le')) # UTF-16-LE with BOM
assert decode(b'\xFE\xFF\xe9\x00', 'ascii') == ('\ue900', lookup('utf-16be'))
assert decode(b'\xFF\xFE\x00\xe9', 'ascii') == ('\ue900', lookup('utf-16le'))
assert decode(b'\x00\xe9', 'UTF-16BE') == ('é', lookup('utf-16be'))
assert decode(b'\xe9\x00', 'UTF-16LE') == ('é', lookup('utf-16le'))
assert decode(b'\xe9\x00', 'UTF-16') == ('é', lookup('utf-16le'))
assert decode(b'\xe9\x00', 'UTF-16BE') == ('\ue900', lookup('utf-16be'))
assert decode(b'\x00\xe9', 'UTF-16LE') == ('\ue900', lookup('utf-16le'))
assert decode(b'\x00\xe9', 'UTF-16') == ('\ue900', lookup('utf-16le'))
def test_encode():
assert encode('é', 'latin1') == b'\xe9'
assert encode('é', 'utf8') == b'\xc3\xa9'
assert encode('é', 'utf8') == b'\xc3\xa9'
assert encode('é', 'utf-16') == b'\xe9\x00'
assert encode('é', 'utf-16le') == b'\xe9\x00'
assert encode('é', 'utf-16be') == b'\x00\xe9'
def test_iter_decode():
def iter_decode_to_string(input, fallback_encoding):
output, _encoding = iter_decode(input, fallback_encoding)
return ''.join(output)
assert iter_decode_to_string([], 'latin1') == ''
assert iter_decode_to_string([b''], 'latin1') == ''
assert iter_decode_to_string([b'\xe9'], 'latin1') == 'é'
assert iter_decode_to_string([b'hello'], 'latin1') == 'hello'
assert iter_decode_to_string([b'he', b'llo'], 'latin1') == 'hello'
assert iter_decode_to_string([b'hell', b'o'], 'latin1') == 'hello'
assert iter_decode_to_string([b'\xc3\xa9'], 'latin1') == 'é'
assert iter_decode_to_string([b'\xEF\xBB\xBF\xc3\xa9'], 'latin1') == 'é'
assert iter_decode_to_string([
b'\xEF\xBB\xBF', b'\xc3', b'\xa9'], 'latin1') == 'é'
assert iter_decode_to_string([
b'\xEF\xBB\xBF', b'a', b'\xc3'], 'latin1') == 'a\uFFFD'
assert iter_decode_to_string([
b'', b'\xEF', b'', b'', b'\xBB\xBF\xc3', b'\xa9'], 'latin1') == 'é'
assert iter_decode_to_string([b'\xEF\xBB\xBF'], 'latin1') == ''
assert iter_decode_to_string([b'\xEF\xBB'], 'latin1') == 'ï»'
assert iter_decode_to_string([b'\xFE\xFF\x00\xe9'], 'latin1') == 'é'
assert iter_decode_to_string([b'\xFF\xFE\xe9\x00'], 'latin1') == 'é'
assert iter_decode_to_string([
b'', b'\xFF', b'', b'', b'\xFE\xe9', b'\x00'], 'latin1') == 'é'
assert iter_decode_to_string([
b'', b'h\xe9', b'llo'], 'x-user-defined') == 'h\uF7E9llo'
def test_iter_encode():
assert b''.join(iter_encode([], 'latin1')) == b''
assert b''.join(iter_encode([''], 'latin1')) == b''
assert b''.join(iter_encode(['é'], 'latin1')) == b'\xe9'
assert b''.join(iter_encode(['', 'é', '', ''], 'latin1')) == b'\xe9'
assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16')) == b'\xe9\x00'
assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16le')) == b'\xe9\x00'
assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16be')) == b'\x00\xe9'
assert b''.join(iter_encode([
'', 'h\uF7E9', '', 'llo'], 'x-user-defined')) == b'h\xe9llo'
def test_x_user_defined():
encoded = b'2,\x0c\x0b\x1aO\xd9#\xcb\x0f\xc9\xbbt\xcf\xa8\xca'
decoded = '2,\x0c\x0b\x1aO\uf7d9#\uf7cb\x0f\uf7c9\uf7bbt\uf7cf\uf7a8\uf7ca'
encoded = b'aa'
decoded = 'aa'
assert decode(encoded, 'x-user-defined') == (decoded, lookup('x-user-defined'))
assert encode(decoded, 'x-user-defined') == encoded

View file

@ -0,0 +1,325 @@
# coding: utf8
"""
webencodings.x_user_defined
~~~~~~~~~~~~~~~~~~~~~~~~~~~
An implementation of the x-user-defined encoding.
:copyright: Copyright 2012 by Simon Sapin
:license: BSD, see LICENSE for details.
"""
from __future__ import unicode_literals
import codecs
### Codec APIs
class Codec(codecs.Codec):
def encode(self, input, errors='strict'):
return codecs.charmap_encode(input, errors, encoding_table)
def decode(self, input, errors='strict'):
return codecs.charmap_decode(input, errors, decoding_table)
class IncrementalEncoder(codecs.IncrementalEncoder):
def encode(self, input, final=False):
return codecs.charmap_encode(input, self.errors, encoding_table)[0]
class IncrementalDecoder(codecs.IncrementalDecoder):
def decode(self, input, final=False):
return codecs.charmap_decode(input, self.errors, decoding_table)[0]
class StreamWriter(Codec, codecs.StreamWriter):
pass
class StreamReader(Codec, codecs.StreamReader):
pass
### encodings module API
codec_info = codecs.CodecInfo(
name='x-user-defined',
encode=Codec().encode,
decode=Codec().decode,
incrementalencoder=IncrementalEncoder,
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
)
### Decoding Table
# Python 3:
# for c in range(256): print(' %r' % chr(c if c < 128 else c + 0xF700))
decoding_table = (
'\x00'
'\x01'
'\x02'
'\x03'
'\x04'
'\x05'
'\x06'
'\x07'
'\x08'
'\t'
'\n'
'\x0b'
'\x0c'
'\r'
'\x0e'
'\x0f'
'\x10'
'\x11'
'\x12'
'\x13'
'\x14'
'\x15'
'\x16'
'\x17'
'\x18'
'\x19'
'\x1a'
'\x1b'
'\x1c'
'\x1d'
'\x1e'
'\x1f'
' '
'!'
'"'
'#'
'$'
'%'
'&'
"'"
'('
')'
'*'
'+'
','
'-'
'.'
'/'
'0'
'1'
'2'
'3'
'4'
'5'
'6'
'7'
'8'
'9'
':'
';'
'<'
'='
'>'
'?'
'@'
'A'
'B'
'C'
'D'
'E'
'F'
'G'
'H'
'I'
'J'
'K'
'L'
'M'
'N'
'O'
'P'
'Q'
'R'
'S'
'T'
'U'
'V'
'W'
'X'
'Y'
'Z'
'['
'\\'
']'
'^'
'_'
'`'
'a'
'b'
'c'
'd'
'e'
'f'
'g'
'h'
'i'
'j'
'k'
'l'
'm'
'n'
'o'
'p'
'q'
'r'
's'
't'
'u'
'v'
'w'
'x'
'y'
'z'
'{'
'|'
'}'
'~'
'\x7f'
'\uf780'
'\uf781'
'\uf782'
'\uf783'
'\uf784'
'\uf785'
'\uf786'
'\uf787'
'\uf788'
'\uf789'
'\uf78a'
'\uf78b'
'\uf78c'
'\uf78d'
'\uf78e'
'\uf78f'
'\uf790'
'\uf791'
'\uf792'
'\uf793'
'\uf794'
'\uf795'
'\uf796'
'\uf797'
'\uf798'
'\uf799'
'\uf79a'
'\uf79b'
'\uf79c'
'\uf79d'
'\uf79e'
'\uf79f'
'\uf7a0'
'\uf7a1'
'\uf7a2'
'\uf7a3'
'\uf7a4'
'\uf7a5'
'\uf7a6'
'\uf7a7'
'\uf7a8'
'\uf7a9'
'\uf7aa'
'\uf7ab'
'\uf7ac'
'\uf7ad'
'\uf7ae'
'\uf7af'
'\uf7b0'
'\uf7b1'
'\uf7b2'
'\uf7b3'
'\uf7b4'
'\uf7b5'
'\uf7b6'
'\uf7b7'
'\uf7b8'
'\uf7b9'
'\uf7ba'
'\uf7bb'
'\uf7bc'
'\uf7bd'
'\uf7be'
'\uf7bf'
'\uf7c0'
'\uf7c1'
'\uf7c2'
'\uf7c3'
'\uf7c4'
'\uf7c5'
'\uf7c6'
'\uf7c7'
'\uf7c8'
'\uf7c9'
'\uf7ca'
'\uf7cb'
'\uf7cc'
'\uf7cd'
'\uf7ce'
'\uf7cf'
'\uf7d0'
'\uf7d1'
'\uf7d2'
'\uf7d3'
'\uf7d4'
'\uf7d5'
'\uf7d6'
'\uf7d7'
'\uf7d8'
'\uf7d9'
'\uf7da'
'\uf7db'
'\uf7dc'
'\uf7dd'
'\uf7de'
'\uf7df'
'\uf7e0'
'\uf7e1'
'\uf7e2'
'\uf7e3'
'\uf7e4'
'\uf7e5'
'\uf7e6'
'\uf7e7'
'\uf7e8'
'\uf7e9'
'\uf7ea'
'\uf7eb'
'\uf7ec'
'\uf7ed'
'\uf7ee'
'\uf7ef'
'\uf7f0'
'\uf7f1'
'\uf7f2'
'\uf7f3'
'\uf7f4'
'\uf7f5'
'\uf7f6'
'\uf7f7'
'\uf7f8'
'\uf7f9'
'\uf7fa'
'\uf7fb'
'\uf7fc'
'\uf7fd'
'\uf7fe'
'\uf7ff'
)
### Encoding table
encoding_table = codecs.charmap_build(decoding_table)