Mirror of https://github.com/SickGear/SickGear.git, synced 2024-12-01 08:53:37 +00:00
Merge pull request #899 from JackDandy/feature/AddWebencodings
Add webencodings 0.5 (3970651) to assist parsing legacy web content.
This commit is contained in: commit e75a5d7c34
6 changed files with 1111 additions and 0 deletions
@@ -6,6 +6,7 @@
 * Change improve page load time when loading images
 * Update isotope library 2.2.2 to 3.0.1
 * Add lazyload package 3.0.0 (2e318b1)
+* Add webencodings 0.5 (3970651) to assist parsing legacy web content
 * Change improve add show search results by comparing search term to an additional unidecoded result set
 * Change webserver startup to correctly use xheaders in reverse proxy or load balance set-ups
 * Update backports_abc 0.4 to 0.5
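The library added by this commit exposes a small API (decode, encode, lookup, plus streaming variants) built around the WHATWG label table. A minimal usage sketch follows, assuming the package is importable under its upstream name webencodings (it is vendored here under lib/); every call shown is defined in the files below:

# Minimal usage sketch; all functions shown are defined in lib/webencodings below.
from webencodings import decode, encode, lookup

# Label matching is ASCII case-insensitive and follows the WHATWG table,
# so legacy labels such as 'latin1' resolve to windows-1252.
assert lookup('Latin1').name == 'windows-1252'

# decode() honours a BOM when present, otherwise uses the fallback label.
text, encoding = decode(b'\xef\xbb\xbfcaf\xc3\xa9', fallback_encoding='latin1')
assert (text, encoding.name) == ('café', 'utf-8')

# encode() defaults to UTF-8, the only encoding recommended for new content.
assert encode('café') == b'caf\xc3\xa9'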
lib/webencodings/__init__.py (new file, 342 lines)
@@ -0,0 +1,342 @@
# coding: utf8
"""

    webencodings
    ~~~~~~~~~~~~

    This is a Python implementation of the `WHATWG Encoding standard
    <http://encoding.spec.whatwg.org/>`_. See README for details.

    :copyright: Copyright 2012 by Simon Sapin
    :license: BSD, see LICENSE for details.

"""

from __future__ import unicode_literals

import codecs

from .labels import LABELS


VERSION = '0.5'


# Some names in Encoding are not valid Python aliases. Remap these.
PYTHON_NAMES = {
    'iso-8859-8-i': 'iso-8859-8',
    'x-mac-cyrillic': 'mac-cyrillic',
    'macintosh': 'mac-roman',
    'windows-874': 'cp874'}

CACHE = {}


def ascii_lower(string):
    r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.

    :param string: A Unicode string.
    :returns: A new Unicode string.

    This is used for `ASCII case-insensitive
    <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
    matching of encoding labels.
    The same matching is also used, among other things,
    for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.

    This is different from the :meth:`~py:str.lower` method of Unicode strings
    which also affects non-ASCII characters,
    sometimes mapping them into the ASCII range:

        >>> keyword = u'Bac\N{KELVIN SIGN}ground'
        >>> assert keyword.lower() == u'background'
        >>> assert ascii_lower(keyword) != keyword.lower()
        >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'

    """
    # This turns out to be faster than unicode.translate()
    return string.encode('utf8').lower().decode('utf8')


def lookup(label):
    """
    Look for an encoding by its label.
    This is the spec’s `get an encoding
    <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
    Supported labels are listed there.

    :param label: A string.
    :returns:
        An :class:`Encoding` object, or :obj:`None` for an unknown label.

    """
    # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020.
    label = ascii_lower(label.strip('\t\n\f\r '))
    name = LABELS.get(label)
    if name is None:
        return None
    encoding = CACHE.get(name)
    if encoding is None:
        if name == 'x-user-defined':
            from .x_user_defined import codec_info
        else:
            python_name = PYTHON_NAMES.get(name, name)
            # Any python_name value that gets to here should be valid.
            codec_info = codecs.lookup(python_name)
        encoding = Encoding(name, codec_info)
        CACHE[name] = encoding
    return encoding


def _get_encoding(encoding_or_label):
    """
    Accept either an encoding object or label.

    :param encoding: An :class:`Encoding` object or a label string.
    :returns: An :class:`Encoding` object.
    :raises: :exc:`~exceptions.LookupError` for an unknown label.

    """
    if hasattr(encoding_or_label, 'codec_info'):
        return encoding_or_label

    encoding = lookup(encoding_or_label)
    if encoding is None:
        raise LookupError('Unknown encoding label: %r' % encoding_or_label)
    return encoding


class Encoding(object):
    """Represents a character encoding such as UTF-8,
    that can be used for decoding or encoding.

    .. attribute:: name

        Canonical name of the encoding

    .. attribute:: codec_info

        The actual implementation of the encoding,
        a stdlib :class:`~codecs.CodecInfo` object.
        See :func:`codecs.register`.

    """
    def __init__(self, name, codec_info):
        self.name = name
        self.codec_info = codec_info

    def __repr__(self):
        return '<Encoding %s>' % self.name


#: The UTF-8 encoding. Should be used for new content and formats.
UTF8 = lookup('utf-8')

_UTF16LE = lookup('utf-16le')
_UTF16BE = lookup('utf-16be')


def decode(input, fallback_encoding, errors='replace'):
    """
    Decode a single string.

    :param input: A byte string
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return:
        A ``(output, encoding)`` tuple of a Unicode string
        and an :obj:`Encoding`.

    """
    # Fail early if `encoding` is an invalid label.
    fallback_encoding = _get_encoding(fallback_encoding)
    bom_encoding, input = _detect_bom(input)
    encoding = bom_encoding or fallback_encoding
    return encoding.codec_info.decode(input, errors)[0], encoding


def _detect_bom(input):
    """Return (bom_encoding, input), with any BOM removed from the input."""
    if input.startswith(b'\xFF\xFE'):
        return _UTF16LE, input[2:]
    if input.startswith(b'\xFE\xFF'):
        return _UTF16BE, input[2:]
    if input.startswith(b'\xEF\xBB\xBF'):
        return UTF8, input[3:]
    return None, input


def encode(input, encoding=UTF8, errors='strict'):
    """
    Encode a single string.

    :param input: A Unicode string.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return: A byte string.

    """
    return _get_encoding(encoding).codec_info.encode(input, errors)[0]


def iter_decode(input, fallback_encoding, errors='replace'):
    """
    "Pull"-based decoder.

    :param input:
        An iterable of byte strings.

        The input is first consumed just enough to determine the encoding
        based on the presence of a BOM,
        then consumed on demand when the return value is.
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns:
        An ``(output, encoding)`` tuple.
        :obj:`output` is an iterable of Unicode strings,
        :obj:`encoding` is the :obj:`Encoding` that is being used.

    """

    decoder = IncrementalDecoder(fallback_encoding, errors)
    generator = _iter_decode_generator(input, decoder)
    encoding = next(generator)
    return generator, encoding


def _iter_decode_generator(input, decoder):
    """Return a generator that first yields the :obj:`Encoding`,
    then yields output chunks as Unicode strings.

    """
    decode = decoder.decode
    input = iter(input)
    for chunk in input:
        output = decode(chunk)
        if output:
            assert decoder.encoding is not None
            yield decoder.encoding
            yield output
            break
    else:
        # Input exhausted without determining the encoding
        output = decode(b'', final=True)
        assert decoder.encoding is not None
        yield decoder.encoding
        if output:
            yield output
        return

    for chunk in input:
        output = decode(chunk)
        if output:
            yield output
    output = decode(b'', final=True)
    if output:
        yield output


def iter_encode(input, encoding=UTF8, errors='strict'):
    """
    “Pull”-based encoder.

    :param input: An iterable of Unicode strings.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns: An iterable of byte strings.

    """
    # Fail early if `encoding` is an invalid label.
    encode = IncrementalEncoder(encoding, errors).encode
    return _iter_encode_generator(input, encode)


def _iter_encode_generator(input, encode):
    for chunk in input:
        output = encode(chunk)
        if output:
            yield output
    output = encode('', final=True)
    if output:
        yield output


class IncrementalDecoder(object):
    """
    “Push”-based decoder.

    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    """
    def __init__(self, fallback_encoding, errors='replace'):
        # Fail early if `encoding` is an invalid label.
        self._fallback_encoding = _get_encoding(fallback_encoding)
        self._errors = errors
        self._buffer = b''
        self._decoder = None
        #: The actual :class:`Encoding` that is being used,
        #: or :obj:`None` if that is not determined yet.
        #: (I.e. if there is not enough input yet to determine
        #: if there is a BOM.)
        self.encoding = None  # Not known yet.

    def decode(self, input, final=False):
        """Decode one chunk of the input.

        :param input: A byte string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A Unicode string.

        """
        decoder = self._decoder
        if decoder is not None:
            return decoder(input, final)

        input = self._buffer + input
        encoding, input = _detect_bom(input)
        if encoding is None:
            if len(input) < 3 and not final:  # Not enough data yet.
                self._buffer = input
                return ''
            else:  # No BOM
                encoding = self._fallback_encoding
        decoder = encoding.codec_info.incrementaldecoder(self._errors).decode
        self._decoder = decoder
        self.encoding = encoding
        return decoder(input, final)


class IncrementalEncoder(object):
    """
    “Push”-based encoder.

    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    .. method:: encode(input, final=False)

        :param input: A Unicode string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A byte string.

    """
    def __init__(self, encoding=UTF8, errors='strict'):
        encoding = _get_encoding(encoding)
        self.encode = encoding.codec_info.incrementalencoder(errors).encode
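For completeness, a short sketch of the streaming API defined above: iter_decode() buffers just enough input to detect a BOM split across chunks, and IncrementalDecoder falls back to the given label once it knows no BOM is coming. The values are illustrative only, and the import path again assumes the upstream package name:

# Illustrative sketch of the streaming API from __init__.py above.
from webencodings import iter_decode, IncrementalDecoder

chunks = [b'', b'\xef', b'\xbb\xbf\xc3', b'\xa9']  # UTF-8 BOM split across chunks
output, encoding = iter_decode(chunks, fallback_encoding='latin1')
assert encoding.name == 'utf-8'
assert ''.join(output) == 'é'

# The "push"-based equivalent feeds bytes as they arrive.
decoder = IncrementalDecoder(fallback_encoding='latin1')
assert decoder.decode(b'\xc3') == ''                # not enough input to rule out a BOM
assert decoder.decode(b'\xa9', final=True) == 'Ã©'  # no BOM, so the windows-1252 fallback applies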
lib/webencodings/labels.py (new file, 231 lines)
@@ -0,0 +1,231 @@
"""

    webencodings.labels
    ~~~~~~~~~~~~~~~~~~~

    Map encoding labels to their name.

    :copyright: Copyright 2012 by Simon Sapin
    :license: BSD, see LICENSE for details.

"""

# XXX Do not edit!
# This file is automatically generated by mklabels.py

LABELS = {
    'unicode-1-1-utf-8': 'utf-8',
    'utf-8': 'utf-8',
    'utf8': 'utf-8',
    '866': 'ibm866',
    'cp866': 'ibm866',
    'csibm866': 'ibm866',
    'ibm866': 'ibm866',
    'csisolatin2': 'iso-8859-2',
    'iso-8859-2': 'iso-8859-2',
    'iso-ir-101': 'iso-8859-2',
    'iso8859-2': 'iso-8859-2',
    'iso88592': 'iso-8859-2',
    'iso_8859-2': 'iso-8859-2',
    'iso_8859-2:1987': 'iso-8859-2',
    'l2': 'iso-8859-2',
    'latin2': 'iso-8859-2',
    'csisolatin3': 'iso-8859-3',
    'iso-8859-3': 'iso-8859-3',
    'iso-ir-109': 'iso-8859-3',
    'iso8859-3': 'iso-8859-3',
    'iso88593': 'iso-8859-3',
    'iso_8859-3': 'iso-8859-3',
    'iso_8859-3:1988': 'iso-8859-3',
    'l3': 'iso-8859-3',
    'latin3': 'iso-8859-3',
    'csisolatin4': 'iso-8859-4',
    'iso-8859-4': 'iso-8859-4',
    'iso-ir-110': 'iso-8859-4',
    'iso8859-4': 'iso-8859-4',
    'iso88594': 'iso-8859-4',
    'iso_8859-4': 'iso-8859-4',
    'iso_8859-4:1988': 'iso-8859-4',
    'l4': 'iso-8859-4',
    'latin4': 'iso-8859-4',
    'csisolatincyrillic': 'iso-8859-5',
    'cyrillic': 'iso-8859-5',
    'iso-8859-5': 'iso-8859-5',
    'iso-ir-144': 'iso-8859-5',
    'iso8859-5': 'iso-8859-5',
    'iso88595': 'iso-8859-5',
    'iso_8859-5': 'iso-8859-5',
    'iso_8859-5:1988': 'iso-8859-5',
    'arabic': 'iso-8859-6',
    'asmo-708': 'iso-8859-6',
    'csiso88596e': 'iso-8859-6',
    'csiso88596i': 'iso-8859-6',
    'csisolatinarabic': 'iso-8859-6',
    'ecma-114': 'iso-8859-6',
    'iso-8859-6': 'iso-8859-6',
    'iso-8859-6-e': 'iso-8859-6',
    'iso-8859-6-i': 'iso-8859-6',
    'iso-ir-127': 'iso-8859-6',
    'iso8859-6': 'iso-8859-6',
    'iso88596': 'iso-8859-6',
    'iso_8859-6': 'iso-8859-6',
    'iso_8859-6:1987': 'iso-8859-6',
    'csisolatingreek': 'iso-8859-7',
    'ecma-118': 'iso-8859-7',
    'elot_928': 'iso-8859-7',
    'greek': 'iso-8859-7',
    'greek8': 'iso-8859-7',
    'iso-8859-7': 'iso-8859-7',
    'iso-ir-126': 'iso-8859-7',
    'iso8859-7': 'iso-8859-7',
    'iso88597': 'iso-8859-7',
    'iso_8859-7': 'iso-8859-7',
    'iso_8859-7:1987': 'iso-8859-7',
    'sun_eu_greek': 'iso-8859-7',
    'csiso88598e': 'iso-8859-8',
    'csisolatinhebrew': 'iso-8859-8',
    'hebrew': 'iso-8859-8',
    'iso-8859-8': 'iso-8859-8',
    'iso-8859-8-e': 'iso-8859-8',
    'iso-ir-138': 'iso-8859-8',
    'iso8859-8': 'iso-8859-8',
    'iso88598': 'iso-8859-8',
    'iso_8859-8': 'iso-8859-8',
    'iso_8859-8:1988': 'iso-8859-8',
    'visual': 'iso-8859-8',
    'csiso88598i': 'iso-8859-8-i',
    'iso-8859-8-i': 'iso-8859-8-i',
    'logical': 'iso-8859-8-i',
    'csisolatin6': 'iso-8859-10',
    'iso-8859-10': 'iso-8859-10',
    'iso-ir-157': 'iso-8859-10',
    'iso8859-10': 'iso-8859-10',
    'iso885910': 'iso-8859-10',
    'l6': 'iso-8859-10',
    'latin6': 'iso-8859-10',
    'iso-8859-13': 'iso-8859-13',
    'iso8859-13': 'iso-8859-13',
    'iso885913': 'iso-8859-13',
    'iso-8859-14': 'iso-8859-14',
    'iso8859-14': 'iso-8859-14',
    'iso885914': 'iso-8859-14',
    'csisolatin9': 'iso-8859-15',
    'iso-8859-15': 'iso-8859-15',
    'iso8859-15': 'iso-8859-15',
    'iso885915': 'iso-8859-15',
    'iso_8859-15': 'iso-8859-15',
    'l9': 'iso-8859-15',
    'iso-8859-16': 'iso-8859-16',
    'cskoi8r': 'koi8-r',
    'koi': 'koi8-r',
    'koi8': 'koi8-r',
    'koi8-r': 'koi8-r',
    'koi8_r': 'koi8-r',
    'koi8-u': 'koi8-u',
    'csmacintosh': 'macintosh',
    'mac': 'macintosh',
    'macintosh': 'macintosh',
    'x-mac-roman': 'macintosh',
    'dos-874': 'windows-874',
    'iso-8859-11': 'windows-874',
    'iso8859-11': 'windows-874',
    'iso885911': 'windows-874',
    'tis-620': 'windows-874',
    'windows-874': 'windows-874',
    'cp1250': 'windows-1250',
    'windows-1250': 'windows-1250',
    'x-cp1250': 'windows-1250',
    'cp1251': 'windows-1251',
    'windows-1251': 'windows-1251',
    'x-cp1251': 'windows-1251',
    'ansi_x3.4-1968': 'windows-1252',
    'ascii': 'windows-1252',
    'cp1252': 'windows-1252',
    'cp819': 'windows-1252',
    'csisolatin1': 'windows-1252',
    'ibm819': 'windows-1252',
    'iso-8859-1': 'windows-1252',
    'iso-ir-100': 'windows-1252',
    'iso8859-1': 'windows-1252',
    'iso88591': 'windows-1252',
    'iso_8859-1': 'windows-1252',
    'iso_8859-1:1987': 'windows-1252',
    'l1': 'windows-1252',
    'latin1': 'windows-1252',
    'us-ascii': 'windows-1252',
    'windows-1252': 'windows-1252',
    'x-cp1252': 'windows-1252',
    'cp1253': 'windows-1253',
    'windows-1253': 'windows-1253',
    'x-cp1253': 'windows-1253',
    'cp1254': 'windows-1254',
    'csisolatin5': 'windows-1254',
    'iso-8859-9': 'windows-1254',
    'iso-ir-148': 'windows-1254',
    'iso8859-9': 'windows-1254',
    'iso88599': 'windows-1254',
    'iso_8859-9': 'windows-1254',
    'iso_8859-9:1989': 'windows-1254',
    'l5': 'windows-1254',
    'latin5': 'windows-1254',
    'windows-1254': 'windows-1254',
    'x-cp1254': 'windows-1254',
    'cp1255': 'windows-1255',
    'windows-1255': 'windows-1255',
    'x-cp1255': 'windows-1255',
    'cp1256': 'windows-1256',
    'windows-1256': 'windows-1256',
    'x-cp1256': 'windows-1256',
    'cp1257': 'windows-1257',
    'windows-1257': 'windows-1257',
    'x-cp1257': 'windows-1257',
    'cp1258': 'windows-1258',
    'windows-1258': 'windows-1258',
    'x-cp1258': 'windows-1258',
    'x-mac-cyrillic': 'x-mac-cyrillic',
    'x-mac-ukrainian': 'x-mac-cyrillic',
    'chinese': 'gbk',
    'csgb2312': 'gbk',
    'csiso58gb231280': 'gbk',
    'gb2312': 'gbk',
    'gb_2312': 'gbk',
    'gb_2312-80': 'gbk',
    'gbk': 'gbk',
    'iso-ir-58': 'gbk',
    'x-gbk': 'gbk',
    'gb18030': 'gb18030',
    'hz-gb-2312': 'hz-gb-2312',
    'big5': 'big5',
    'big5-hkscs': 'big5',
    'cn-big5': 'big5',
    'csbig5': 'big5',
    'x-x-big5': 'big5',
    'cseucpkdfmtjapanese': 'euc-jp',
    'euc-jp': 'euc-jp',
    'x-euc-jp': 'euc-jp',
    'csiso2022jp': 'iso-2022-jp',
    'iso-2022-jp': 'iso-2022-jp',
    'csshiftjis': 'shift_jis',
    'ms_kanji': 'shift_jis',
    'shift-jis': 'shift_jis',
    'shift_jis': 'shift_jis',
    'sjis': 'shift_jis',
    'windows-31j': 'shift_jis',
    'x-sjis': 'shift_jis',
    'cseuckr': 'euc-kr',
    'csksc56011987': 'euc-kr',
    'euc-kr': 'euc-kr',
    'iso-ir-149': 'euc-kr',
    'korean': 'euc-kr',
    'ks_c_5601-1987': 'euc-kr',
    'ks_c_5601-1989': 'euc-kr',
    'ksc5601': 'euc-kr',
    'ksc_5601': 'euc-kr',
    'windows-949': 'euc-kr',
    'csiso2022kr': 'iso-2022-kr',
    'iso-2022-kr': 'iso-2022-kr',
    'utf-16be': 'utf-16be',
    'utf-16': 'utf-16le',
    'utf-16le': 'utf-16le',
    'x-user-defined': 'x-user-defined',
}
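The table above is generated from the WHATWG encodings.json and is what makes lookup() treat legacy labels as aliases of one canonical name. A few spot checks against the entries listed above:

# Spot checks against entries in the LABELS table above.
from webencodings.labels import LABELS

assert LABELS['ascii'] == 'windows-1252'   # the spec folds ascii/latin1 into windows-1252
assert LABELS['latin1'] == 'windows-1252'
assert LABELS['sjis'] == 'shift_jis'
assert len(set(LABELS.values())) < len(LABELS)  # many labels, far fewer encodings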
lib/webencodings/mklabels.py (new file, 59 lines)
@@ -0,0 +1,59 @@
"""

    webencodings.mklabels
    ~~~~~~~~~~~~~~~~~~~~~

    Regenerate the webencodings.labels module.

    :copyright: Copyright 2012 by Simon Sapin
    :license: BSD, see LICENSE for details.

"""

import json
try:
    from urllib import urlopen
except ImportError:
    from urllib.request import urlopen


def assert_lower(string):
    assert string == string.lower()
    return string


def generate(url):
    parts = ['''\
"""

    webencodings.labels
    ~~~~~~~~~~~~~~~~~~~

    Map encoding labels to their name.

    :copyright: Copyright 2012 by Simon Sapin
    :license: BSD, see LICENSE for details.

"""

# XXX Do not edit!
# This file is automatically generated by mklabels.py

LABELS = {
''']
    labels = [
        (repr(assert_lower(label)).lstrip('u'),
         repr(encoding['name']).lstrip('u'))
        for category in json.loads(urlopen(url).read().decode('ascii'))
        for encoding in category['encodings']
        for label in encoding['labels']]
    max_len = max(len(label) for label, name in labels)
    parts.extend(
        '    %s:%s %s,\n' % (label, ' ' * (max_len - len(label)), name)
        for label, name in labels)
    parts.append('}')
    return ''.join(parts)


if __name__ == '__main__':
    print(generate('http://encoding.spec.whatwg.org/encodings.json'))
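mklabels.generate() returns the full source text of labels.py built from the spec's encodings.json. A hypothetical regeneration step (writing the result to the path used by this commit) could look like the following sketch:

# Hypothetical regeneration step; generate() only returns the source text,
# writing it to disk is left to the caller.
from webencodings import mklabels

source = mklabels.generate('http://encoding.spec.whatwg.org/encodings.json')
with open('lib/webencodings/labels.py', 'w') as fh:
    fh.write(source)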
lib/webencodings/tests.py (new file, 153 lines)
@@ -0,0 +1,153 @@
# coding: utf8
"""

    webencodings.tests
    ~~~~~~~~~~~~~~~~~~

    A basic test suite for Encoding.

    :copyright: Copyright 2012 by Simon Sapin
    :license: BSD, see LICENSE for details.

"""

from __future__ import unicode_literals

from . import (lookup, LABELS, decode, encode, iter_decode, iter_encode,
               IncrementalDecoder, IncrementalEncoder, UTF8)


def assert_raises(exception, function, *args, **kwargs):
    try:
        function(*args, **kwargs)
    except exception:
        return
    else:  # pragma: no cover
        raise AssertionError('Did not raise %s.' % exception)


def test_labels():
    assert lookup('utf-8').name == 'utf-8'
    assert lookup('Utf-8').name == 'utf-8'
    assert lookup('UTF-8').name == 'utf-8'
    assert lookup('utf8').name == 'utf-8'
    assert lookup('utf8').name == 'utf-8'
    assert lookup('utf8 ').name == 'utf-8'
    assert lookup(' \r\nutf8\t').name == 'utf-8'
    assert lookup('u8') is None  # Python label.
    assert lookup('utf-8\u00a0') is None  # Non-ASCII white space.

    assert lookup('US-ASCII').name == 'windows-1252'
    assert lookup('iso-8859-1').name == 'windows-1252'
    assert lookup('latin1').name == 'windows-1252'
    assert lookup('LATIN1').name == 'windows-1252'
    assert lookup('latin-1') is None
    assert lookup('LATİN1') is None  # ASCII-only case insensitivity.


def test_all_labels():
    for label in LABELS:
        assert decode(b'', label) == ('', lookup(label))
        assert encode('', label) == b''
        for repeat in [0, 1, 12]:
            output, _ = iter_decode([b''] * repeat, label)
            assert list(output) == []
            assert list(iter_encode([''] * repeat, label)) == []
        decoder = IncrementalDecoder(label)
        assert decoder.decode(b'') == ''
        assert decoder.decode(b'', final=True) == ''
        encoder = IncrementalEncoder(label)
        assert encoder.encode('') == b''
        assert encoder.encode('', final=True) == b''
    # All encoding names are valid labels too:
    for name in set(LABELS.values()):
        assert lookup(name).name == name


def test_invalid_label():
    assert_raises(LookupError, decode, b'\xEF\xBB\xBF\xc3\xa9', 'invalid')
    assert_raises(LookupError, encode, 'é', 'invalid')
    assert_raises(LookupError, iter_decode, [], 'invalid')
    assert_raises(LookupError, iter_encode, [], 'invalid')
    assert_raises(LookupError, IncrementalDecoder, 'invalid')
    assert_raises(LookupError, IncrementalEncoder, 'invalid')


def test_decode():
    assert decode(b'\x80', 'latin1') == ('€', lookup('latin1'))
    assert decode(b'\x80', lookup('latin1')) == ('€', lookup('latin1'))
    assert decode(b'\xc3\xa9', 'utf8') == ('é', lookup('utf8'))
    assert decode(b'\xc3\xa9', UTF8) == ('é', lookup('utf8'))
    assert decode(b'\xc3\xa9', 'ascii') == ('Ã©', lookup('ascii'))
    assert decode(b'\xEF\xBB\xBF\xc3\xa9', 'ascii') == ('é', lookup('utf8'))  # UTF-8 with BOM

    assert decode(b'\xFE\xFF\x00\xe9', 'ascii') == ('é', lookup('utf-16be'))  # UTF-16-BE with BOM
    assert decode(b'\xFF\xFE\xe9\x00', 'ascii') == ('é', lookup('utf-16le'))  # UTF-16-LE with BOM
    assert decode(b'\xFE\xFF\xe9\x00', 'ascii') == ('\ue900', lookup('utf-16be'))
    assert decode(b'\xFF\xFE\x00\xe9', 'ascii') == ('\ue900', lookup('utf-16le'))

    assert decode(b'\x00\xe9', 'UTF-16BE') == ('é', lookup('utf-16be'))
    assert decode(b'\xe9\x00', 'UTF-16LE') == ('é', lookup('utf-16le'))
    assert decode(b'\xe9\x00', 'UTF-16') == ('é', lookup('utf-16le'))

    assert decode(b'\xe9\x00', 'UTF-16BE') == ('\ue900', lookup('utf-16be'))
    assert decode(b'\x00\xe9', 'UTF-16LE') == ('\ue900', lookup('utf-16le'))
    assert decode(b'\x00\xe9', 'UTF-16') == ('\ue900', lookup('utf-16le'))


def test_encode():
    assert encode('é', 'latin1') == b'\xe9'
    assert encode('é', 'utf8') == b'\xc3\xa9'
    assert encode('é', 'utf8') == b'\xc3\xa9'
    assert encode('é', 'utf-16') == b'\xe9\x00'
    assert encode('é', 'utf-16le') == b'\xe9\x00'
    assert encode('é', 'utf-16be') == b'\x00\xe9'


def test_iter_decode():
    def iter_decode_to_string(input, fallback_encoding):
        output, _encoding = iter_decode(input, fallback_encoding)
        return ''.join(output)
    assert iter_decode_to_string([], 'latin1') == ''
    assert iter_decode_to_string([b''], 'latin1') == ''
    assert iter_decode_to_string([b'\xe9'], 'latin1') == 'é'
    assert iter_decode_to_string([b'hello'], 'latin1') == 'hello'
    assert iter_decode_to_string([b'he', b'llo'], 'latin1') == 'hello'
    assert iter_decode_to_string([b'hell', b'o'], 'latin1') == 'hello'
    assert iter_decode_to_string([b'\xc3\xa9'], 'latin1') == 'Ã©'
    assert iter_decode_to_string([b'\xEF\xBB\xBF\xc3\xa9'], 'latin1') == 'é'
    assert iter_decode_to_string([
        b'\xEF\xBB\xBF', b'\xc3', b'\xa9'], 'latin1') == 'é'
    assert iter_decode_to_string([
        b'\xEF\xBB\xBF', b'a', b'\xc3'], 'latin1') == 'a\uFFFD'
    assert iter_decode_to_string([
        b'', b'\xEF', b'', b'', b'\xBB\xBF\xc3', b'\xa9'], 'latin1') == 'é'
    assert iter_decode_to_string([b'\xEF\xBB\xBF'], 'latin1') == ''
    assert iter_decode_to_string([b'\xEF\xBB'], 'latin1') == 'ï»'
    assert iter_decode_to_string([b'\xFE\xFF\x00\xe9'], 'latin1') == 'é'
    assert iter_decode_to_string([b'\xFF\xFE\xe9\x00'], 'latin1') == 'é'
    assert iter_decode_to_string([
        b'', b'\xFF', b'', b'', b'\xFE\xe9', b'\x00'], 'latin1') == 'é'
    assert iter_decode_to_string([
        b'', b'h\xe9', b'llo'], 'x-user-defined') == 'h\uF7E9llo'


def test_iter_encode():
    assert b''.join(iter_encode([], 'latin1')) == b''
    assert b''.join(iter_encode([''], 'latin1')) == b''
    assert b''.join(iter_encode(['é'], 'latin1')) == b'\xe9'
    assert b''.join(iter_encode(['', 'é', '', ''], 'latin1')) == b'\xe9'
    assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16')) == b'\xe9\x00'
    assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16le')) == b'\xe9\x00'
    assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16be')) == b'\x00\xe9'
    assert b''.join(iter_encode([
        '', 'h\uF7E9', '', 'llo'], 'x-user-defined')) == b'h\xe9llo'


def test_x_user_defined():
    encoded = b'2,\x0c\x0b\x1aO\xd9#\xcb\x0f\xc9\xbbt\xcf\xa8\xca'
    decoded = '2,\x0c\x0b\x1aO\uf7d9#\uf7cb\x0f\uf7c9\uf7bbt\uf7cf\uf7a8\uf7ca'
    assert decode(encoded, 'x-user-defined') == (decoded, lookup('x-user-defined'))
    assert encode(decoded, 'x-user-defined') == encoded

    encoded = b'aa'
    decoded = 'aa'
    assert decode(encoded, 'x-user-defined') == (decoded, lookup('x-user-defined'))
    assert encode(decoded, 'x-user-defined') == encoded
lib/webencodings/x_user_defined.py (new file, 325 lines)
@@ -0,0 +1,325 @@
# coding: utf8
"""

    webencodings.x_user_defined
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~

    An implementation of the x-user-defined encoding.

    :copyright: Copyright 2012 by Simon Sapin
    :license: BSD, see LICENSE for details.

"""

from __future__ import unicode_literals

import codecs


### Codec APIs

class Codec(codecs.Codec):

    def encode(self, input, errors='strict'):
        return codecs.charmap_encode(input, errors, encoding_table)

    def decode(self, input, errors='strict'):
        return codecs.charmap_decode(input, errors, decoding_table)


class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input, self.errors, encoding_table)[0]


class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input, self.errors, decoding_table)[0]


class StreamWriter(Codec, codecs.StreamWriter):
    pass


class StreamReader(Codec, codecs.StreamReader):
    pass


### encodings module API

codec_info = codecs.CodecInfo(
    name='x-user-defined',
    encode=Codec().encode,
    decode=Codec().decode,
    incrementalencoder=IncrementalEncoder,
    incrementaldecoder=IncrementalDecoder,
    streamreader=StreamReader,
    streamwriter=StreamWriter,
)


### Decoding Table

# Python 3:
# for c in range(256): print('    %r' % chr(c if c < 128 else c + 0xF700))
decoding_table = (
    '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f'
    '\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f'
    ' !"#$%&\'()*+,-./'
    '0123456789:;<=>?'
    '@ABCDEFGHIJKLMNO'
    'PQRSTUVWXYZ[\\]^_'
    '`abcdefghijklmno'
    'pqrstuvwxyz{|}~\x7f'
    '\uf780\uf781\uf782\uf783\uf784\uf785\uf786\uf787'
    '\uf788\uf789\uf78a\uf78b\uf78c\uf78d\uf78e\uf78f'
    '\uf790\uf791\uf792\uf793\uf794\uf795\uf796\uf797'
    '\uf798\uf799\uf79a\uf79b\uf79c\uf79d\uf79e\uf79f'
    '\uf7a0\uf7a1\uf7a2\uf7a3\uf7a4\uf7a5\uf7a6\uf7a7'
    '\uf7a8\uf7a9\uf7aa\uf7ab\uf7ac\uf7ad\uf7ae\uf7af'
    '\uf7b0\uf7b1\uf7b2\uf7b3\uf7b4\uf7b5\uf7b6\uf7b7'
    '\uf7b8\uf7b9\uf7ba\uf7bb\uf7bc\uf7bd\uf7be\uf7bf'
    '\uf7c0\uf7c1\uf7c2\uf7c3\uf7c4\uf7c5\uf7c6\uf7c7'
    '\uf7c8\uf7c9\uf7ca\uf7cb\uf7cc\uf7cd\uf7ce\uf7cf'
    '\uf7d0\uf7d1\uf7d2\uf7d3\uf7d4\uf7d5\uf7d6\uf7d7'
    '\uf7d8\uf7d9\uf7da\uf7db\uf7dc\uf7dd\uf7de\uf7df'
    '\uf7e0\uf7e1\uf7e2\uf7e3\uf7e4\uf7e5\uf7e6\uf7e7'
    '\uf7e8\uf7e9\uf7ea\uf7eb\uf7ec\uf7ed\uf7ee\uf7ef'
    '\uf7f0\uf7f1\uf7f2\uf7f3\uf7f4\uf7f5\uf7f6\uf7f7'
    '\uf7f8\uf7f9\uf7fa\uf7fb\uf7fc\uf7fd\uf7fe\uf7ff'
)

### Encoding table
encoding_table = codecs.charmap_build(decoding_table)
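Because the decoding table maps bytes 0x00-0x7F to themselves and 0x80-0xFF to the private-use range U+F780-U+F7FF, x-user-defined round-trips arbitrary binary data. A small illustrative check using only the API added in this commit:

# Illustrative round trip through the x-user-defined codec defined above.
from webencodings import decode, encode

raw = bytes(bytearray(range(256)))
text, encoding = decode(raw, 'x-user-defined')
assert encoding.name == 'x-user-defined'
assert text[0x41] == 'A' and text[0x80] == '\uf780'
assert encode(text, 'x-user-defined') == raw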