Update feedparser library 5.2.1 (f1dd1bb) → 5.2.1 (5646f4c) - Uses the faster cchardet if installed.

2025-01-05 17:43:37 +00:00 · 2018-03-26 19:35:48 +01:00 · 2018-03-26 19:35:48 +01:00 · 8fe34fb5eb
commit 8fe34fb5eb
parent 54129c519c
5 changed files with 67 additions and 19 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -4,6 +4,7 @@
 * Update cachecontrol library 0.12.3 (db54c40) to 0.12.4 (bd94f7e)
 * Update chardet packages 3.0.4 (9b8c5c2) to 4.0.0 (b3d867a)
 * Update dateutil library 2.6.1 (2f3a160) to 2.7.2 (ff03c0f)
+* Update feedparser library 5.2.1 (f1dd1bb) to 5.2.1 (5646f4c) - Uses the faster cchardet if installed

 [develop changelog]

--- a/lib/feedparser/__init__.py
+++ b/lib/feedparser/__init__.py
@@ -41,4 +41,10 @@ from .api import parse
 from .datetimes import registerDateHandler
 from .exceptions import *

-api.USER_AGENT = USER_AGENT
+# If you want feedparser to automatically resolve all relative URIs, set this
+# to 1.
+RESOLVE_RELATIVE_URIS = 1
+
+# If you want feedparser to automatically sanitize all potentially unsafe
+# HTML content, set this to 1.
+SANITIZE_HTML = 1
--- a/lib/feedparser/api.py
+++ b/lib/feedparser/api.py
@@ -75,17 +75,7 @@ except NameError:
 # of pre-installed parsers until it finds one that supports everything we need.
 PREFERRED_XML_PARSERS = ["drv_libxml2"]

-# If you want feedparser to automatically resolve all relative URIs, set this
-# to 1.
-RESOLVE_RELATIVE_URIS = 1
-
-# If you want feedparser to automatically sanitize all potentially unsafe
-# HTML content, set this to 1.
-SANITIZE_HTML = 1
-
 _XML_AVAILABLE = True
-mixin.RESOLVE_RELATIVE_URIS = RESOLVE_RELATIVE_URIS
-mixin.SANITIZE_HTML = SANITIZE_HTML

 SUPPORTED_VERSIONS = {
    '': 'unknown',
@@ -175,17 +165,61 @@ StrictFeedParser = type(str('StrictFeedParser'), (
    _StrictFeedParser, _FeedParserMixin, xml.sax.handler.ContentHandler, object
 ), {})

-def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
+def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None, resolve_relative_uris=None, sanitize_html=None):
    '''Parse a feed from a URL, file, stream, or string.

-    request_headers, if given, is a dict from http header name to value to add
-    to the request; this overrides internally generated values.
+    :param url_file_stream_or_string:
+        File-like object, URL, file path, or string. Both byte and text strings
+        are accepted. If necessary, encoding will be derived from the response
+        headers or automatically detected.
+
+        Note that strings may trigger network I/O or filesystem access
+        depending on the value. Wrap an untrusted string in
+        a :class:`io.StringIO` or :class:`io.BytesIO` to avoid this. Do not
+        pass untrusted strings to this function.
+
+        When a URL is not passed, the feed location to use in relative URL
+        resolution should be passed in the ``Content-Location`` response header
+        (see ``response_headers`` below).
+
+    :param str etag: HTTP ``ETag`` request header.
+    :param modified: HTTP ``Last-Modified`` request header.
+    :type modified: :class:`str`, :class:`time.struct_time` 9-tuple, or
+        :class:`datetime.datetime`
+    :param str agent: HTTP ``User-Agent`` request header, which defaults to
+        the value of :data:`feedparser.USER_AGENT`.
+    :param referrer: HTTP ``Referer`` [sic] request header.
+    :param request_headers:
+        A mapping of HTTP header name to HTTP header value to add to the
+        request, overriding internally generated values.
+    :type request_headers: :class:`dict` mapping :class:`str` to :class:`str`
+    :param response_headers:
+        A mapping of HTTP header name to HTTP header value. Multiple values may
+        be joined with a comma. If an HTTP request was made, these headers
+        override any matching headers in the response. Otherwise this specifies
+        the entirety of the response headers.
+    :type response_headers: :class:`dict` mapping :class:`str` to :class:`str`
+
+    :param bool resolve_relative_uris:
+        Should feedparser attempt to resolve relative URIs to absolute ones within
+        HTML content?  Defaults to the value of
+        :data:`feedparser.RESOLVE_RELATIVE_URIS`, which is ``True``.
+    :param bool sanitize_html:
+        Should feedparser sanitize HTML content? Only disable this if you know
+        what you are doing!  Defaults to the value of
+        :data:`feedparser.SANITIZE_HTML`, which is ``True``.

    :return: A :class:`FeedParserDict`.
    '''
-
+    if not agent or sanitize_html is None or resolve_relative_uris is None:
+        import feedparser
    if not agent:
-        agent = USER_AGENT
+        agent = feedparser.USER_AGENT
+    if sanitize_html is None:
+        sanitize_html = feedparser.SANITIZE_HTML
+    if resolve_relative_uris is None:
+        resolve_relative_uris = feedparser.RESOLVE_RELATIVE_URIS
+
    result = FeedParserDict(
        bozo = False,
        entries = [],
@@ -220,6 +254,8 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
    if use_strict_parser:
        # initialize the SAX parser
        feedparser = StrictFeedParser(baseuri, baselang, 'utf-8')
+        feedparser.resolve_relative_uris = resolve_relative_uris
+        feedparser.sanitize_html = sanitize_html
        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        try:
@@ -239,6 +275,8 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
            use_strict_parser = 0
    if not use_strict_parser and _SGML_AVAILABLE:
        feedparser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
+        feedparser.resolve_relative_uris = resolve_relative_uris
+        feedparser.sanitize_html = sanitize_html
        feedparser.feed(data.decode('utf-8', 'replace'))
    result['feed'] = feedparser.feeddata
    result['entries'] = feedparser.entries
--- a/lib/feedparser/encodings.py
+++ b/lib/feedparser/encodings.py
@@ -34,7 +34,10 @@ import collections
 import re

 try:
-    import chardet
+    try:
+        import cchardet as chardet
+    except ImportError:
+        import chardet
 except ImportError:
    chardet = None
    lazy_chardet_encoding = None
--- a/lib/feedparser/mixin.py
+++ b/lib/feedparser/mixin.py
@@ -515,12 +515,12 @@ class _FeedParserMixin(

        is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
        # resolve relative URIs within embedded markup
-        if is_htmlish and RESOLVE_RELATIVE_URIS:
+        if is_htmlish and self.resolve_relative_uris:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))

        # sanitize embedded markup
-        if is_htmlish and SANITIZE_HTML:
+        if is_htmlish and self.sanitize_html:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))