Merge branch 'feature/UpdateFeedParser' into develop

This commit is contained in:
JackDandy 2018-03-28 00:42:09 +01:00
commit 09d00d7d2c
5 changed files with 67 additions and 19 deletions

View file

@@ -4,6 +4,7 @@
* Update cachecontrol library 0.12.3 (db54c40) to 0.12.4 (bd94f7e) * Update cachecontrol library 0.12.3 (db54c40) to 0.12.4 (bd94f7e)
* Update chardet packages 3.0.4 (9b8c5c2) to 4.0.0 (b3d867a) * Update chardet packages 3.0.4 (9b8c5c2) to 4.0.0 (b3d867a)
* Update dateutil library 2.6.1 (2f3a160) to 2.7.2 (ff03c0f) * Update dateutil library 2.6.1 (2f3a160) to 2.7.2 (ff03c0f)
* Update feedparser library 5.2.1 (f1dd1bb) to 5.2.1 (5646f4c) - Uses the faster cchardet if installed
[develop changelog] [develop changelog]

View file

@@ -41,4 +41,10 @@ from .api import parse
from .datetimes import registerDateHandler from .datetimes import registerDateHandler
from .exceptions import * from .exceptions import *
api.USER_AGENT = USER_AGENT # If you want feedparser to automatically resolve all relative URIs, set this
# to 1.
RESOLVE_RELATIVE_URIS = 1
# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
SANITIZE_HTML = 1

View file

@@ -75,17 +75,7 @@ except NameError:
# of pre-installed parsers until it finds one that supports everything we need. # of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"] PREFERRED_XML_PARSERS = ["drv_libxml2"]
# If you want feedparser to automatically resolve all relative URIs, set this
# to 1.
RESOLVE_RELATIVE_URIS = 1
# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
SANITIZE_HTML = 1
_XML_AVAILABLE = True _XML_AVAILABLE = True
mixin.RESOLVE_RELATIVE_URIS = RESOLVE_RELATIVE_URIS
mixin.SANITIZE_HTML = SANITIZE_HTML
SUPPORTED_VERSIONS = { SUPPORTED_VERSIONS = {
'': 'unknown', '': 'unknown',
@@ -175,17 +165,61 @@ StrictFeedParser = type(str('StrictFeedParser'), (
_StrictFeedParser, _FeedParserMixin, xml.sax.handler.ContentHandler, object _StrictFeedParser, _FeedParserMixin, xml.sax.handler.ContentHandler, object
), {}) ), {})
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None): def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None, resolve_relative_uris=None, sanitize_html=None):
'''Parse a feed from a URL, file, stream, or string. '''Parse a feed from a URL, file, stream, or string.
request_headers, if given, is a dict from http header name to value to add :param url_file_stream_or_string:
to the request; this overrides internally generated values. File-like object, URL, file path, or string. Both byte and text strings
are accepted. If necessary, encoding will be derived from the response
headers or automatically detected.
Note that strings may trigger network I/O or filesystem access
depending on the value. Wrap an untrusted string in
a :class:`io.StringIO` or :class:`io.BytesIO` to avoid this. Do not
pass untrusted strings to this function.
When a URL is not passed the feed location to use in relative URL
resolution should be passed in the ``Content-Location`` response header
(see ``response_headers`` below).
:param str etag: HTTP ``ETag`` request header.
:param modified: HTTP ``Last-Modified`` request header.
:type modified: :class:`str`, :class:`time.struct_time` 9-tuple, or
:class:`datetime.datetime`
:param str agent: HTTP ``User-Agent`` request header, which defaults to
the value of :data:`feedparser.USER_AGENT`.
:param referrer: HTTP ``Referer`` [sic] request header.
:param request_headers:
A mapping of HTTP header name to HTTP header value to add to the
request, overriding internally generated values.
:type request_headers: :class:`dict` mapping :class:`str` to :class:`str`
:param response_headers:
A mapping of HTTP header name to HTTP header value. Multiple values may
be joined with a comma. If an HTTP request was made, these headers
override any matching headers in the response. Otherwise this specifies
the entirety of the response headers.
:type response_headers: :class:`dict` mapping :class:`str` to :class:`str`
:param bool resolve_relative_uris:
Should feedparser attempt to resolve relative URIs to absolute ones within
HTML content? Defaults to the value of
:data:`feedparser.RESOLVE_RELATIVE_URIS`, which is ``True``.
:param bool sanitize_html:
Should feedparser skip HTML sanitization? Only disable this if you know
what you are doing! Defaults to the value of
:data:`feedparser.SANITIZE_HTML`, which is ``True``.
:return: A :class:`FeedParserDict`. :return: A :class:`FeedParserDict`.
''' '''
if not agent or sanitize_html is None or resolve_relative_uris is None:
import feedparser
if not agent: if not agent:
agent = USER_AGENT agent = feedparser.USER_AGENT
if sanitize_html is None:
sanitize_html = feedparser.SANITIZE_HTML
if resolve_relative_uris is None:
resolve_relative_uris = feedparser.RESOLVE_RELATIVE_URIS
result = FeedParserDict( result = FeedParserDict(
bozo = False, bozo = False,
entries = [], entries = [],
@@ -220,6 +254,8 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
if use_strict_parser: if use_strict_parser:
# initialize the SAX parser # initialize the SAX parser
feedparser = StrictFeedParser(baseuri, baselang, 'utf-8') feedparser = StrictFeedParser(baseuri, baselang, 'utf-8')
feedparser.resolve_relative_uris = resolve_relative_uris
feedparser.sanitize_html = sanitize_html
saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS) saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
saxparser.setFeature(xml.sax.handler.feature_namespaces, 1) saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
try: try:
@@ -239,6 +275,8 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
use_strict_parser = 0 use_strict_parser = 0
if not use_strict_parser and _SGML_AVAILABLE: if not use_strict_parser and _SGML_AVAILABLE:
feedparser = LooseFeedParser(baseuri, baselang, 'utf-8', entities) feedparser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
feedparser.resolve_relative_uris = resolve_relative_uris
feedparser.sanitize_html = sanitize_html
feedparser.feed(data.decode('utf-8', 'replace')) feedparser.feed(data.decode('utf-8', 'replace'))
result['feed'] = feedparser.feeddata result['feed'] = feedparser.feeddata
result['entries'] = feedparser.entries result['entries'] = feedparser.entries

View file

@@ -34,6 +34,9 @@ import collections
import re import re
try: try:
try:
import cchardet as chardet
except ImportError:
import chardet import chardet
except ImportError: except ImportError:
chardet = None chardet = None

View file

@@ -515,12 +515,12 @@ class _FeedParserMixin(
is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
# resolve relative URIs within embedded markup # resolve relative URIs within embedded markup
if is_htmlish and RESOLVE_RELATIVE_URIS: if is_htmlish and self.resolve_relative_uris:
if element in self.can_contain_relative_uris: if element in self.can_contain_relative_uris:
output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html')) output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
# sanitize embedded markup # sanitize embedded markup
if is_htmlish and SANITIZE_HTML: if is_htmlish and self.sanitize_html:
if element in self.can_contain_dangerous_markup: if element in self.can_contain_dangerous_markup:
output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html')) output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))