From 8fe34fb5eb8c8432b4ea1bfafc90bbe6c3771870 Mon Sep 17 00:00:00 2001 From: JackDandy Date: Mon, 26 Mar 2018 19:35:48 +0100 Subject: [PATCH] =?UTF-8?q?Update=20feedparser=20library=205.2.1=20(f1dd1b?= =?UTF-8?q?b)=20=E2=86=92=205.2.1=20(5646f4c)=20-=20Uses=20the=20faster=20?= =?UTF-8?q?cchardet=20if=20installed.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGES.md | 1 + lib/feedparser/__init__.py | 8 ++++- lib/feedparser/api.py | 68 +++++++++++++++++++++++++++++-------- lib/feedparser/encodings.py | 5 ++- lib/feedparser/mixin.py | 4 +-- 5 files changed, 67 insertions(+), 19 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 3ce3ecb9..8ecb35b7 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,6 +4,7 @@ * Update cachecontrol library 0.12.3 (db54c40) to 0.12.4 (bd94f7e) * Update chardet packages 3.0.4 (9b8c5c2) to 4.0.0 (b3d867a) * Update dateutil library 2.6.1 (2f3a160) to 2.7.2 (ff03c0f) +* Update feedparser library 5.2.1 (f1dd1bb) to 5.2.1 (5646f4c) - Uses the faster cchardet if installed [develop changelog] diff --git a/lib/feedparser/__init__.py b/lib/feedparser/__init__.py index a52b39a2..916a61c2 100644 --- a/lib/feedparser/__init__.py +++ b/lib/feedparser/__init__.py @@ -41,4 +41,10 @@ from .api import parse from .datetimes import registerDateHandler from .exceptions import * -api.USER_AGENT = USER_AGENT +# If you want feedparser to automatically resolve all relative URIs, set this +# to 1. +RESOLVE_RELATIVE_URIS = 1 + +# If you want feedparser to automatically sanitize all potentially unsafe +# HTML content, set this to 1. +SANITIZE_HTML = 1 diff --git a/lib/feedparser/api.py b/lib/feedparser/api.py index 614bd2d2..d2d97a64 100644 --- a/lib/feedparser/api.py +++ b/lib/feedparser/api.py @@ -75,17 +75,7 @@ except NameError: # of pre-installed parsers until it finds one that supports everything we need. 
PREFERRED_XML_PARSERS = ["drv_libxml2"] -# If you want feedparser to automatically resolve all relative URIs, set this -# to 1. -RESOLVE_RELATIVE_URIS = 1 - -# If you want feedparser to automatically sanitize all potentially unsafe -# HTML content, set this to 1. -SANITIZE_HTML = 1 - _XML_AVAILABLE = True -mixin.RESOLVE_RELATIVE_URIS = RESOLVE_RELATIVE_URIS -mixin.SANITIZE_HTML = SANITIZE_HTML SUPPORTED_VERSIONS = { '': 'unknown', @@ -175,17 +165,61 @@ StrictFeedParser = type(str('StrictFeedParser'), ( _StrictFeedParser, _FeedParserMixin, xml.sax.handler.ContentHandler, object ), {}) -def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None): +def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None, resolve_relative_uris=None, sanitize_html=None): '''Parse a feed from a URL, file, stream, or string. - request_headers, if given, is a dict from http header name to value to add - to the request; this overrides internally generated values. + :param url_file_stream_or_string: + File-like object, URL, file path, or string. Both byte and text strings + are accepted. If necessary, encoding will be derived from the response + headers or automatically detected. + + Note that strings may trigger network I/O or filesystem access + depending on the value. Wrap an untrusted string in + a :class:`io.StringIO` or :class:`io.BytesIO` to avoid this. Do not + pass untrusted strings to this function. + + When a URL is not passed the feed location to use in relative URL + resolution should be passed in the ``Content-Location`` response header + (see ``response_headers`` below). + + :param str etag: HTTP ``ETag`` request header. + :param modified: HTTP ``Last-Modified`` request header. 
+ :type modified: :class:`str`, :class:`time.struct_time` 9-tuple, or + :class:`datetime.datetime` + :param str agent: HTTP ``User-Agent`` request header, which defaults to + the value of :data:`feedparser.USER_AGENT`. + :param referrer: HTTP ``Referer`` [sic] request header. + :param request_headers: + A mapping of HTTP header name to HTTP header value to add to the + request, overriding internally generated values. + :type request_headers: :class:`dict` mapping :class:`str` to :class:`str` + :param response_headers: + A mapping of HTTP header name to HTTP header value. Multiple values may + be joined with a comma. If an HTTP request was made, these headers + override any matching headers in the response. Otherwise this specifies + the entirety of the response headers. + :type response_headers: :class:`dict` mapping :class:`str` to :class:`str` + + :param bool resolve_relative_uris: + Should feedparser attempt to resolve relative URIs to absolute ones within + HTML content? Defaults to the value of + :data:`feedparser.RESOLVE_RELATIVE_URIS`, which is ``True``. + :param bool sanitize_html: + Should feedparser sanitize potentially unsafe HTML content? Only disable this if you know + what you are doing! Defaults to the value of + :data:`feedparser.SANITIZE_HTML`, which is ``True``. :return: A :class:`FeedParserDict`. 
''' - + if not agent or sanitize_html is None or resolve_relative_uris is None: + import feedparser if not agent: - agent = USER_AGENT + agent = feedparser.USER_AGENT + if sanitize_html is None: + sanitize_html = feedparser.SANITIZE_HTML + if resolve_relative_uris is None: + resolve_relative_uris = feedparser.RESOLVE_RELATIVE_URIS + result = FeedParserDict( bozo = False, entries = [], @@ -220,6 +254,8 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer if use_strict_parser: # initialize the SAX parser feedparser = StrictFeedParser(baseuri, baselang, 'utf-8') + feedparser.resolve_relative_uris = resolve_relative_uris + feedparser.sanitize_html = sanitize_html saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS) saxparser.setFeature(xml.sax.handler.feature_namespaces, 1) try: @@ -239,6 +275,8 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer use_strict_parser = 0 if not use_strict_parser and _SGML_AVAILABLE: feedparser = LooseFeedParser(baseuri, baselang, 'utf-8', entities) + feedparser.resolve_relative_uris = resolve_relative_uris + feedparser.sanitize_html = sanitize_html feedparser.feed(data.decode('utf-8', 'replace')) result['feed'] = feedparser.feeddata result['entries'] = feedparser.entries diff --git a/lib/feedparser/encodings.py b/lib/feedparser/encodings.py index 6bbdaf70..a5a7635d 100644 --- a/lib/feedparser/encodings.py +++ b/lib/feedparser/encodings.py @@ -34,7 +34,10 @@ import collections import re try: - import chardet + try: + import cchardet as chardet + except ImportError: + import chardet except ImportError: chardet = None lazy_chardet_encoding = None diff --git a/lib/feedparser/mixin.py b/lib/feedparser/mixin.py index 263bb0d0..5f97dc80 100644 --- a/lib/feedparser/mixin.py +++ b/lib/feedparser/mixin.py @@ -515,12 +515,12 @@ class _FeedParserMixin( is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types # resolve relative URIs within embedded 
markup - if is_htmlish and RESOLVE_RELATIVE_URIS: + if is_htmlish and self.resolve_relative_uris: if element in self.can_contain_relative_uris: output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html')) # sanitize embedded markup - if is_htmlish and SANITIZE_HTML: + if is_htmlish and self.sanitize_html: if element in self.can_contain_dangerous_markup: output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))