Merge branch 'feature/UpdateFeedParser' into develop

This commit is contained in:
JackDandy 2018-03-28 00:42:09 +01:00
commit 09d00d7d2c
5 changed files with 67 additions and 19 deletions

View file

@ -4,6 +4,7 @@
* Update cachecontrol library 0.12.3 (db54c40) to 0.12.4 (bd94f7e) * Update cachecontrol library 0.12.3 (db54c40) to 0.12.4 (bd94f7e)
* Update chardet packages 3.0.4 (9b8c5c2) to 4.0.0 (b3d867a) * Update chardet packages 3.0.4 (9b8c5c2) to 4.0.0 (b3d867a)
* Update dateutil library 2.6.1 (2f3a160) to 2.7.2 (ff03c0f) * Update dateutil library 2.6.1 (2f3a160) to 2.7.2 (ff03c0f)
* Update feedparser library 5.2.1 (f1dd1bb) to 5.2.1 (5646f4c) - Uses the faster cchardet if installed
[develop changelog] [develop changelog]

View file

@ -41,4 +41,10 @@ from .api import parse
from .datetimes import registerDateHandler from .datetimes import registerDateHandler
from .exceptions import * from .exceptions import *
api.USER_AGENT = USER_AGENT # If you want feedparser to automatically resolve all relative URIs, set this
# to 1.
RESOLVE_RELATIVE_URIS = 1
# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
SANITIZE_HTML = 1

View file

@ -75,17 +75,7 @@ except NameError:
# of pre-installed parsers until it finds one that supports everything we need. # of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"] PREFERRED_XML_PARSERS = ["drv_libxml2"]
# If you want feedparser to automatically resolve all relative URIs, set this
# to 1.
RESOLVE_RELATIVE_URIS = 1
# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
SANITIZE_HTML = 1
_XML_AVAILABLE = True _XML_AVAILABLE = True
mixin.RESOLVE_RELATIVE_URIS = RESOLVE_RELATIVE_URIS
mixin.SANITIZE_HTML = SANITIZE_HTML
SUPPORTED_VERSIONS = { SUPPORTED_VERSIONS = {
'': 'unknown', '': 'unknown',
@ -175,17 +165,61 @@ StrictFeedParser = type(str('StrictFeedParser'), (
_StrictFeedParser, _FeedParserMixin, xml.sax.handler.ContentHandler, object _StrictFeedParser, _FeedParserMixin, xml.sax.handler.ContentHandler, object
), {}) ), {})
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None): def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None, resolve_relative_uris=None, sanitize_html=None):
'''Parse a feed from a URL, file, stream, or string. '''Parse a feed from a URL, file, stream, or string.
request_headers, if given, is a dict from http header name to value to add :param url_file_stream_or_string:
to the request; this overrides internally generated values. File-like object, URL, file path, or string. Both byte and text strings
are accepted. If necessary, encoding will be derived from the response
headers or automatically detected.
Note that strings may trigger network I/O or filesystem access
depending on the value. Wrap an untrusted string in
a :class:`io.StringIO` or :class:`io.BytesIO` to avoid this. Do not
pass untrusted strings to this function.
When a URL is not passed, the feed location to use in relative URL
resolution should be passed in the ``Content-Location`` response header
(see ``response_headers`` below).
:param str etag: HTTP ``ETag`` request header.
:param modified: HTTP ``Last-Modified`` request header.
:type modified: :class:`str`, :class:`time.struct_time` 9-tuple, or
:class:`datetime.datetime`
:param str agent: HTTP ``User-Agent`` request header, which defaults to
the value of :data:`feedparser.USER_AGENT`.
:param referrer: HTTP ``Referer`` [sic] request header.
:param request_headers:
A mapping of HTTP header name to HTTP header value to add to the
request, overriding internally generated values.
:type request_headers: :class:`dict` mapping :class:`str` to :class:`str`
:param response_headers:
A mapping of HTTP header name to HTTP header value. Multiple values may
be joined with a comma. If a HTTP request was made, these headers
override any matching headers in the response. Otherwise this specifies
the entirety of the response headers.
:type response_headers: :class:`dict` mapping :class:`str` to :class:`str`
:param bool resolve_relative_uris:
Should feedparser attempt to resolve relative URIs to absolute ones within
HTML content? Defaults to the value of
:data:`feedparser.RESOLVE_RELATIVE_URIS`, which is ``True``.
:param bool sanitize_html:
Should feedparser sanitize HTML content? Only disable this if you know
what you are doing! Defaults to the value of
:data:`feedparser.SANITIZE_HTML`, which is ``True``.
:return: A :class:`FeedParserDict`. :return: A :class:`FeedParserDict`.
''' '''
if not agent or sanitize_html is None or resolve_relative_uris is None:
import feedparser
if not agent: if not agent:
agent = USER_AGENT agent = feedparser.USER_AGENT
if sanitize_html is None:
sanitize_html = feedparser.SANITIZE_HTML
if resolve_relative_uris is None:
resolve_relative_uris = feedparser.RESOLVE_RELATIVE_URIS
result = FeedParserDict( result = FeedParserDict(
bozo = False, bozo = False,
entries = [], entries = [],
@ -220,6 +254,8 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
if use_strict_parser: if use_strict_parser:
# initialize the SAX parser # initialize the SAX parser
feedparser = StrictFeedParser(baseuri, baselang, 'utf-8') feedparser = StrictFeedParser(baseuri, baselang, 'utf-8')
feedparser.resolve_relative_uris = resolve_relative_uris
feedparser.sanitize_html = sanitize_html
saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS) saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
saxparser.setFeature(xml.sax.handler.feature_namespaces, 1) saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
try: try:
@ -239,6 +275,8 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
use_strict_parser = 0 use_strict_parser = 0
if not use_strict_parser and _SGML_AVAILABLE: if not use_strict_parser and _SGML_AVAILABLE:
feedparser = LooseFeedParser(baseuri, baselang, 'utf-8', entities) feedparser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
feedparser.resolve_relative_uris = resolve_relative_uris
feedparser.sanitize_html = sanitize_html
feedparser.feed(data.decode('utf-8', 'replace')) feedparser.feed(data.decode('utf-8', 'replace'))
result['feed'] = feedparser.feeddata result['feed'] = feedparser.feeddata
result['entries'] = feedparser.entries result['entries'] = feedparser.entries

View file

@ -34,7 +34,10 @@ import collections
import re import re
try: try:
import chardet try:
import cchardet as chardet
except ImportError:
import chardet
except ImportError: except ImportError:
chardet = None chardet = None
lazy_chardet_encoding = None lazy_chardet_encoding = None

View file

@ -515,12 +515,12 @@ class _FeedParserMixin(
is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
# resolve relative URIs within embedded markup # resolve relative URIs within embedded markup
if is_htmlish and RESOLVE_RELATIVE_URIS: if is_htmlish and self.resolve_relative_uris:
if element in self.can_contain_relative_uris: if element in self.can_contain_relative_uris:
output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html')) output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
# sanitize embedded markup # sanitize embedded markup
if is_htmlish and SANITIZE_HTML: if is_htmlish and self.sanitize_html:
if element in self.can_contain_dangerous_markup: if element in self.can_contain_dangerous_markup:
output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html')) output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))