mirror of
https://github.com/SickGear/SickGear.git
synced 2025-01-22 09:33:37 +00:00
Merge branch 'feature/UpdateFeedParser' into develop
This commit is contained in:
commit
09d00d7d2c
5 changed files with 67 additions and 19 deletions
|
@ -4,6 +4,7 @@
|
|||
* Update cachecontrol library 0.12.3 (db54c40) to 0.12.4 (bd94f7e)
|
||||
* Update chardet packages 3.0.4 (9b8c5c2) to 4.0.0 (b3d867a)
|
||||
* Update dateutil library 2.6.1 (2f3a160) to 2.7.2 (ff03c0f)
|
||||
* Update feedparser library 5.2.1 (f1dd1bb) to 5.2.1 (5646f4c) - Uses the faster cchardet if installed
|
||||
|
||||
[develop changelog]
|
||||
|
||||
|
|
|
@ -41,4 +41,10 @@ from .api import parse
|
|||
from .datetimes import registerDateHandler
|
||||
from .exceptions import *
|
||||
|
||||
api.USER_AGENT = USER_AGENT
|
||||
# If you want feedparser to automatically resolve all relative URIs, set this
|
||||
# to 1.
|
||||
RESOLVE_RELATIVE_URIS = 1
|
||||
|
||||
# If you want feedparser to automatically sanitize all potentially unsafe
|
||||
# HTML content, set this to 1.
|
||||
SANITIZE_HTML = 1
|
||||
|
|
|
@ -75,17 +75,7 @@ except NameError:
|
|||
# of pre-installed parsers until it finds one that supports everything we need.
|
||||
PREFERRED_XML_PARSERS = ["drv_libxml2"]
|
||||
|
||||
# If you want feedparser to automatically resolve all relative URIs, set this
|
||||
# to 1.
|
||||
RESOLVE_RELATIVE_URIS = 1
|
||||
|
||||
# If you want feedparser to automatically sanitize all potentially unsafe
|
||||
# HTML content, set this to 1.
|
||||
SANITIZE_HTML = 1
|
||||
|
||||
_XML_AVAILABLE = True
|
||||
mixin.RESOLVE_RELATIVE_URIS = RESOLVE_RELATIVE_URIS
|
||||
mixin.SANITIZE_HTML = SANITIZE_HTML
|
||||
|
||||
SUPPORTED_VERSIONS = {
|
||||
'': 'unknown',
|
||||
|
@ -175,17 +165,61 @@ StrictFeedParser = type(str('StrictFeedParser'), (
|
|||
_StrictFeedParser, _FeedParserMixin, xml.sax.handler.ContentHandler, object
|
||||
), {})
|
||||
|
||||
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
|
||||
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None, resolve_relative_uris=None, sanitize_html=None):
|
||||
'''Parse a feed from a URL, file, stream, or string.
|
||||
|
||||
request_headers, if given, is a dict from http header name to value to add
|
||||
to the request; this overrides internally generated values.
|
||||
:param url_file_stream_or_string:
|
||||
File-like object, URL, file path, or string. Both byte and text strings
|
||||
are accepted. If necessary, encoding will be derived from the response
|
||||
headers or automatically detected.
|
||||
|
||||
Note that strings may trigger network I/O or filesystem access
|
||||
depending on the value. Wrap an untrusted string in
|
||||
a :class:`io.StringIO` or :class:`io.BytesIO` to avoid this. Do not
|
||||
pass untrusted strings to this function.
|
||||
|
||||
When a URL is not passed the feed location to use in relative URL
|
||||
resolution should be passed in the ``Content-Location`` response header
|
||||
(see ``response_headers`` below).
|
||||
|
||||
:param str etag: HTTP ``ETag`` request header.
|
||||
:param modified: HTTP ``Last-Modified`` request header.
|
||||
:type modified: :class:`str`, :class:`time.struct_time` 9-tuple, or
|
||||
:class:`datetime.datetime`
|
||||
:param str agent: HTTP ``User-Agent`` request header, which defaults to
|
||||
the value of :data:`feedparser.USER_AGENT`.
|
||||
:param referrer: HTTP ``Referer`` [sic] request header.
|
||||
:param request_headers:
|
||||
A mapping of HTTP header name to HTTP header value to add to the
|
||||
request, overriding internally generated values.
|
||||
:type request_headers: :class:`dict` mapping :class:`str` to :class:`str`
|
||||
:param response_headers:
|
||||
A mapping of HTTP header name to HTTP header value. Multiple values may
|
||||
be joined with a comma. If an HTTP request was made, these headers
|
||||
override any matching headers in the response. Otherwise this specifies
|
||||
the entirety of the response headers.
|
||||
:type response_headers: :class:`dict` mapping :class:`str` to :class:`str`
|
||||
|
||||
:param bool resolve_relative_uris:
|
||||
Should feedparser attempt to resolve relative URIs to absolute ones within
|
||||
HTML content? Defaults to the value of
|
||||
:data:`feedparser.RESOLVE_RELATIVE_URIS`, which is ``True``.
|
||||
:param bool sanitize_html:
|
||||
Should feedparser sanitize HTML content? Only disable this if you know
|
||||
what you are doing! Defaults to the value of
|
||||
:data:`feedparser.SANITIZE_HTML`, which is ``True``.
|
||||
|
||||
:return: A :class:`FeedParserDict`.
|
||||
'''
|
||||
|
||||
if not agent or sanitize_html is None or resolve_relative_uris is None:
|
||||
import feedparser
|
||||
if not agent:
|
||||
agent = USER_AGENT
|
||||
agent = feedparser.USER_AGENT
|
||||
if sanitize_html is None:
|
||||
sanitize_html = feedparser.SANITIZE_HTML
|
||||
if resolve_relative_uris is None:
|
||||
resolve_relative_uris = feedparser.RESOLVE_RELATIVE_URIS
|
||||
|
||||
result = FeedParserDict(
|
||||
bozo = False,
|
||||
entries = [],
|
||||
|
@ -220,6 +254,8 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
|
|||
if use_strict_parser:
|
||||
# initialize the SAX parser
|
||||
feedparser = StrictFeedParser(baseuri, baselang, 'utf-8')
|
||||
feedparser.resolve_relative_uris = resolve_relative_uris
|
||||
feedparser.sanitize_html = sanitize_html
|
||||
saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
|
||||
saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
|
||||
try:
|
||||
|
@ -239,6 +275,8 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
|
|||
use_strict_parser = 0
|
||||
if not use_strict_parser and _SGML_AVAILABLE:
|
||||
feedparser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
|
||||
feedparser.resolve_relative_uris = resolve_relative_uris
|
||||
feedparser.sanitize_html = sanitize_html
|
||||
feedparser.feed(data.decode('utf-8', 'replace'))
|
||||
result['feed'] = feedparser.feeddata
|
||||
result['entries'] = feedparser.entries
|
||||
|
|
|
@ -34,7 +34,10 @@ import collections
|
|||
import re
|
||||
|
||||
try:
|
||||
import chardet
|
||||
try:
|
||||
import cchardet as chardet
|
||||
except ImportError:
|
||||
import chardet
|
||||
except ImportError:
|
||||
chardet = None
|
||||
lazy_chardet_encoding = None
|
||||
|
|
|
@ -515,12 +515,12 @@ class _FeedParserMixin(
|
|||
|
||||
is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
|
||||
# resolve relative URIs within embedded markup
|
||||
if is_htmlish and RESOLVE_RELATIVE_URIS:
|
||||
if is_htmlish and self.resolve_relative_uris:
|
||||
if element in self.can_contain_relative_uris:
|
||||
output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
|
||||
|
||||
# sanitize embedded markup
|
||||
if is_htmlish and SANITIZE_HTML:
|
||||
if is_htmlish and self.sanitize_html:
|
||||
if element in self.can_contain_dangerous_markup:
|
||||
output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))
|
||||
|
||||
|
|
Loading…
Reference in a new issue