mirror of
https://github.com/SickGear/SickGear.git
synced 2025-01-22 09:33:37 +00:00
Merge branch 'feature/UpdateFeedParser' into develop
This commit is contained in:
commit
09d00d7d2c
5 changed files with 67 additions and 19 deletions
|
@ -4,6 +4,7 @@
|
||||||
* Update cachecontrol library 0.12.3 (db54c40) to 0.12.4 (bd94f7e)
|
* Update cachecontrol library 0.12.3 (db54c40) to 0.12.4 (bd94f7e)
|
||||||
* Update chardet packages 3.0.4 (9b8c5c2) to 4.0.0 (b3d867a)
|
* Update chardet packages 3.0.4 (9b8c5c2) to 4.0.0 (b3d867a)
|
||||||
* Update dateutil library 2.6.1 (2f3a160) to 2.7.2 (ff03c0f)
|
* Update dateutil library 2.6.1 (2f3a160) to 2.7.2 (ff03c0f)
|
||||||
|
* Update feedparser library 5.2.1 (f1dd1bb) to 5.2.1 (5646f4c) - Uses the faster cchardet if installed
|
||||||
|
|
||||||
[develop changelog]
|
[develop changelog]
|
||||||
|
|
||||||
|
|
|
@ -41,4 +41,10 @@ from .api import parse
|
||||||
from .datetimes import registerDateHandler
|
from .datetimes import registerDateHandler
|
||||||
from .exceptions import *
|
from .exceptions import *
|
||||||
|
|
||||||
api.USER_AGENT = USER_AGENT
|
# If you want feedparser to automatically resolve all relative URIs, set this
|
||||||
|
# to 1.
|
||||||
|
RESOLVE_RELATIVE_URIS = 1
|
||||||
|
|
||||||
|
# If you want feedparser to automatically sanitize all potentially unsafe
|
||||||
|
# HTML content, set this to 1.
|
||||||
|
SANITIZE_HTML = 1
|
||||||
|
|
|
@ -75,17 +75,7 @@ except NameError:
|
||||||
# of pre-installed parsers until it finds one that supports everything we need.
|
# of pre-installed parsers until it finds one that supports everything we need.
|
||||||
PREFERRED_XML_PARSERS = ["drv_libxml2"]
|
PREFERRED_XML_PARSERS = ["drv_libxml2"]
|
||||||
|
|
||||||
# If you want feedparser to automatically resolve all relative URIs, set this
|
|
||||||
# to 1.
|
|
||||||
RESOLVE_RELATIVE_URIS = 1
|
|
||||||
|
|
||||||
# If you want feedparser to automatically sanitize all potentially unsafe
|
|
||||||
# HTML content, set this to 1.
|
|
||||||
SANITIZE_HTML = 1
|
|
||||||
|
|
||||||
_XML_AVAILABLE = True
|
_XML_AVAILABLE = True
|
||||||
mixin.RESOLVE_RELATIVE_URIS = RESOLVE_RELATIVE_URIS
|
|
||||||
mixin.SANITIZE_HTML = SANITIZE_HTML
|
|
||||||
|
|
||||||
SUPPORTED_VERSIONS = {
|
SUPPORTED_VERSIONS = {
|
||||||
'': 'unknown',
|
'': 'unknown',
|
||||||
|
@ -175,17 +165,61 @@ StrictFeedParser = type(str('StrictFeedParser'), (
|
||||||
_StrictFeedParser, _FeedParserMixin, xml.sax.handler.ContentHandler, object
|
_StrictFeedParser, _FeedParserMixin, xml.sax.handler.ContentHandler, object
|
||||||
), {})
|
), {})
|
||||||
|
|
||||||
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
|
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None, resolve_relative_uris=None, sanitize_html=None):
|
||||||
'''Parse a feed from a URL, file, stream, or string.
|
'''Parse a feed from a URL, file, stream, or string.
|
||||||
|
|
||||||
request_headers, if given, is a dict from http header name to value to add
|
:param url_file_stream_or_string:
|
||||||
to the request; this overrides internally generated values.
|
File-like object, URL, file path, or string. Both byte and text strings
|
||||||
|
are accepted. If necessary, encoding will be derived from the response
|
||||||
|
headers or automatically detected.
|
||||||
|
|
||||||
|
Note that strings may trigger network I/O or filesystem access
|
||||||
|
depending on the value. Wrap an untrusted string in
|
||||||
|
a :class:`io.StringIO` or :class:`io.BytesIO` to avoid this. Do not
|
||||||
|
pass untrusted strings to this function.
|
||||||
|
|
||||||
|
When a URL is not passed the feed location to use in relative URL
|
||||||
|
resolution should be passed in the ``Content-Location`` response header
|
||||||
|
(see ``response_headers`` below).
|
||||||
|
|
||||||
|
:param str etag: HTTP ``ETag`` request header.
|
||||||
|
:param modified: HTTP ``Last-Modified`` request header.
|
||||||
|
:type modified: :class:`str`, :class:`time.struct_time` 9-tuple, or
|
||||||
|
:class:`datetime.datetime`
|
||||||
|
:param str agent: HTTP ``User-Agent`` request header, which defaults to
|
||||||
|
the value of :data:`feedparser.USER_AGENT`.
|
||||||
|
:param referrer: HTTP ``Referer`` [sic] request header.
|
||||||
|
:param request_headers:
|
||||||
|
A mapping of HTTP header name to HTTP header value to add to the
|
||||||
|
request, overriding internally generated values.
|
||||||
|
:type request_headers: :class:`dict` mapping :class:`str` to :class:`str`
|
||||||
|
:param response_headers:
|
||||||
|
A mapping of HTTP header name to HTTP header value. Multiple values may
|
||||||
|
be joined with a comma. If a HTTP request was made, these headers
|
||||||
|
override any matching headers in the response. Otherwise this specifies
|
||||||
|
the entirety of the response headers.
|
||||||
|
:type response_headers: :class:`dict` mapping :class:`str` to :class:`str`
|
||||||
|
|
||||||
|
:param bool resolve_relative_uris:
|
||||||
|
Should feedparser attempt to resolve relative URIs to absolute ones within
|
||||||
|
HTML content? Defaults to the value of
|
||||||
|
:data:`feedparser.RESOLVE_RELATIVE_URIS`, which is ``True``.
|
||||||
|
:param bool sanitize_html:
|
||||||
|
Should feedparser sanitize HTML content? Only disable this if you know
|
||||||
|
what you are doing! Defaults to the value of
|
||||||
|
:data:`feedparser.SANITIZE_HTML`, which is ``True``.
|
||||||
|
|
||||||
:return: A :class:`FeedParserDict`.
|
:return: A :class:`FeedParserDict`.
|
||||||
'''
|
'''
|
||||||
|
if not agent or sanitize_html is None or resolve_relative_uris is None:
|
||||||
|
import feedparser
|
||||||
if not agent:
|
if not agent:
|
||||||
agent = USER_AGENT
|
agent = feedparser.USER_AGENT
|
||||||
|
if sanitize_html is None:
|
||||||
|
sanitize_html = feedparser.SANITIZE_HTML
|
||||||
|
if resolve_relative_uris is None:
|
||||||
|
resolve_relative_uris = feedparser.RESOLVE_RELATIVE_URIS
|
||||||
|
|
||||||
result = FeedParserDict(
|
result = FeedParserDict(
|
||||||
bozo = False,
|
bozo = False,
|
||||||
entries = [],
|
entries = [],
|
||||||
|
@ -220,6 +254,8 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
|
||||||
if use_strict_parser:
|
if use_strict_parser:
|
||||||
# initialize the SAX parser
|
# initialize the SAX parser
|
||||||
feedparser = StrictFeedParser(baseuri, baselang, 'utf-8')
|
feedparser = StrictFeedParser(baseuri, baselang, 'utf-8')
|
||||||
|
feedparser.resolve_relative_uris = resolve_relative_uris
|
||||||
|
feedparser.sanitize_html = sanitize_html
|
||||||
saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
|
saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
|
||||||
saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
|
saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
|
||||||
try:
|
try:
|
||||||
|
@ -239,6 +275,8 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
|
||||||
use_strict_parser = 0
|
use_strict_parser = 0
|
||||||
if not use_strict_parser and _SGML_AVAILABLE:
|
if not use_strict_parser and _SGML_AVAILABLE:
|
||||||
feedparser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
|
feedparser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
|
||||||
|
feedparser.resolve_relative_uris = resolve_relative_uris
|
||||||
|
feedparser.sanitize_html = sanitize_html
|
||||||
feedparser.feed(data.decode('utf-8', 'replace'))
|
feedparser.feed(data.decode('utf-8', 'replace'))
|
||||||
result['feed'] = feedparser.feeddata
|
result['feed'] = feedparser.feeddata
|
||||||
result['entries'] = feedparser.entries
|
result['entries'] = feedparser.entries
|
||||||
|
|
|
@ -34,7 +34,10 @@ import collections
|
||||||
import re
|
import re
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import chardet
|
try:
|
||||||
|
import cchardet as chardet
|
||||||
|
except ImportError:
|
||||||
|
import chardet
|
||||||
except ImportError:
|
except ImportError:
|
||||||
chardet = None
|
chardet = None
|
||||||
lazy_chardet_encoding = None
|
lazy_chardet_encoding = None
|
||||||
|
|
|
@ -515,12 +515,12 @@ class _FeedParserMixin(
|
||||||
|
|
||||||
is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
|
is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
|
||||||
# resolve relative URIs within embedded markup
|
# resolve relative URIs within embedded markup
|
||||||
if is_htmlish and RESOLVE_RELATIVE_URIS:
|
if is_htmlish and self.resolve_relative_uris:
|
||||||
if element in self.can_contain_relative_uris:
|
if element in self.can_contain_relative_uris:
|
||||||
output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
|
output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
|
||||||
|
|
||||||
# sanitize embedded markup
|
# sanitize embedded markup
|
||||||
if is_htmlish and SANITIZE_HTML:
|
if is_htmlish and self.sanitize_html:
|
||||||
if element in self.can_contain_dangerous_markup:
|
if element in self.can_contain_dangerous_markup:
|
||||||
output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))
|
output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue