mirror of
https://github.com/SickGear/SickGear.git
synced 2024-12-24 03:33:38 +00:00
Update feedparser 6.0.1 (98d189fa) → 6.0.10 (5fcb3ae).
This commit is contained in:
parent
7343f9ac16
commit
a65c40083f
32 changed files with 391 additions and 193 deletions
|
@ -7,6 +7,7 @@
|
|||
* Update Msgpack 1.0.0 (fa7d744) to 1.0.4 (b5acfd5)
|
||||
* Update certifi 2022.09.24 to 2022.12.07
|
||||
* Update diskcache 5.1.0 (40ce0de) to 5.4.0 (1cb1425)
|
||||
* Update feedparser 6.0.1 (98d189fa) to 6.0.10 (5fcb3ae)
|
||||
* Update humanize 3.5.0 (b6b0ea5) to 4.0.0 (a1514eb)
|
||||
* Update profilehooks module 1.12.0 (3ee1f60) to 1.12.1 (c3fc078)
|
||||
* Update Rarfile 4.0 (55fe778) to 4.1a1 (8a72967)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
@ -32,7 +32,7 @@ from .util import FeedParserDict
|
|||
|
||||
__author__ = 'Kurt McKee <contactme@kurtmckee.org>'
|
||||
__license__ = 'BSD 2-clause'
|
||||
__version__ = '6.0.1'
|
||||
__version__ = '6.0.10'
|
||||
|
||||
# HTTP "User-Agent" header to send to servers when downloading feeds.
|
||||
# If you are embedding feedparser in a larger application, you should
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# The public API for feedparser
|
||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
@ -26,7 +26,11 @@
|
|||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import datetime
|
||||
import io
|
||||
import time
|
||||
from typing import Dict, List, Union
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import xml.sax
|
||||
|
||||
|
@ -34,13 +38,12 @@ import sgmllib3k as sgmllib
|
|||
|
||||
from .datetimes import registerDateHandler, _parse_date
|
||||
from .encodings import convert_to_utf8
|
||||
from .exceptions import *
|
||||
from .html import _BaseHTMLProcessor
|
||||
from .html import BaseHTMLProcessor
|
||||
from . import http
|
||||
from . import mixin
|
||||
from .mixin import _FeedParserMixin
|
||||
from .parsers.loose import _LooseFeedParser
|
||||
from .parsers.strict import _StrictFeedParser
|
||||
from .mixin import XMLParserMixin
|
||||
from .parsers.loose import LooseXMLParser
|
||||
from .parsers.strict import StrictXMLParser
|
||||
from .parsers.json import JSONParser
|
||||
from .sanitizer import replace_doctype
|
||||
from .urls import convert_to_idn, make_safe_absolute_uri
|
||||
from .util import FeedParserDict
|
||||
|
@ -70,6 +73,7 @@ SUPPORTED_VERSIONS = {
|
|||
'atom10': 'Atom 1.0',
|
||||
'atom': 'Atom (unknown version)',
|
||||
'cdf': 'CDF',
|
||||
'json1': 'JSON feed 1',
|
||||
}
|
||||
|
||||
|
||||
|
@ -136,20 +140,25 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
|
|||
return url_file_stream_or_string
|
||||
|
||||
|
||||
LooseFeedParser = type(
|
||||
'LooseFeedParser',
|
||||
(_LooseFeedParser, _FeedParserMixin, _BaseHTMLProcessor, object),
|
||||
{},
|
||||
)
|
||||
class LooseFeedParser(LooseXMLParser, XMLParserMixin, BaseHTMLProcessor):
|
||||
pass
|
||||
|
||||
StrictFeedParser = type(
|
||||
'StrictFeedParser',
|
||||
(_StrictFeedParser, _FeedParserMixin, xml.sax.handler.ContentHandler, object),
|
||||
{},
|
||||
)
|
||||
class StrictFeedParser(StrictXMLParser, XMLParserMixin, xml.sax.handler.ContentHandler):
|
||||
pass
|
||||
|
||||
|
||||
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None, resolve_relative_uris=None, sanitize_html=None):
|
||||
def parse(
|
||||
url_file_stream_or_string,
|
||||
etag: str = None,
|
||||
modified: Union[str, datetime.datetime, time.struct_time] = None,
|
||||
agent: str = None,
|
||||
referrer: str = None,
|
||||
handlers: List = None,
|
||||
request_headers: Dict[str, str] = None,
|
||||
response_headers: Dict[str, str] = None,
|
||||
resolve_relative_uris: bool = None,
|
||||
sanitize_html: bool = None,
|
||||
) -> FeedParserDict:
|
||||
"""Parse a feed from a URL, file, stream, or string.
|
||||
|
||||
:param url_file_stream_or_string:
|
||||
|
@ -165,45 +174,46 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
|
|||
When a URL is not passed the feed location to use in relative URL
|
||||
resolution should be passed in the ``Content-Location`` response header
|
||||
(see ``response_headers`` below).
|
||||
|
||||
:param str etag: HTTP ``ETag`` request header.
|
||||
:param modified: HTTP ``Last-Modified`` request header.
|
||||
:type modified: :class:`str`, :class:`time.struct_time` 9-tuple, or
|
||||
:class:`datetime.datetime`
|
||||
:param str agent: HTTP ``User-Agent`` request header, which defaults to
|
||||
:param etag:
|
||||
HTTP ``ETag`` request header.
|
||||
:param modified:
|
||||
HTTP ``Last-Modified`` request header.
|
||||
:param agent:
|
||||
HTTP ``User-Agent`` request header, which defaults to
|
||||
the value of :data:`feedparser.USER_AGENT`.
|
||||
:param referrer: HTTP ``Referer`` [sic] request header.
|
||||
:param referrer:
|
||||
HTTP ``Referer`` [sic] request header.
|
||||
:param handlers:
|
||||
A list of handlers that will be passed to urllib2.
|
||||
:param request_headers:
|
||||
A mapping of HTTP header name to HTTP header value to add to the
|
||||
request, overriding internally generated values.
|
||||
:type request_headers: :class:`dict` mapping :class:`str` to :class:`str`
|
||||
:param response_headers:
|
||||
A mapping of HTTP header name to HTTP header value. Multiple values may
|
||||
be joined with a comma. If a HTTP request was made, these headers
|
||||
override any matching headers in the response. Otherwise this specifies
|
||||
the entirety of the response headers.
|
||||
:type response_headers: :class:`dict` mapping :class:`str` to :class:`str`
|
||||
|
||||
:param bool resolve_relative_uris:
|
||||
:param resolve_relative_uris:
|
||||
Should feedparser attempt to resolve relative URIs to absolute ones within
|
||||
HTML content? Defaults to the value of
|
||||
:data:`feedparser.RESOLVE_RELATIVE_URIS`, which is ``True``.
|
||||
:param bool sanitize_html:
|
||||
:param sanitize_html:
|
||||
Should feedparser skip HTML sanitization? Only disable this if you know
|
||||
what you are doing! Defaults to the value of
|
||||
:data:`feedparser.SANITIZE_HTML`, which is ``True``.
|
||||
|
||||
:return: A :class:`FeedParserDict`.
|
||||
"""
|
||||
|
||||
if not agent or sanitize_html is None or resolve_relative_uris is None:
|
||||
import feedparser
|
||||
# Avoid a cyclic import.
|
||||
if not agent:
|
||||
import feedparser
|
||||
agent = feedparser.USER_AGENT
|
||||
if sanitize_html is None:
|
||||
sanitize_html = feedparser.SANITIZE_HTML
|
||||
import feedparser
|
||||
sanitize_html = bool(feedparser.SANITIZE_HTML)
|
||||
if resolve_relative_uris is None:
|
||||
resolve_relative_uris = feedparser.RESOLVE_RELATIVE_URIS
|
||||
import feedparser
|
||||
resolve_relative_uris = bool(feedparser.RESOLVE_RELATIVE_URIS)
|
||||
|
||||
result = FeedParserDict(
|
||||
bozo=False,
|
||||
|
@ -212,7 +222,14 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
|
|||
headers={},
|
||||
)
|
||||
|
||||
data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
|
||||
try:
|
||||
data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
|
||||
except urllib.error.URLError as error:
|
||||
result.update({
|
||||
'bozo': True,
|
||||
'bozo_exception': error,
|
||||
})
|
||||
return result
|
||||
|
||||
if not data:
|
||||
return result
|
||||
|
@ -221,9 +238,11 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
|
|||
result['headers'].update(response_headers or {})
|
||||
|
||||
data = convert_to_utf8(result['headers'], data, result)
|
||||
use_json_parser = result['content-type'] == 'application/json'
|
||||
use_strict_parser = result['encoding'] and True or False
|
||||
|
||||
result['version'], data, entities = replace_doctype(data)
|
||||
if not use_json_parser:
|
||||
result['version'], data, entities = replace_doctype(data)
|
||||
|
||||
# Ensure that baseuri is an absolute URI using an acceptable URI scheme.
|
||||
contentloc = result['headers'].get('content-location', '')
|
||||
|
@ -235,36 +254,52 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
|
|||
baselang = baselang.decode('utf-8', 'ignore')
|
||||
|
||||
if not _XML_AVAILABLE:
|
||||
use_strict_parser = 0
|
||||
if use_strict_parser:
|
||||
# initialize the SAX parser
|
||||
feedparser = StrictFeedParser(baseuri, baselang, 'utf-8')
|
||||
feedparser.resolve_relative_uris = resolve_relative_uris
|
||||
feedparser.sanitize_html = sanitize_html
|
||||
use_strict_parser = False
|
||||
feed_parser: Union[JSONParser, StrictFeedParser, LooseFeedParser]
|
||||
if use_json_parser:
|
||||
result['version'] = None
|
||||
feed_parser = JSONParser(baseuri, baselang, 'utf-8')
|
||||
try:
|
||||
feed_parser.feed(data)
|
||||
except Exception as e:
|
||||
result['bozo'] = 1
|
||||
result['bozo_exception'] = e
|
||||
elif use_strict_parser:
|
||||
# Initialize the SAX parser.
|
||||
feed_parser = StrictFeedParser(baseuri, baselang, 'utf-8')
|
||||
feed_parser.resolve_relative_uris = resolve_relative_uris
|
||||
feed_parser.sanitize_html = sanitize_html
|
||||
saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
|
||||
saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
|
||||
try:
|
||||
# disable downloading external doctype references, if possible
|
||||
# Disable downloading external doctype references, if possible.
|
||||
saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
|
||||
except xml.sax.SAXNotSupportedException:
|
||||
pass
|
||||
saxparser.setContentHandler(feedparser)
|
||||
saxparser.setErrorHandler(feedparser)
|
||||
saxparser.setContentHandler(feed_parser)
|
||||
saxparser.setErrorHandler(feed_parser)
|
||||
source = xml.sax.xmlreader.InputSource()
|
||||
source.setByteStream(io.BytesIO(data))
|
||||
try:
|
||||
saxparser.parse(source)
|
||||
except xml.sax.SAXException as e:
|
||||
result['bozo'] = 1
|
||||
result['bozo_exception'] = feedparser.exc or e
|
||||
use_strict_parser = 0
|
||||
if not use_strict_parser:
|
||||
feedparser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
|
||||
feedparser.resolve_relative_uris = resolve_relative_uris
|
||||
feedparser.sanitize_html = sanitize_html
|
||||
feedparser.feed(data.decode('utf-8', 'replace'))
|
||||
result['feed'] = feedparser.feeddata
|
||||
result['entries'] = feedparser.entries
|
||||
result['version'] = result['version'] or feedparser.version
|
||||
result['namespaces'] = feedparser.namespaces_in_use
|
||||
result['bozo_exception'] = feed_parser.exc or e
|
||||
use_strict_parser = False
|
||||
|
||||
# The loose XML parser will be tried if the JSON parser was not used,
|
||||
# and if the strict XML parser was not used (or if it failed).
|
||||
if not use_json_parser and not use_strict_parser:
|
||||
feed_parser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
|
||||
feed_parser.resolve_relative_uris = resolve_relative_uris
|
||||
feed_parser.sanitize_html = sanitize_html
|
||||
feed_parser.feed(data.decode('utf-8', 'replace'))
|
||||
|
||||
result['feed'] = feed_parser.feeddata
|
||||
result['entries'] = feed_parser.entries
|
||||
result['version'] = result['version'] or feed_parser.version
|
||||
if isinstance(feed_parser, JSONParser):
|
||||
result['namespaces'] = {}
|
||||
else:
|
||||
result['namespaces'] = feed_parser.namespaces_in_use
|
||||
return result
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
@ -25,6 +25,8 @@
|
|||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
from time import struct_time
|
||||
from typing import Callable, List, Optional
|
||||
from .asctime import _parse_date_asctime
|
||||
from .greek import _parse_date_greek
|
||||
from .hungarian import _parse_date_hungarian
|
||||
|
@ -34,7 +36,7 @@ from .perforce import _parse_date_perforce
|
|||
from .rfc822 import _parse_date_rfc822
|
||||
from .w3dtf import _parse_date_w3dtf
|
||||
|
||||
_date_handlers = []
|
||||
_date_handlers: List[Callable[[str], Optional[struct_time]]] = []
|
||||
|
||||
|
||||
def registerDateHandler(func):
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
@ -68,15 +68,7 @@ _iso8601_re = [
|
|||
+ r'(\.(?P<fracsecond>\d+))?'
|
||||
+ r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
|
||||
for tmpl in _iso8601_tmpl]
|
||||
try:
|
||||
del tmpl
|
||||
except NameError:
|
||||
pass
|
||||
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
|
||||
try:
|
||||
del regex
|
||||
except NameError:
|
||||
pass
|
||||
|
||||
|
||||
def _parse_date_iso8601(date_string):
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
@ -25,7 +25,7 @@
|
|||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import email._parseaddr
|
||||
import email.utils
|
||||
import re
|
||||
import time
|
||||
|
||||
|
@ -41,6 +41,6 @@ def _parse_date_perforce(date_string):
|
|||
dow, year, month, day, hour, minute, second, tz = m.groups()
|
||||
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
|
||||
new_date_string = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
|
||||
tm = email._parseaddr.parsedate_tz(new_date_string)
|
||||
tm = email.utils.parsedate_tz(new_date_string)
|
||||
if tm:
|
||||
return time.gmtime(email._parseaddr.mktime_tz(tm))
|
||||
return time.gmtime(email.utils.mktime_tz(tm))
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# Character encoding routines
|
||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
@ -26,17 +26,16 @@
|
|||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import cgi
|
||||
import codecs
|
||||
import re
|
||||
import typing as t
|
||||
|
||||
try:
|
||||
try:
|
||||
import cchardet as chardet
|
||||
import cchardet as chardet # type: ignore[import]
|
||||
except ImportError:
|
||||
import chardet
|
||||
import chardet # type: ignore[no-redef]
|
||||
except ImportError:
|
||||
chardet = None
|
||||
lazy_chardet_encoding = None
|
||||
else:
|
||||
def lazy_chardet_encoding(data):
|
||||
|
@ -68,6 +67,30 @@ RE_XML_DECLARATION = re.compile(r'^<\?xml[^>]*?>')
|
|||
RE_XML_PI_ENCODING = re.compile(br'^<\?.*encoding=[\'"](.*?)[\'"].*\?>')
|
||||
|
||||
|
||||
def parse_content_type(line: str) -> t.Tuple[str, str]:
    """Parse an HTTP Content-Type header.

    The return value will be a tuple of strings:
    the MIME type, and the value of the "charset" (if any).

    This is a custom replacement for Python's cgi.parse_header().
    The cgi module will be removed in Python 3.13.

    :param line:
        The raw header value, e.g. ``text/xml; charset="utf-8"``.
        An empty string is accepted.
    :return:
        ``(mime_type, charset)``. The MIME type is the text before the first
        ``;`` with surrounding whitespace removed (its case is preserved).
        The charset is an empty string when no ``charset`` parameter exists;
        surrounding whitespace and single/double quotes are stripped from it.
        If ``charset`` appears more than once, the last occurrence wins.
    """

    # str.split() always returns at least one element (even for ""), so the
    # first chunk is always the MIME type and no emptiness guard is needed.
    mime_type, *parameters = line.split(";")

    charset_value = ""
    for parameter in parameters:
        key, _, value = parameter.partition("=")
        # Parameter names are matched case-insensitively.
        if key.strip().lower() == "charset":
            charset_value = value.strip().strip("\"'")

    return mime_type.strip(), charset_value
|
||||
|
||||
|
||||
def convert_to_utf8(http_headers, data, result):
|
||||
"""Detect and convert the character encoding to UTF-8.
|
||||
|
||||
|
@ -156,10 +179,7 @@ def convert_to_utf8(http_headers, data, result):
|
|||
try:
|
||||
if bom_encoding:
|
||||
tempdata = data.decode(bom_encoding).encode('utf-8')
|
||||
except (UnicodeDecodeError, LookupError):
|
||||
# feedparser recognizes UTF-32 encodings that aren't
|
||||
# available in Python 2.4 and 2.5, so it's possible to
|
||||
# encounter a LookupError during decoding.
|
||||
except UnicodeDecodeError:
|
||||
xml_encoding_match = None
|
||||
else:
|
||||
xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
|
||||
|
@ -181,15 +201,14 @@ def convert_to_utf8(http_headers, data, result):
|
|||
# XML declaration encoding, and HTTP encoding, following the
|
||||
# heuristic defined in RFC 3023.
|
||||
http_content_type = http_headers.get('content-type') or ''
|
||||
http_content_type, params = cgi.parse_header(http_content_type)
|
||||
http_encoding = params.get('charset', '').replace("'", "")
|
||||
if isinstance(http_encoding, bytes):
|
||||
http_encoding = http_encoding.decode('utf-8', 'ignore')
|
||||
http_content_type, http_encoding = parse_content_type(http_content_type)
|
||||
|
||||
acceptable_content_type = 0
|
||||
application_content_types = ('application/xml', 'application/xml-dtd',
|
||||
'application/xml-external-parsed-entity')
|
||||
text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
|
||||
json_content_types = ('application/feed+json', 'application/json')
|
||||
json = False
|
||||
if (
|
||||
http_content_type in application_content_types
|
||||
or (
|
||||
|
@ -208,6 +227,17 @@ def convert_to_utf8(http_headers, data, result):
|
|||
):
|
||||
acceptable_content_type = 1
|
||||
rfc3023_encoding = http_encoding or 'us-ascii'
|
||||
elif (
|
||||
http_content_type in json_content_types
|
||||
or (
|
||||
not http_content_type
|
||||
and data and data.lstrip()[0] == '{'
|
||||
)
|
||||
):
|
||||
http_content_type = json_content_types[0]
|
||||
acceptable_content_type = 1
|
||||
json = True
|
||||
rfc3023_encoding = http_encoding or 'utf-8' # RFC 7159, 8.1.
|
||||
elif http_content_type.startswith('text/'):
|
||||
rfc3023_encoding = http_encoding or 'us-ascii'
|
||||
elif http_headers and 'content-type' not in http_headers:
|
||||
|
@ -230,7 +260,7 @@ def convert_to_utf8(http_headers, data, result):
|
|||
|
||||
if http_headers and (not acceptable_content_type):
|
||||
if 'content-type' in http_headers:
|
||||
msg = '%s is not an XML media type' % http_headers['content-type']
|
||||
msg = '%s is not an accepted media type' % http_headers['content-type']
|
||||
else:
|
||||
msg = 'no Content-type specified'
|
||||
error = NonXMLContentType(msg)
|
||||
|
@ -254,12 +284,13 @@ def convert_to_utf8(http_headers, data, result):
|
|||
pass
|
||||
else:
|
||||
known_encoding = 1
|
||||
# Update the encoding in the opening XML processing instruction.
|
||||
new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
|
||||
if RE_XML_DECLARATION.search(data):
|
||||
data = RE_XML_DECLARATION.sub(new_declaration, data)
|
||||
else:
|
||||
data = new_declaration + '\n' + data
|
||||
if not json:
|
||||
# Update the encoding in the opening XML processing instruction.
|
||||
new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
|
||||
if RE_XML_DECLARATION.search(data):
|
||||
data = RE_XML_DECLARATION.sub(new_declaration, data)
|
||||
else:
|
||||
data = new_declaration + '\n' + data
|
||||
data = data.encode('utf-8')
|
||||
break
|
||||
# if still no luck, give up
|
||||
|
@ -275,6 +306,7 @@ def convert_to_utf8(http_headers, data, result):
|
|||
(rfc3023_encoding, proposed_encoding))
|
||||
rfc3023_encoding = proposed_encoding
|
||||
|
||||
result['content-type'] = http_content_type # for selecting the parser
|
||||
result['encoding'] = rfc3023_encoding
|
||||
if error:
|
||||
result['bozo'] = True
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# Exceptions used throughout feedparser
|
||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
@ -27,7 +27,7 @@
|
|||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
__all__ = [
|
||||
'ThingsNobodyCaresAboutButMe',
|
||||
'FeedparserError',
|
||||
'CharacterEncodingOverride',
|
||||
'CharacterEncodingUnknown',
|
||||
'NonXMLContentType',
|
||||
|
@ -35,19 +35,19 @@ __all__ = [
|
|||
]
|
||||
|
||||
|
||||
class ThingsNobodyCaresAboutButMe(Exception):
|
||||
class FeedparserError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe):
|
||||
class CharacterEncodingOverride(FeedparserError):
|
||||
pass
|
||||
|
||||
|
||||
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe):
|
||||
class CharacterEncodingUnknown(FeedparserError):
|
||||
pass
|
||||
|
||||
|
||||
class NonXMLContentType(ThingsNobodyCaresAboutButMe):
|
||||
class NonXMLContentType(FeedparserError):
|
||||
pass
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
@ -61,7 +61,7 @@ _cp1252 = {
|
|||
}
|
||||
|
||||
|
||||
class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
|
||||
class BaseHTMLProcessor(sgmllib.SGMLParser):
|
||||
special = re.compile("""[<>'"]""")
|
||||
bare_ampersand = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
|
||||
elements_no_end_tag = {
|
||||
|
@ -91,11 +91,11 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
|
|||
self.encoding = encoding
|
||||
self._type = _type
|
||||
self.pieces = []
|
||||
super(_BaseHTMLProcessor, self).__init__()
|
||||
super().__init__()
|
||||
|
||||
def reset(self):
|
||||
self.pieces = []
|
||||
super(_BaseHTMLProcessor, self).reset()
|
||||
super().reset()
|
||||
|
||||
def _shorttag_replace(self, match):
|
||||
"""
|
||||
|
@ -118,23 +118,13 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
|
|||
raise NotImplementedError
|
||||
|
||||
# Replace goahead with SGMLParser's goahead() code object.
|
||||
try:
|
||||
goahead.__code__ = sgmllib.SGMLParser.goahead.__code__
|
||||
except AttributeError:
|
||||
# Python 2
|
||||
# noinspection PyUnresolvedReferences
|
||||
goahead.func_code = sgmllib.SGMLParser.goahead.func_code
|
||||
goahead.__code__ = sgmllib.SGMLParser.goahead.__code__
|
||||
|
||||
def __parse_starttag(self, i):
|
||||
raise NotImplementedError
|
||||
|
||||
# Replace __parse_starttag with SGMLParser's parse_starttag() code object.
|
||||
try:
|
||||
__parse_starttag.__code__ = sgmllib.SGMLParser.parse_starttag.__code__
|
||||
except AttributeError:
|
||||
# Python 2
|
||||
# noinspection PyUnresolvedReferences
|
||||
__parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code
|
||||
__parse_starttag.__code__ = sgmllib.SGMLParser.parse_starttag.__code__
|
||||
|
||||
def parse_starttag(self, i):
|
||||
j = self.__parse_starttag(i)
|
||||
|
@ -153,8 +143,8 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
|
|||
data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
|
||||
data = data.replace(''', "'")
|
||||
data = data.replace('"', '"')
|
||||
super(_BaseHTMLProcessor, self).feed(data)
|
||||
super(_BaseHTMLProcessor, self).close()
|
||||
super().feed(data)
|
||||
super().close()
|
||||
|
||||
@staticmethod
|
||||
def normalize_attrs(attrs):
|
||||
|
@ -315,8 +305,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
|
|||
# self.updatepos(declstartpos, i)
|
||||
return None, -1
|
||||
|
||||
@staticmethod
|
||||
def convert_charref(name):
|
||||
def convert_charref(self, name):
|
||||
"""
|
||||
:type name: str
|
||||
:rtype: str
|
||||
|
@ -324,8 +313,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
|
|||
|
||||
return '&#%s;' % name
|
||||
|
||||
@staticmethod
|
||||
def convert_entityref(name):
|
||||
def convert_entityref(self, name):
|
||||
"""
|
||||
:type name: str
|
||||
:rtype: str
|
||||
|
@ -349,7 +337,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
|
|||
|
||||
try:
|
||||
return sgmllib.SGMLParser.parse_declaration(self, i)
|
||||
except sgmllib.SGMLParseError:
|
||||
except (AssertionError, sgmllib.SGMLParseError):
|
||||
# Escape the doctype declaration and continue parsing.
|
||||
self.handle_data('<')
|
||||
return i+1
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
@ -44,7 +44,7 @@ from .urls import convert_to_idn
|
|||
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
|
||||
|
||||
|
||||
class _FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPRedirectHandler, urllib.request.HTTPDefaultErrorHandler):
|
||||
class URLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPRedirectHandler, urllib.request.HTTPDefaultErrorHandler):
|
||||
def http_error_default(self, req, fp, code, msg, headers):
|
||||
# The default implementation just raises HTTPError.
|
||||
# Forget that.
|
||||
|
@ -53,6 +53,8 @@ class _FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPR
|
|||
|
||||
def http_error_301(self, req, fp, code, msg, hdrs):
|
||||
result = urllib.request.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, hdrs)
|
||||
if not result:
|
||||
return fp
|
||||
result.status = code
|
||||
result.newurl = result.geturl()
|
||||
return result
|
||||
|
@ -78,7 +80,7 @@ class _FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPR
|
|||
host = urllib.parse.urlparse(req.get_full_url())[1]
|
||||
if 'Authorization' not in req.headers or 'WWW-Authenticate' not in headers:
|
||||
return self.http_error_default(req, fp, code, msg, headers)
|
||||
auth = base64.decodebytes(req.headers['Authorization'].split(' ')[1].encode('utf8'))
|
||||
auth = base64.decodebytes(req.headers['Authorization'].split(' ')[1].encode()).decode()
|
||||
user, passw = auth.split(':')
|
||||
realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
|
||||
self.add_password(realm, host, user, passw)
|
||||
|
@ -145,15 +147,26 @@ def get(url, etag=None, modified=None, agent=None, referrer=None, handlers=None,
|
|||
if url_pieces.port:
|
||||
new_pieces[1] = f'{url_pieces.hostname}:{url_pieces.port}'
|
||||
url = urllib.parse.urlunparse(new_pieces)
|
||||
auth = base64.standard_b64encode(f'{url_pieces.username}:{url_pieces.password}').strip()
|
||||
auth = base64.standard_b64encode(f'{url_pieces.username}:{url_pieces.password}'.encode()).decode()
|
||||
|
||||
# iri support
|
||||
if not isinstance(url, bytes):
|
||||
url = convert_to_idn(url)
|
||||
|
||||
# Prevent UnicodeEncodeErrors caused by Unicode characters in the path.
|
||||
bits = []
|
||||
for c in url:
|
||||
try:
|
||||
c.encode('ascii')
|
||||
except UnicodeEncodeError:
|
||||
bits.append(urllib.parse.quote(c))
|
||||
else:
|
||||
bits.append(c)
|
||||
url = ''.join(bits)
|
||||
|
||||
# try to open with urllib2 (to use optional headers)
|
||||
request = _build_urllib2_request(url, agent, ACCEPT_HEADER, etag, modified, referrer, auth, request_headers)
|
||||
opener = urllib.request.build_opener(*tuple(handlers + [_FeedURLHandler()]))
|
||||
opener = urllib.request.build_opener(*tuple(handlers + [URLHandler()]))
|
||||
opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
|
||||
f = opener.open(request)
|
||||
data = f.read()
|
||||
|
@ -203,7 +216,7 @@ def get(url, etag=None, modified=None, agent=None, referrer=None, handlers=None,
|
|||
result['href'] = f.url.decode('utf-8', 'ignore')
|
||||
else:
|
||||
result['href'] = f.url
|
||||
result['status'] = getattr(f, 'status', 200)
|
||||
result['status'] = getattr(f, 'status', None) or 200
|
||||
|
||||
# Stop processing if the server sent HTTP 304 Not Modified.
|
||||
if getattr(f, 'code', 0) == 304:
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
@ -30,16 +30,17 @@ import binascii
|
|||
import copy
|
||||
import html.entities
|
||||
import re
|
||||
from typing import Dict
|
||||
import xml.sax.saxutils
|
||||
|
||||
from .html import _cp1252
|
||||
from .namespaces import _base, cc, dc, georss, itunes, mediarss, psc
|
||||
from .sanitizer import _sanitize_html, _HTMLSanitizer
|
||||
from .sanitizer import sanitize_html, HTMLSanitizer
|
||||
from .util import FeedParserDict
|
||||
from .urls import _urljoin, make_safe_absolute_uri, resolve_relative_uris
|
||||
|
||||
|
||||
class _FeedParserMixin(
|
||||
class XMLParserMixin(
|
||||
_base.Namespace,
|
||||
cc.Namespace,
|
||||
dc.Namespace,
|
||||
|
@ -118,7 +119,7 @@ class _FeedParserMixin(
|
|||
'http://www.w3.org/XML/1998/namespace': 'xml',
|
||||
'http://podlove.org/simple-chapters': 'psc',
|
||||
}
|
||||
_matchnamespaces = {}
|
||||
_matchnamespaces: Dict[str, str] = {}
|
||||
|
||||
can_be_relative_uri = {
|
||||
'comments',
|
||||
|
@ -170,6 +171,8 @@ class _FeedParserMixin(
|
|||
self.entries = [] # list of entry-level data
|
||||
self.version = '' # feed type/version, see SUPPORTED_VERSIONS
|
||||
self.namespaces_in_use = {} # dictionary of namespaces defined by the feed
|
||||
self.resolve_relative_uris = False
|
||||
self.sanitize_html = False
|
||||
|
||||
# the following are used internally to track state;
|
||||
# this is really out of control and should be refactored
|
||||
|
@ -193,6 +196,7 @@ class _FeedParserMixin(
|
|||
self.svgOK = 0
|
||||
self.title_depth = -1
|
||||
self.depth = 0
|
||||
self.hasContent = 0
|
||||
if self.lang:
|
||||
self.feeddata['language'] = self.lang.replace('_', '-')
|
||||
|
||||
|
@ -204,7 +208,7 @@ class _FeedParserMixin(
|
|||
# },
|
||||
# }
|
||||
self.property_depth_map = {}
|
||||
super(_FeedParserMixin, self).__init__()
|
||||
super(XMLParserMixin, self).__init__()
|
||||
|
||||
def _normalize_attributes(self, kv):
|
||||
raise NotImplementedError
|
||||
|
@ -506,9 +510,7 @@ class _FeedParserMixin(
|
|||
if base64 and self.contentparams.get('base64', 0):
|
||||
try:
|
||||
output = base64.decodebytes(output.encode('utf8')).decode('utf8')
|
||||
except binascii.Error:
|
||||
pass
|
||||
except binascii.Incomplete:
|
||||
except (binascii.Error, binascii.Incomplete, UnicodeDecodeError):
|
||||
pass
|
||||
|
||||
# resolve relative URIs
|
||||
|
@ -546,7 +548,7 @@ class _FeedParserMixin(
|
|||
# sanitize embedded markup
|
||||
if is_htmlish and self.sanitize_html:
|
||||
if element in self.can_contain_dangerous_markup:
|
||||
output = _sanitize_html(output, self.encoding, self.contentparams.get('type', 'text/html'))
|
||||
output = sanitize_html(output, self.encoding, self.contentparams.get('type', 'text/html'))
|
||||
|
||||
if self.encoding and isinstance(output, bytes):
|
||||
output = output.decode(self.encoding, 'ignore')
|
||||
|
@ -648,7 +650,7 @@ class _FeedParserMixin(
|
|||
return False
|
||||
|
||||
# all tags must be in a restricted subset of valid HTML tags
|
||||
if any((t for t in re.findall(r'</?(\w+)', s) if t.lower() not in _HTMLSanitizer.acceptable_elements)):
|
||||
if any((t for t in re.findall(r'</?(\w+)', s) if t.lower() not in HTMLSanitizer.acceptable_elements)):
|
||||
return False
|
||||
|
||||
# all entities must have been defined as valid HTML entities
|
||||
|
@ -744,7 +746,7 @@ class _FeedParserMixin(
|
|||
author, email = context.get(key), None
|
||||
if not author:
|
||||
return
|
||||
emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
|
||||
emailmatch = re.search(r"(([a-zA-Z0-9_.+-]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(]?))(\?subject=\S+)?", author)
|
||||
if emailmatch:
|
||||
email = emailmatch.group(0)
|
||||
# probably a better way to do the following, but it passes
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# Support for the Atom, RSS, RDF, and CDF feed formats
|
||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
@ -259,6 +259,7 @@ class Namespace(object):
|
|||
def _end_item(self):
|
||||
self.pop('item')
|
||||
self.inentry = 0
|
||||
self.hasContent = 0
|
||||
_end_entry = _end_item
|
||||
|
||||
def _start_language(self, attrs_d):
|
||||
|
@ -388,7 +389,7 @@ class Namespace(object):
|
|||
|
||||
def _start_description(self, attrs_d):
|
||||
context = self._get_context()
|
||||
if 'summary' in context:
|
||||
if 'summary' in context and not self.hasContent:
|
||||
self._summaryKey = 'content'
|
||||
self._start_content(attrs_d)
|
||||
else:
|
||||
|
@ -429,7 +430,7 @@ class Namespace(object):
|
|||
|
||||
def _start_summary(self, attrs_d):
|
||||
context = self._get_context()
|
||||
if 'summary' in context:
|
||||
if 'summary' in context and not self.hasContent:
|
||||
self._summaryKey = 'content'
|
||||
self._start_content(attrs_d)
|
||||
else:
|
||||
|
@ -466,6 +467,7 @@ class Namespace(object):
|
|||
self.sourcedata.clear()
|
||||
|
||||
def _start_content(self, attrs_d):
|
||||
self.hasContent = 1
|
||||
self.push_content('content', attrs_d, 'text/plain', 1)
|
||||
src = attrs_d.get('src')
|
||||
if src:
|
||||
|
@ -477,6 +479,7 @@ class Namespace(object):
|
|||
_start_xhtml_body = _start_body
|
||||
|
||||
def _start_content_encoded(self, attrs_d):
|
||||
self.hasContent = 1
|
||||
self.push_content('content', attrs_d, 'text/html', 1)
|
||||
_start_fullitem = _start_content_encoded
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# Support for the administrative elements extension
|
||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# Support for the Creative Commons licensing extensions
|
||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# Support for the Dublin Core metadata extensions
|
||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# Support for the GeoRSS format
|
||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
@ -91,6 +91,8 @@ class Namespace(object):
|
|||
except ValueError:
|
||||
srs_dimension = 2
|
||||
context = self._get_context()
|
||||
if 'where' not in context:
|
||||
context['where'] = {}
|
||||
context['where']['srsName'] = srs_name
|
||||
context['where']['srsDimension'] = srs_dimension
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# Support for the iTunes format
|
||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# Support for the Media RSS format
|
||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# Support for the Podlove Simple Chapters format
|
||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
|
133
lib/feedparser/parsers/json.py
Normal file
133
lib/feedparser/parsers/json.py
Normal file
|
@ -0,0 +1,133 @@
|
|||
# The JSON feed parser
|
||||
# Copyright 2017 Beat Bolli
|
||||
# All rights reserved.
|
||||
#
|
||||
# This file is a part of feedparser.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without modification,
|
||||
# are permitted provided that the following conditions are met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import json
|
||||
|
||||
from ..datetimes import _parse_date
|
||||
from ..sanitizer import sanitize_html
|
||||
from ..util import FeedParserDict
|
||||
|
||||
|
||||
class JSONParser:
    """Parse a JSON Feed document into feedparser-style feed and entry data.

    After :meth:`feed` returns, ``version`` holds the short version name,
    ``feeddata`` holds the feed-level values and ``entries`` holds one
    :class:`FeedParserDict` per element of the document's ``items`` list.
    """

    # Maps the document's ``version`` URL to the short name stored in
    # ``self.version``.
    VERSIONS = {
        'https://jsonfeed.org/version/1': 'json1',
        'https://jsonfeed.org/version/1.1': 'json11',
    }
    # (JSON Feed key, result key) pairs copied as-is at the feed level.
    FEED_FIELDS = (
        ('title', 'title'),
        ('icon', 'image'),
        ('home_page_url', 'link'),
        ('description', 'description'),
    )
    # (JSON Feed key, result key) pairs copied as-is for every item.
    ITEM_FIELDS = (
        ('title', 'title'),
        ('id', 'guid'),
        ('url', 'link'),
        ('summary', 'summary'),
        ('external_url', 'source'),
    )

    def __init__(self, baseuri=None, baselang=None, encoding=None):
        """Store the parsing context and initialise empty result containers.

        :param baseuri: Base URI of the feed; stored as ``''`` when falsy.
        :param baselang: Base language of the feed; stored as ``None`` when falsy.
        :param encoding: Character encoding; defaults to ``'utf-8'`` when falsy.
        """
        self.baseuri = baseuri or ''
        self.lang = baselang or None
        self.encoding = encoding or 'utf-8'  # character encoding

        self.version = None
        self.feeddata = FeedParserDict()
        self.namespacesInUse = []
        self.entries = []

    def feed(self, data):
        """Decode a JSON Feed document and populate the parser's results.

        :param data: The serialized JSON document, as accepted by ``json.loads``.
        :raises ValueError: If the ``version`` value is not a key of ``VERSIONS``.
        :raises KeyError: If the document has no ``items`` member.
        """
        data = json.loads(data)

        v = data.get('version', '')
        try:
            self.version = self.VERSIONS[v]
        except KeyError:
            raise ValueError("Unrecognized JSONFeed version '%s'" % v)

        for src, dst in self.FEED_FIELDS:
            if src in data:
                self.feeddata[dst] = data[src]
        if 'author' in data:
            self.parse_author(data['author'], self.feeddata)
        # TODO: hubs; expired has no RSS equivalent

        self.entries = [self.parse_entry(e) for e in data['items']]

    def parse_entry(self, e):
        """Convert one JSON Feed item into a :class:`FeedParserDict` entry.

        :param e: The decoded item mapping.
        :return: The populated entry.
        """
        entry = FeedParserDict()
        for src, dst in self.ITEM_FIELDS:
            if src in e:
                entry[dst] = e[src]

        # Plain-text content takes precedence; HTML content is sanitized
        # before it is stored.
        if 'content_text' in e:
            entry['content'] = c = FeedParserDict()
            c['value'] = e['content_text']
            c['type'] = 'text'
        elif 'content_html' in e:
            entry['content'] = c = FeedParserDict()
            c['value'] = sanitize_html(e['content_html'], self.encoding, 'application/json')
            c['type'] = 'html'

        if 'date_published' in e:
            entry['published'] = e['date_published']
            entry['published_parsed'] = _parse_date(e['date_published'])
        # JSON Feed names this field "date_modified". Testing the same key
        # that is read below ensures the updated values are actually set and
        # avoids a KeyError caused by a mismatched guard key.
        if 'date_modified' in e:
            entry['updated'] = e['date_modified']
            entry['updated_parsed'] = _parse_date(e['date_modified'])

        if 'tags' in e:
            entry['category'] = e['tags']

        if 'author' in e:
            self.parse_author(e['author'], entry)

        if 'attachments' in e:
            entry['enclosures'] = [self.parse_attachment(a) for a in e['attachments']]

        return entry

    @staticmethod
    def parse_author(parent, dest):
        """Copy author information from ``parent`` into ``dest``.

        ``dest['author_detail']`` is always created. A ``name`` is stored in
        both ``dest['author']`` and the detail; a ``url`` beginning with
        ``mailto:`` is stored as ``email`` (prefix removed), any other
        ``url`` is stored as ``href``.

        :param parent: The decoded author mapping.
        :param dest: The mapping (feed data or entry) that receives the result.
        """
        dest['author_detail'] = detail = FeedParserDict()
        if 'name' in parent:
            dest['author'] = detail['name'] = parent['name']
        if 'url' in parent:
            if parent['url'].startswith('mailto:'):
                # Drop the 7-character "mailto:" scheme prefix.
                detail['email'] = parent['url'][7:]
            else:
                detail['href'] = parent['url']

    @staticmethod
    def parse_attachment(attachment):
        """Convert one JSON Feed attachment into an enclosure mapping.

        :param attachment: The decoded attachment mapping; ``url`` and
            ``mime_type`` are read unconditionally.
        :return: A :class:`FeedParserDict` with ``href``, ``type`` and, when
            ``size_in_bytes`` is present, ``length``.
        :raises KeyError: If ``url`` or ``mime_type`` is missing.
        """
        enc = FeedParserDict()
        enc['href'] = attachment['url']
        enc['type'] = attachment['mime_type']
        if 'size_in_bytes' in attachment:
            enc['length'] = attachment['size_in_bytes']
        return enc
|
|
@ -1,5 +1,5 @@
|
|||
# The loose feed parser that interfaces with an SGML parsing library
|
||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
@ -26,7 +26,7 @@
|
|||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
class _LooseFeedParser(object):
|
||||
class LooseXMLParser:
|
||||
contentparams = None
|
||||
|
||||
def __init__(self, baseuri=None, baselang=None, encoding=None, entities=None):
|
||||
|
@ -34,7 +34,7 @@ class _LooseFeedParser(object):
|
|||
self.lang = baselang or None
|
||||
self.encoding = encoding or 'utf-8' # character encoding
|
||||
self.entities = entities or {}
|
||||
super(_LooseFeedParser, self).__init__()
|
||||
super().__init__()
|
||||
|
||||
@staticmethod
|
||||
def _normalize_attributes(kv):
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# The strict feed parser that interfaces with an XML parsing library
|
||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
@ -29,7 +29,7 @@
|
|||
from ..exceptions import UndeclaredNamespace
|
||||
|
||||
|
||||
class _StrictFeedParser(object):
|
||||
class StrictXMLParser:
|
||||
def __init__(self, baseuri, baselang, encoding):
|
||||
self.bozo = 0
|
||||
self.exc = None
|
||||
|
@ -37,7 +37,7 @@ class _StrictFeedParser(object):
|
|||
self.baseuri = baseuri or ''
|
||||
self.lang = baselang
|
||||
self.encoding = encoding
|
||||
super(_StrictFeedParser, self).__init__()
|
||||
super(StrictXMLParser, self).__init__()
|
||||
|
||||
@staticmethod
|
||||
def _normalize_attributes(kv):
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
@ -27,11 +27,11 @@
|
|||
|
||||
import re
|
||||
|
||||
from .html import _BaseHTMLProcessor
|
||||
from .html import BaseHTMLProcessor
|
||||
from .urls import make_safe_absolute_uri
|
||||
|
||||
|
||||
class _HTMLSanitizer(_BaseHTMLProcessor):
|
||||
class HTMLSanitizer(BaseHTMLProcessor):
|
||||
acceptable_elements = {
|
||||
'a',
|
||||
'abbr',
|
||||
|
@ -732,14 +732,14 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
|
|||
}
|
||||
|
||||
def __init__(self, encoding=None, _type='application/xhtml+xml'):
|
||||
super(_HTMLSanitizer, self).__init__(encoding, _type)
|
||||
super().__init__(encoding, _type)
|
||||
|
||||
self.unacceptablestack = 0
|
||||
self.mathmlOK = 0
|
||||
self.svgOK = 0
|
||||
|
||||
def reset(self):
|
||||
super(_HTMLSanitizer, self).reset()
|
||||
super().reset()
|
||||
self.unacceptablestack = 0
|
||||
self.mathmlOK = 0
|
||||
self.svgOK = 0
|
||||
|
@ -805,7 +805,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
|
|||
if key == 'href':
|
||||
value = make_safe_absolute_uri(value)
|
||||
clean_attrs.append((key, value))
|
||||
super(_HTMLSanitizer, self).unknown_starttag(tag, clean_attrs)
|
||||
super().unknown_starttag(tag, clean_attrs)
|
||||
|
||||
def unknown_endtag(self, tag):
|
||||
if tag not in self.acceptable_elements:
|
||||
|
@ -820,7 +820,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
|
|||
self.svgOK -= 1
|
||||
else:
|
||||
return
|
||||
super(_HTMLSanitizer, self).unknown_endtag(tag)
|
||||
super().unknown_endtag(tag)
|
||||
|
||||
def handle_pi(self, text):
|
||||
pass
|
||||
|
@ -830,7 +830,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
|
|||
|
||||
def handle_data(self, text):
|
||||
if not self.unacceptablestack:
|
||||
super(_HTMLSanitizer, self).handle_data(text)
|
||||
super().handle_data(text)
|
||||
|
||||
def sanitize_style(self, style):
|
||||
# disallow urls
|
||||
|
@ -865,7 +865,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
|
|||
return ' '.join(clean)
|
||||
|
||||
def parse_comment(self, i, report=1):
|
||||
ret = super(_HTMLSanitizer, self).parse_comment(i, report)
|
||||
ret = super().parse_comment(i, report)
|
||||
if ret >= 0:
|
||||
return ret
|
||||
# if ret == -1, this may be a malicious attempt to circumvent
|
||||
|
@ -877,8 +877,8 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
|
|||
return len(self.rawdata)
|
||||
|
||||
|
||||
def _sanitize_html(html_source, encoding, _type):
|
||||
p = _HTMLSanitizer(encoding, _type)
|
||||
def sanitize_html(html_source, encoding, _type):
|
||||
p = HTMLSanitizer(encoding, _type)
|
||||
html_source = html_source.replace('<![CDATA[', '<![CDATA[')
|
||||
p.feed(html_source)
|
||||
data = p.output()
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
@ -27,7 +27,7 @@
|
|||
|
||||
import re
|
||||
|
||||
import sgmllib
|
||||
import sgmllib # type: ignore[import]
|
||||
|
||||
__all__ = [
|
||||
'sgmllib',
|
||||
|
@ -82,7 +82,7 @@ class _EndBracketRegEx:
|
|||
match = self.endbracket.match(target, index)
|
||||
if match is not None:
|
||||
# Returning a new object in the calling thread's context
|
||||
# resolves a thread-safety.
|
||||
# resolves a thread-safety issue.
|
||||
return EndBracketMatch(match)
|
||||
return None
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
@ -28,7 +28,7 @@
|
|||
import re
|
||||
import urllib.parse
|
||||
|
||||
from .html import _BaseHTMLProcessor
|
||||
from .html import BaseHTMLProcessor
|
||||
|
||||
# If you want feedparser to allow all URL schemes, set this to ()
|
||||
# List culled from Python's urlparse documentation at:
|
||||
|
@ -103,7 +103,7 @@ def make_safe_absolute_uri(base, rel=None):
|
|||
return uri
|
||||
|
||||
|
||||
class RelativeURIResolver(_BaseHTMLProcessor):
|
||||
class RelativeURIResolver(BaseHTMLProcessor):
|
||||
relative_uris = {
|
||||
('a', 'href'),
|
||||
('applet', 'codebase'),
|
||||
|
@ -137,7 +137,7 @@ class RelativeURIResolver(_BaseHTMLProcessor):
|
|||
}
|
||||
|
||||
def __init__(self, baseuri, encoding, _type):
|
||||
_BaseHTMLProcessor.__init__(self, encoding, _type)
|
||||
BaseHTMLProcessor.__init__(self, encoding, _type)
|
||||
self.baseuri = baseuri
|
||||
|
||||
def resolve_uri(self, uri):
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||
# Copyright 2002-2008 Mark Pilgrim
|
||||
# All rights reserved.
|
||||
#
|
||||
|
@ -48,7 +48,7 @@ class FeedParserDict(dict):
|
|||
'tagline_detail': 'subtitle_detail',
|
||||
}
|
||||
|
||||
def __getitem__(self, key):
|
||||
def __getitem__(self, key, _stacklevel=2):
|
||||
"""
|
||||
:return: A :class:`FeedParserDict`.
|
||||
"""
|
||||
|
@ -59,9 +59,8 @@ class FeedParserDict(dict):
|
|||
except IndexError:
|
||||
raise KeyError("object doesn't have key 'category'")
|
||||
elif key == 'enclosures':
|
||||
norel = lambda link: FeedParserDict([(name, value) for (name, value) in link.items() if name != 'rel'])
|
||||
return [
|
||||
norel(link)
|
||||
FeedParserDict([(name, value) for (name, value) in link.items() if name != 'rel'])
|
||||
for link in dict.__getitem__(self, 'links')
|
||||
if link['rel'] == 'enclosure'
|
||||
]
|
||||
|
@ -84,6 +83,7 @@ class FeedParserDict(dict):
|
|||
"exist. This fallback will be removed in a future version "
|
||||
"of feedparser.",
|
||||
DeprecationWarning,
|
||||
stacklevel=_stacklevel,
|
||||
)
|
||||
return dict.__getitem__(self, 'published')
|
||||
return dict.__getitem__(self, 'updated')
|
||||
|
@ -99,6 +99,7 @@ class FeedParserDict(dict):
|
|||
"`updated_parsed` doesn't exist. This fallback will be "
|
||||
"removed in a future version of feedparser.",
|
||||
DeprecationWarning,
|
||||
stacklevel=_stacklevel,
|
||||
)
|
||||
return dict.__getitem__(self, 'published_parsed')
|
||||
return dict.__getitem__(self, 'updated_parsed')
|
||||
|
@ -119,7 +120,7 @@ class FeedParserDict(dict):
|
|||
# This fix was proposed in issue 328.
|
||||
return dict.__contains__(self, key)
|
||||
try:
|
||||
self.__getitem__(key)
|
||||
self.__getitem__(key, _stacklevel=3)
|
||||
except KeyError:
|
||||
return False
|
||||
else:
|
||||
|
@ -133,7 +134,7 @@ class FeedParserDict(dict):
|
|||
"""
|
||||
|
||||
try:
|
||||
return self.__getitem__(key)
|
||||
return self.__getitem__(key, _stacklevel=3)
|
||||
except KeyError:
|
||||
return default
|
||||
|
||||
|
@ -143,17 +144,11 @@ class FeedParserDict(dict):
|
|||
key = key[0]
|
||||
return dict.__setitem__(self, key, value)
|
||||
|
||||
def setdefault(self, k, default):
|
||||
if k not in self:
|
||||
self[k] = default
|
||||
return default
|
||||
return self[k]
|
||||
|
||||
def __getattr__(self, key):
|
||||
# __getattribute__() is called first; this will be called
|
||||
# only if an attribute was not already found
|
||||
try:
|
||||
return self.__getitem__(key)
|
||||
return self.__getitem__(key, _stacklevel=3)
|
||||
except KeyError:
|
||||
raise AttributeError("object has no attribute '%s'" % key)
|
||||
|
||||
|
|
Loading…
Reference in a new issue