Merge branch 'feature/UpdateFeedparser' into dev

JackDandy 2023-02-09 14:37:12 +00:00
commit b9cfd96e57
32 changed files with 391 additions and 193 deletions

View file

@@ -7,6 +7,7 @@
 * Update Msgpack 1.0.0 (fa7d744) to 1.0.4 (b5acfd5)
 * Update certifi 2022.09.24 to 2022.12.07
 * Update diskcache 5.1.0 (40ce0de) to 5.4.0 (1cb1425)
+* Update feedparser 6.0.1 (98d189fa) to 6.0.10 (5fcb3ae)
 * Update humanize 3.5.0 (b6b0ea5) to 4.0.0 (a1514eb)
 * Update profilehooks module 1.12.0 (3ee1f60) to 1.12.1 (c3fc078)
 * Update Rarfile 4.0 (55fe778) to 4.1a1 (8a72967)

View file

@ -1,4 +1,4 @@
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -32,7 +32,7 @@ from .util import FeedParserDict
__author__ = 'Kurt McKee <contactme@kurtmckee.org>' __author__ = 'Kurt McKee <contactme@kurtmckee.org>'
__license__ = 'BSD 2-clause' __license__ = 'BSD 2-clause'
__version__ = '6.0.1' __version__ = '6.0.10'
# HTTP "User-Agent" header to send to servers when downloading feeds. # HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should # If you are embedding feedparser in a larger application, you should

View file

@@ -1,5 +1,5 @@
 # The public API for feedparser
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -26,7 +26,11 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 
+import datetime
 import io
+import time
+from typing import Dict, List, Union
+import urllib.error
 import urllib.parse
 import xml.sax
@@ -34,13 +38,12 @@ import sgmllib3k as sgmllib
 from .datetimes import registerDateHandler, _parse_date
 from .encodings import convert_to_utf8
-from .exceptions import *
-from .html import _BaseHTMLProcessor
+from .html import BaseHTMLProcessor
 from . import http
-from . import mixin
-from .mixin import _FeedParserMixin
-from .parsers.loose import _LooseFeedParser
-from .parsers.strict import _StrictFeedParser
+from .mixin import XMLParserMixin
+from .parsers.loose import LooseXMLParser
+from .parsers.strict import StrictXMLParser
+from .parsers.json import JSONParser
 from .sanitizer import replace_doctype
 from .urls import convert_to_idn, make_safe_absolute_uri
 from .util import FeedParserDict
@@ -70,6 +73,7 @@ SUPPORTED_VERSIONS = {
     'atom10': 'Atom 1.0',
     'atom': 'Atom (unknown version)',
     'cdf': 'CDF',
+    'json1': 'JSON feed 1',
 }
@@ -136,20 +140,25 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
     return url_file_stream_or_string
 
-LooseFeedParser = type(
-    'LooseFeedParser',
-    (_LooseFeedParser, _FeedParserMixin, _BaseHTMLProcessor, object),
-    {},
-)
+class LooseFeedParser(LooseXMLParser, XMLParserMixin, BaseHTMLProcessor):
+    pass
 
-StrictFeedParser = type(
-    'StrictFeedParser',
-    (_StrictFeedParser, _FeedParserMixin, xml.sax.handler.ContentHandler, object),
-    {},
-)
+class StrictFeedParser(StrictXMLParser, XMLParserMixin, xml.sax.handler.ContentHandler):
+    pass
 
-def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None, resolve_relative_uris=None, sanitize_html=None):
+def parse(
+    url_file_stream_or_string,
+    etag: str = None,
+    modified: Union[str, datetime.datetime, time.struct_time] = None,
+    agent: str = None,
+    referrer: str = None,
+    handlers: List = None,
+    request_headers: Dict[str, str] = None,
+    response_headers: Dict[str, str] = None,
+    resolve_relative_uris: bool = None,
+    sanitize_html: bool = None,
+) -> FeedParserDict:
     """Parse a feed from a URL, file, stream, or string.
 
     :param url_file_stream_or_string:
@@ -165,45 +174,46 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
     When a URL is not passed the feed location to use in relative URL
     resolution should be passed in the ``Content-Location`` response header
     (see ``response_headers`` below).
-    :param str etag: HTTP ``ETag`` request header.
-    :param modified: HTTP ``Last-Modified`` request header.
-    :type modified: :class:`str`, :class:`time.struct_time` 9-tuple, or
-        :class:`datetime.datetime`
-    :param str agent: HTTP ``User-Agent`` request header, which defaults to
+    :param etag:
+        HTTP ``ETag`` request header.
+    :param modified:
+        HTTP ``Last-Modified`` request header.
+    :param agent:
+        HTTP ``User-Agent`` request header, which defaults to
         the value of :data:`feedparser.USER_AGENT`.
-    :param referrer: HTTP ``Referer`` [sic] request header.
+    :param referrer:
+        HTTP ``Referer`` [sic] request header.
+    :param handlers:
+        A list of handlers that will be passed to urllib2.
     :param request_headers:
         A mapping of HTTP header name to HTTP header value to add to the
         request, overriding internally generated values.
-    :type request_headers: :class:`dict` mapping :class:`str` to :class:`str`
     :param response_headers:
         A mapping of HTTP header name to HTTP header value. Multiple values may
         be joined with a comma. If a HTTP request was made, these headers
        override any matching headers in the response. Otherwise this specifies
         the entirety of the response headers.
-    :type response_headers: :class:`dict` mapping :class:`str` to :class:`str`
-    :param bool resolve_relative_uris:
+    :param resolve_relative_uris:
         Should feedparser attempt to resolve relative URIs absolute ones within
         HTML content? Defaults to the value of
         :data:`feedparser.RESOLVE_RELATIVE_URIS`, which is ``True``.
-    :param bool sanitize_html:
+    :param sanitize_html:
         Should feedparser skip HTML sanitization? Only disable this if you know
         what you are doing! Defaults to the value of
         :data:`feedparser.SANITIZE_HTML`, which is ``True``.
-    :return: A :class:`FeedParserDict`.
     """
 
-    if not agent or sanitize_html is None or resolve_relative_uris is None:
-        import feedparser
+    # Avoid a cyclic import.
     if not agent:
+        import feedparser
         agent = feedparser.USER_AGENT
     if sanitize_html is None:
-        sanitize_html = feedparser.SANITIZE_HTML
+        import feedparser
+        sanitize_html = bool(feedparser.SANITIZE_HTML)
     if resolve_relative_uris is None:
-        resolve_relative_uris = feedparser.RESOLVE_RELATIVE_URIS
+        import feedparser
+        resolve_relative_uris = bool(feedparser.RESOLVE_RELATIVE_URIS)
 
     result = FeedParserDict(
         bozo=False,
@@ -212,7 +222,14 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
         headers={},
     )
 
-    data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
+    try:
+        data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
+    except urllib.error.URLError as error:
+        result.update({
+            'bozo': True,
+            'bozo_exception': error,
+        })
+        return result
 
     if not data:
         return result
@@ -221,8 +238,10 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
     result['headers'].update(response_headers or {})
 
     data = convert_to_utf8(result['headers'], data, result)
+    use_json_parser = result['content-type'] == 'application/json'
     use_strict_parser = result['encoding'] and True or False
-    result['version'], data, entities = replace_doctype(data)
+    if not use_json_parser:
+        result['version'], data, entities = replace_doctype(data)
 
     # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
@@ -235,36 +254,52 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
         baselang = baselang.decode('utf-8', 'ignore')
 
     if not _XML_AVAILABLE:
-        use_strict_parser = 0
+        use_strict_parser = False
 
-    if use_strict_parser:
-        # initialize the SAX parser
-        feedparser = StrictFeedParser(baseuri, baselang, 'utf-8')
-        feedparser.resolve_relative_uris = resolve_relative_uris
-        feedparser.sanitize_html = sanitize_html
+    feed_parser: Union[JSONParser, StrictFeedParser, LooseFeedParser]
+    if use_json_parser:
+        result['version'] = None
+        feed_parser = JSONParser(baseuri, baselang, 'utf-8')
+        try:
+            feed_parser.feed(data)
+        except Exception as e:
+            result['bozo'] = 1
+            result['bozo_exception'] = e
+    elif use_strict_parser:
+        # Initialize the SAX parser.
+        feed_parser = StrictFeedParser(baseuri, baselang, 'utf-8')
+        feed_parser.resolve_relative_uris = resolve_relative_uris
+        feed_parser.sanitize_html = sanitize_html
         saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
         saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
         try:
-            # disable downloading external doctype references, if possible
+            # Disable downloading external doctype references, if possible.
            saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
         except xml.sax.SAXNotSupportedException:
             pass
-        saxparser.setContentHandler(feedparser)
-        saxparser.setErrorHandler(feedparser)
+        saxparser.setContentHandler(feed_parser)
+        saxparser.setErrorHandler(feed_parser)
         source = xml.sax.xmlreader.InputSource()
         source.setByteStream(io.BytesIO(data))
         try:
             saxparser.parse(source)
         except xml.sax.SAXException as e:
             result['bozo'] = 1
-            result['bozo_exception'] = feedparser.exc or e
-            use_strict_parser = 0
+            result['bozo_exception'] = feed_parser.exc or e
+            use_strict_parser = False
 
-    if not use_strict_parser:
-        feedparser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
-        feedparser.resolve_relative_uris = resolve_relative_uris
-        feedparser.sanitize_html = sanitize_html
-        feedparser.feed(data.decode('utf-8', 'replace'))
-    result['feed'] = feedparser.feeddata
-    result['entries'] = feedparser.entries
-    result['version'] = result['version'] or feedparser.version
-    result['namespaces'] = feedparser.namespaces_in_use
+    # The loose XML parser will be tried if the JSON parser was not used,
+    # and if the strict XML parser was not used (or if it failed).
+    if not use_json_parser and not use_strict_parser:
+        feed_parser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
+        feed_parser.resolve_relative_uris = resolve_relative_uris
+        feed_parser.sanitize_html = sanitize_html
+        feed_parser.feed(data.decode('utf-8', 'replace'))
 
+    result['feed'] = feed_parser.feeddata
+    result['entries'] = feed_parser.entries
+    result['version'] = result['version'] or feed_parser.version
+    if isinstance(feed_parser, JSONParser):
+        result['namespaces'] = {}
+    else:
+        result['namespaces'] = feed_parser.namespaces_in_use
+
     return result
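
For orientation, a minimal sketch of calling the reworked entry point (the feed URL is a placeholder):

    import feedparser

    # XML and JSON feeds share one entry point; parse() picks the strict XML,
    # loose XML, or JSON parser from the content type detected upstream.
    d = feedparser.parse('https://example.com/feed.json')
    print(d.version)  # e.g. 'json1'/'json11' for JSON Feed, 'rss20', 'atom10'
    print(d.bozo)     # truthy when parsing failed (now also covers URLError)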

View file

@@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -25,6 +25,8 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 
+from time import struct_time
+from typing import Callable, List, Optional
 from .asctime import _parse_date_asctime
 from .greek import _parse_date_greek
 from .hungarian import _parse_date_hungarian
@@ -34,7 +36,7 @@ from .perforce import _parse_date_perforce
 from .rfc822 import _parse_date_rfc822
 from .w3dtf import _parse_date_w3dtf
 
-_date_handlers = []
+_date_handlers: List[Callable[[str], Optional[struct_time]]] = []
 
 def registerDateHandler(func):
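
With the new annotation, handlers are expected to take a string and return a time.struct_time or None. A hypothetical registration (the handler itself is made up for illustration):

    import time
    from typing import Optional

    import feedparser

    def parse_epoch(value: str) -> Optional[time.struct_time]:
        # Treat all-digit date strings as Unix timestamps (illustrative only).
        return time.gmtime(int(value)) if value.isdigit() else None

    feedparser.registerDateHandler(parse_epoch)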

View file

@@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

View file

@@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

View file

@@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

View file

@@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -68,15 +68,7 @@ _iso8601_re = [
     + r'(\.(?P<fracsecond>\d+))?'
     + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
     for tmpl in _iso8601_tmpl]
-try:
-    del tmpl
-except NameError:
-    pass
 
 _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
-try:
-    del regex
-except NameError:
-    pass
 
 def _parse_date_iso8601(date_string):
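
The removed try/del guards were Python 2 leftovers: in Python 3 a comprehension's loop variable never leaks into the enclosing scope, so there is nothing to delete. A quick check:

    squares = [n * n for n in range(3)]
    print('n' in dir())  # False: the loop variable did not escape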

View file

@@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

View file

@@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -25,7 +25,7 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 
-import email._parseaddr
+import email.utils
 import re
 import time
@@ -41,6 +41,6 @@ def _parse_date_perforce(date_string):
     dow, year, month, day, hour, minute, second, tz = m.groups()
     months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
     new_date_string = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
-    tm = email._parseaddr.parsedate_tz(new_date_string)
+    tm = email.utils.parsedate_tz(new_date_string)
     if tm:
-        return time.gmtime(email._parseaddr.mktime_tz(tm))
+        return time.gmtime(email.utils.mktime_tz(tm))
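
email.utils is the documented home of these stdlib helpers; behavior is unchanged. For example:

    import email.utils
    import time

    tm = email.utils.parsedate_tz('Fri, 21 Nov 1997 09:55:06 -0600')
    # tm is a 10-tuple whose last item is the UTC offset in seconds (-21600).
    print(time.gmtime(email.utils.mktime_tz(tm)))  # normalized to UTC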

View file

@@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

View file

@@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

View file

@@ -1,5 +1,5 @@
 # Character encoding routines
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -26,17 +26,16 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 
-import cgi
 import codecs
 import re
+import typing as t
 
 try:
     try:
-        import cchardet as chardet
+        import cchardet as chardet  # type: ignore[import]
     except ImportError:
-        import chardet
+        import chardet  # type: ignore[no-redef]
 except ImportError:
-    chardet = None
     lazy_chardet_encoding = None
 else:
     def lazy_chardet_encoding(data):
@@ -68,6 +67,30 @@ RE_XML_DECLARATION = re.compile(r'^<\?xml[^>]*?>')
 RE_XML_PI_ENCODING = re.compile(br'^<\?.*encoding=[\'"](.*?)[\'"].*\?>')
 
+def parse_content_type(line: str) -> t.Tuple[str, str]:
+    """Parse an HTTP Content-Type header.
+
+    The return value will be a tuple of strings:
+    the MIME type, and the value of the "charset" (if any).
+
+    This is a custom replacement for Python's cgi.parse_header().
+    The cgi module will be removed in Python 3.13.
+    """
+
+    chunks = line.split(";")
+    if not chunks:
+        return "", ""
+
+    mime_type = chunks[0].strip()
+    charset_value = ""
+    for chunk in chunks[1:]:
+        key, _, value = chunk.partition("=")
+        if key.strip().lower() == "charset":
+            charset_value = value.strip().strip("\"'")
+
+    return mime_type, charset_value
+
 def convert_to_utf8(http_headers, data, result):
     """Detect and convert the character encoding to UTF-8.
@@ -156,10 +179,7 @@ def convert_to_utf8(http_headers, data, result):
     try:
         if bom_encoding:
             tempdata = data.decode(bom_encoding).encode('utf-8')
-    except (UnicodeDecodeError, LookupError):
-        # feedparser recognizes UTF-32 encodings that aren't
-        # available in Python 2.4 and 2.5, so it's possible to
-        # encounter a LookupError during decoding.
+    except UnicodeDecodeError:
         xml_encoding_match = None
     else:
         xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
@@ -181,15 +201,14 @@ def convert_to_utf8(http_headers, data, result):
     # XML declaration encoding, and HTTP encoding, following the
     # heuristic defined in RFC 3023.
     http_content_type = http_headers.get('content-type') or ''
-    http_content_type, params = cgi.parse_header(http_content_type)
-    http_encoding = params.get('charset', '').replace("'", "")
-    if isinstance(http_encoding, bytes):
-        http_encoding = http_encoding.decode('utf-8', 'ignore')
+    http_content_type, http_encoding = parse_content_type(http_content_type)
 
     acceptable_content_type = 0
     application_content_types = ('application/xml', 'application/xml-dtd',
                                  'application/xml-external-parsed-entity')
     text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
+    json_content_types = ('application/feed+json', 'application/json')
+    json = False
     if (
             http_content_type in application_content_types
             or (
@@ -208,6 +227,17 @@ def convert_to_utf8(http_headers, data, result):
     ):
         acceptable_content_type = 1
         rfc3023_encoding = http_encoding or 'us-ascii'
+    elif (
+            http_content_type in json_content_types
+            or (
+                not http_content_type
+                and data and data.lstrip()[0] == '{'
+            )
+    ):
+        http_content_type = json_content_types[0]
+        acceptable_content_type = 1
+        json = True
+        rfc3023_encoding = http_encoding or 'utf-8'  # RFC 7159, 8.1.
     elif http_content_type.startswith('text/'):
         rfc3023_encoding = http_encoding or 'us-ascii'
     elif http_headers and 'content-type' not in http_headers:
@@ -230,7 +260,7 @@ def convert_to_utf8(http_headers, data, result):
     if http_headers and (not acceptable_content_type):
         if 'content-type' in http_headers:
-            msg = '%s is not an XML media type' % http_headers['content-type']
+            msg = '%s is not an accepted media type' % http_headers['content-type']
         else:
             msg = 'no Content-type specified'
         error = NonXMLContentType(msg)
@@ -254,6 +284,7 @@ def convert_to_utf8(http_headers, data, result):
             pass
         else:
             known_encoding = 1
+            if not json:
                 # Update the encoding in the opening XML processing instruction.
                 new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
                 if RE_XML_DECLARATION.search(data):
@@ -275,6 +306,7 @@ def convert_to_utf8(http_headers, data, result):
                       (rfc3023_encoding, proposed_encoding))
         rfc3023_encoding = proposed_encoding
 
+    result['content-type'] = http_content_type  # for selecting the parser
     result['encoding'] = rfc3023_encoding
     if error:
         result['bozo'] = True
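
A quick sanity check of the new parse_content_type() helper (input values chosen for illustration):

    >>> parse_content_type('text/xml; charset="ISO-8859-1"')
    ('text/xml', 'ISO-8859-1')
    >>> parse_content_type('application/feed+json')
    ('application/feed+json', '')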

View file

@@ -1,5 +1,5 @@
 # Exceptions used throughout feedparser
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -27,7 +27,7 @@
 # POSSIBILITY OF SUCH DAMAGE.
 
 __all__ = [
-    'ThingsNobodyCaresAboutButMe',
+    'FeedparserError',
     'CharacterEncodingOverride',
     'CharacterEncodingUnknown',
     'NonXMLContentType',
@@ -35,19 +35,19 @@ __all__ = [
 ]
 
-class ThingsNobodyCaresAboutButMe(Exception):
+class FeedparserError(Exception):
     pass
 
-class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe):
+class CharacterEncodingOverride(FeedparserError):
     pass
 
-class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe):
+class CharacterEncodingUnknown(FeedparserError):
     pass
 
-class NonXMLContentType(ThingsNobodyCaresAboutButMe):
+class NonXMLContentType(FeedparserError):
     pass
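
Code that caught the old base class should switch to the new name; all feedparser exceptions still share one base (module path as laid out upstream):

    from feedparser.exceptions import FeedparserError, NonXMLContentType

    try:
        raise NonXMLContentType('text/plain is not an accepted media type')
    except FeedparserError as e:  # replaces ThingsNobodyCaresAboutButMe
        print(type(e).__name__, e)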

View file

@@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -61,7 +61,7 @@ _cp1252 = {
 }
 
-class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
+class BaseHTMLProcessor(sgmllib.SGMLParser):
     special = re.compile("""[<>'"]""")
     bare_ampersand = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
     elements_no_end_tag = {
@@ -91,11 +91,11 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
         self.encoding = encoding
         self._type = _type
         self.pieces = []
-        super(_BaseHTMLProcessor, self).__init__()
+        super().__init__()
 
     def reset(self):
         self.pieces = []
-        super(_BaseHTMLProcessor, self).reset()
+        super().reset()
 
     def _shorttag_replace(self, match):
         """
@@ -118,23 +118,13 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
         raise NotImplementedError
 
     # Replace goahead with SGMLParser's goahead() code object.
-    try:
-        goahead.__code__ = sgmllib.SGMLParser.goahead.__code__
-    except AttributeError:
-        # Python 2
-        # noinspection PyUnresolvedReferences
-        goahead.func_code = sgmllib.SGMLParser.goahead.func_code
+    goahead.__code__ = sgmllib.SGMLParser.goahead.__code__
 
     def __parse_starttag(self, i):
         raise NotImplementedError
 
     # Replace __parse_starttag with SGMLParser's parse_starttag() code object.
-    try:
-        __parse_starttag.__code__ = sgmllib.SGMLParser.parse_starttag.__code__
-    except AttributeError:
-        # Python 2
-        # noinspection PyUnresolvedReferences
-        __parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code
+    __parse_starttag.__code__ = sgmllib.SGMLParser.parse_starttag.__code__
 
     def parse_starttag(self, i):
         j = self.__parse_starttag(i)
@@ -153,8 +143,8 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
         data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
         data = data.replace('&#39;', "'")
         data = data.replace('&#34;', '"')
-        super(_BaseHTMLProcessor, self).feed(data)
-        super(_BaseHTMLProcessor, self).close()
+        super().feed(data)
+        super().close()
 
     @staticmethod
     def normalize_attrs(attrs):
@@ -315,8 +305,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
         # self.updatepos(declstartpos, i)
         return None, -1
 
-    @staticmethod
-    def convert_charref(name):
+    def convert_charref(self, name):
         """
         :type name: str
         :rtype: str
@@ -324,8 +313,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
         return '&#%s;' % name
 
-    @staticmethod
-    def convert_entityref(name):
+    def convert_entityref(self, name):
         """
         :type name: str
         :rtype: str
@@ -349,7 +337,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
         try:
             return sgmllib.SGMLParser.parse_declaration(self, i)
-        except sgmllib.SGMLParseError:
+        except (AssertionError, sgmllib.SGMLParseError):
             # Escape the doctype declaration and continue parsing.
             self.handle_data('&lt;')
             return i+1

View file

@@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -44,7 +44,7 @@ from .urls import convert_to_idn
 ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
 
-class _FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPRedirectHandler, urllib.request.HTTPDefaultErrorHandler):
+class URLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPRedirectHandler, urllib.request.HTTPDefaultErrorHandler):
     def http_error_default(self, req, fp, code, msg, headers):
         # The default implementation just raises HTTPError.
         # Forget that.
@@ -53,6 +53,8 @@ class _FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPR
     def http_error_301(self, req, fp, code, msg, hdrs):
         result = urllib.request.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, hdrs)
+        if not result:
+            return fp
         result.status = code
         result.newurl = result.geturl()
         return result
@@ -78,7 +80,7 @@ class _FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPR
         host = urllib.parse.urlparse(req.get_full_url())[1]
         if 'Authorization' not in req.headers or 'WWW-Authenticate' not in headers:
             return self.http_error_default(req, fp, code, msg, headers)
-        auth = base64.decodebytes(req.headers['Authorization'].split(' ')[1].encode('utf8'))
+        auth = base64.decodebytes(req.headers['Authorization'].split(' ')[1].encode()).decode()
         user, passw = auth.split(':')
         realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
         self.add_password(realm, host, user, passw)
@@ -145,15 +147,26 @@ def get(url, etag=None, modified=None, agent=None, referrer=None, handlers=None,
     if url_pieces.port:
         new_pieces[1] = f'{url_pieces.hostname}:{url_pieces.port}'
     url = urllib.parse.urlunparse(new_pieces)
-    auth = base64.standard_b64encode(f'{url_pieces.username}:{url_pieces.password}').strip()
+    auth = base64.standard_b64encode(f'{url_pieces.username}:{url_pieces.password}'.encode()).decode()
 
     # iri support
     if not isinstance(url, bytes):
         url = convert_to_idn(url)
 
+    # Prevent UnicodeEncodeErrors caused by Unicode characters in the path.
+    bits = []
+    for c in url:
+        try:
+            c.encode('ascii')
+        except UnicodeEncodeError:
+            bits.append(urllib.parse.quote(c))
+        else:
+            bits.append(c)
+    url = ''.join(bits)
+
     # try to open with urllib2 (to use optional headers)
     request = _build_urllib2_request(url, agent, ACCEPT_HEADER, etag, modified, referrer, auth, request_headers)
-    opener = urllib.request.build_opener(*tuple(handlers + [_FeedURLHandler()]))
+    opener = urllib.request.build_opener(*tuple(handlers + [URLHandler()]))
     opener.addheaders = []  # RMK - must clear so we only send our custom User-Agent
     f = opener.open(request)
     data = f.read()
@@ -203,7 +216,7 @@ def get(url, etag=None, modified=None, agent=None, referrer=None, handlers=None,
         result['href'] = f.url.decode('utf-8', 'ignore')
     else:
         result['href'] = f.url
-    result['status'] = getattr(f, 'status', 200)
+    result['status'] = getattr(f, 'status', None) or 200
 
     # Stop processing if the server sent HTTP 304 Not Modified.
     if getattr(f, 'code', 0) == 304:
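
The new percent-encoding loop leaves ASCII characters untouched and quotes everything else; an equivalent one-liner for illustration:

    import urllib.parse

    url = 'https://example.com/fé.xml'
    print(''.join(urllib.parse.quote(c) if ord(c) > 127 else c for c in url))
    # https://example.com/f%C3%A9.xml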

View file

@@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -30,16 +30,17 @@ import binascii
 import copy
 import html.entities
 import re
+from typing import Dict
 import xml.sax.saxutils
 
 from .html import _cp1252
 from .namespaces import _base, cc, dc, georss, itunes, mediarss, psc
-from .sanitizer import _sanitize_html, _HTMLSanitizer
+from .sanitizer import sanitize_html, HTMLSanitizer
 from .util import FeedParserDict
 from .urls import _urljoin, make_safe_absolute_uri, resolve_relative_uris
 
-class _FeedParserMixin(
+class XMLParserMixin(
         _base.Namespace,
         cc.Namespace,
         dc.Namespace,
@@ -118,7 +119,7 @@ class _FeedParserMixin(
         'http://www.w3.org/XML/1998/namespace': 'xml',
         'http://podlove.org/simple-chapters': 'psc',
     }
-    _matchnamespaces = {}
+    _matchnamespaces: Dict[str, str] = {}
 
     can_be_relative_uri = {
         'comments',
@@ -170,6 +171,8 @@ class _FeedParserMixin(
         self.entries = []  # list of entry-level data
         self.version = ''  # feed type/version, see SUPPORTED_VERSIONS
         self.namespaces_in_use = {}  # dictionary of namespaces defined by the feed
+        self.resolve_relative_uris = False
+        self.sanitize_html = False
 
         # the following are used internally to track state;
         # this is really out of control and should be refactored
@@ -193,6 +196,7 @@ class _FeedParserMixin(
         self.svgOK = 0
         self.title_depth = -1
         self.depth = 0
+        self.hasContent = 0
         if self.lang:
             self.feeddata['language'] = self.lang.replace('_', '-')
@@ -204,7 +208,7 @@ class _FeedParserMixin(
         #     },
         # }
         self.property_depth_map = {}
-        super(_FeedParserMixin, self).__init__()
+        super(XMLParserMixin, self).__init__()
 
     def _normalize_attributes(self, kv):
         raise NotImplementedError
@@ -506,9 +510,7 @@ class _FeedParserMixin(
         if base64 and self.contentparams.get('base64', 0):
             try:
                 output = base64.decodebytes(output.encode('utf8')).decode('utf8')
-            except binascii.Error:
-                pass
-            except binascii.Incomplete:
+            except (binascii.Error, binascii.Incomplete, UnicodeDecodeError):
                 pass
 
         # resolve relative URIs
@@ -546,7 +548,7 @@ class _FeedParserMixin(
         # sanitize embedded markup
         if is_htmlish and self.sanitize_html:
             if element in self.can_contain_dangerous_markup:
-                output = _sanitize_html(output, self.encoding, self.contentparams.get('type', 'text/html'))
+                output = sanitize_html(output, self.encoding, self.contentparams.get('type', 'text/html'))
 
         if self.encoding and isinstance(output, bytes):
             output = output.decode(self.encoding, 'ignore')
@@ -648,7 +650,7 @@ class _FeedParserMixin(
             return False
 
         # all tags must be in a restricted subset of valid HTML tags
-        if any((t for t in re.findall(r'</?(\w+)', s) if t.lower() not in _HTMLSanitizer.acceptable_elements)):
+        if any((t for t in re.findall(r'</?(\w+)', s) if t.lower() not in HTMLSanitizer.acceptable_elements)):
             return False
 
         # all entities must have been defined as valid HTML entities
@@ -744,7 +746,7 @@ class _FeedParserMixin(
         author, email = context.get(key), None
         if not author:
             return
-        emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
+        emailmatch = re.search(r"(([a-zA-Z0-9_.+-]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(]?))(\?subject=\S+)?", author)
         if emailmatch:
             email = emailmatch.group(0)
             # probably a better way to do the following, but it passes

View file

@@ -1,5 +1,5 @@
 # Support for the Atom, RSS, RDF, and CDF feed formats
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -259,6 +259,7 @@ class Namespace(object):
     def _end_item(self):
         self.pop('item')
         self.inentry = 0
+        self.hasContent = 0
 
     _end_entry = _end_item
 
     def _start_language(self, attrs_d):
@@ -388,7 +389,7 @@
     def _start_description(self, attrs_d):
         context = self._get_context()
-        if 'summary' in context:
+        if 'summary' in context and not self.hasContent:
             self._summaryKey = 'content'
             self._start_content(attrs_d)
         else:
@@ -429,7 +430,7 @@
     def _start_summary(self, attrs_d):
         context = self._get_context()
-        if 'summary' in context:
+        if 'summary' in context and not self.hasContent:
             self._summaryKey = 'content'
             self._start_content(attrs_d)
         else:
@@ -466,6 +467,7 @@
         self.sourcedata.clear()
 
     def _start_content(self, attrs_d):
+        self.hasContent = 1
         self.push_content('content', attrs_d, 'text/plain', 1)
         src = attrs_d.get('src')
         if src:
@@ -477,6 +479,7 @@
     _start_xhtml_body = _start_body
 
     def _start_content_encoded(self, attrs_d):
+        self.hasContent = 1
         self.push_content('content', attrs_d, 'text/html', 1)
 
     _start_fullitem = _start_content_encoded
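
The effect of the new hasContent flag: once a content element has been seen in an item, a following description/summary is kept as the summary instead of being promoted to content. A rough illustration (hand-written feed; exact output depends on the full parse path):

    import feedparser

    rss = ('<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">'
           '<channel><item>'
           '<content:encoded>&lt;p&gt;full text&lt;/p&gt;</content:encoded>'
           '<description>short blurb</description>'
           '</item></channel></rss>')
    d = feedparser.parse(rss)
    print(d.entries[0].summary)  # 'short blurb' rather than the content body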

View file

@@ -1,5 +1,5 @@
 # Support for the administrative elements extension
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

View file

@@ -1,5 +1,5 @@
 # Support for the Creative Commons licensing extensions
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

View file

@@ -1,5 +1,5 @@
 # Support for the Dublin Core metadata extensions
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

View file

@@ -1,5 +1,5 @@
 # Support for the GeoRSS format
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -91,6 +91,8 @@
         except ValueError:
             srs_dimension = 2
         context = self._get_context()
+        if 'where' not in context:
+            context['where'] = {}
         context['where']['srsName'] = srs_name
         context['where']['srsDimension'] = srs_dimension

View file

@@ -1,5 +1,5 @@
 # Support for the iTunes format
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

View file

@@ -1,5 +1,5 @@
 # Support for the Media RSS format
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

View file

@@ -1,5 +1,5 @@
 # Support for the Podlove Simple Chapters format
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

View file

@@ -0,0 +1,133 @@
+# The JSON feed parser
+# Copyright 2017 Beat Bolli
+# All rights reserved.
+#
+# This file is a part of feedparser.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+import json
+
+from ..datetimes import _parse_date
+from ..sanitizer import sanitize_html
+from ..util import FeedParserDict
+
+
+class JSONParser:
+    VERSIONS = {
+        'https://jsonfeed.org/version/1': 'json1',
+        'https://jsonfeed.org/version/1.1': 'json11',
+    }
+    FEED_FIELDS = (
+        ('title', 'title'),
+        ('icon', 'image'),
+        ('home_page_url', 'link'),
+        ('description', 'description'),
+    )
+    ITEM_FIELDS = (
+        ('title', 'title'),
+        ('id', 'guid'),
+        ('url', 'link'),
+        ('summary', 'summary'),
+        ('external_url', 'source'),
+    )
+
+    def __init__(self, baseuri=None, baselang=None, encoding=None):
+        self.baseuri = baseuri or ''
+        self.lang = baselang or None
+        self.encoding = encoding or 'utf-8'  # character encoding
+
+        self.version = None
+        self.feeddata = FeedParserDict()
+        self.namespacesInUse = []
+        self.entries = []
+
+    def feed(self, data):
+        data = json.loads(data)
+
+        v = data.get('version', '')
+        try:
+            self.version = self.VERSIONS[v]
+        except KeyError:
+            raise ValueError("Unrecognized JSONFeed version '%s'" % v)
+
+        for src, dst in self.FEED_FIELDS:
+            if src in data:
+                self.feeddata[dst] = data[src]
+        if 'author' in data:
+            self.parse_author(data['author'], self.feeddata)
+        # TODO: hubs; expired has no RSS equivalent
+
+        self.entries = [self.parse_entry(e) for e in data['items']]
+
+    def parse_entry(self, e):
+        entry = FeedParserDict()
+        for src, dst in self.ITEM_FIELDS:
+            if src in e:
+                entry[dst] = e[src]
+
+        if 'content_text' in e:
+            entry['content'] = c = FeedParserDict()
+            c['value'] = e['content_text']
+            c['type'] = 'text'
+        elif 'content_html' in e:
+            entry['content'] = c = FeedParserDict()
+            c['value'] = sanitize_html(e['content_html'], self.encoding, 'application/json')
+            c['type'] = 'html'
+
+        if 'date_published' in e:
+            entry['published'] = e['date_published']
+            entry['published_parsed'] = _parse_date(e['date_published'])
+        if 'date_updated' in e:
+            entry['updated'] = e['date_modified']
+            entry['updated_parsed'] = _parse_date(e['date_modified'])
+
+        if 'tags' in e:
+            entry['category'] = e['tags']
+
+        if 'author' in e:
+            self.parse_author(e['author'], entry)
+
+        if 'attachments' in e:
+            entry['enclosures'] = [self.parse_attachment(a) for a in e['attachments']]
+
+        return entry
+
+    @staticmethod
+    def parse_author(parent, dest):
+        dest['author_detail'] = detail = FeedParserDict()
+        if 'name' in parent:
+            dest['author'] = detail['name'] = parent['name']
+        if 'url' in parent:
+            if parent['url'].startswith('mailto:'):
+                detail['email'] = parent['url'][7:]
+            else:
+                detail['href'] = parent['url']
+
+    @staticmethod
+    def parse_attachment(attachment):
+        enc = FeedParserDict()
+        enc['href'] = attachment['url']
+        enc['type'] = attachment['mime_type']
+        if 'size_in_bytes' in attachment:
+            enc['length'] = attachment['size_in_bytes']
+        return enc
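
A minimal sketch of driving the new parser directly (module path as laid out upstream; the feed document is a made-up example):

    from feedparser.parsers.json import JSONParser

    parser = JSONParser('https://example.com/', None, 'utf-8')
    parser.feed('{"version": "https://jsonfeed.org/version/1.1",'
                ' "title": "Example",'
                ' "items": [{"id": "1", "content_text": "hi"}]}')
    print(parser.version)             # 'json11'
    print(parser.entries[0]['guid'])  # '1'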

View file

@@ -1,5 +1,5 @@
 # The loose feed parser that interfaces with an SGML parsing library
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -26,7 +26,7 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 
-class _LooseFeedParser(object):
+class LooseXMLParser:
     contentparams = None
 
     def __init__(self, baseuri=None, baselang=None, encoding=None, entities=None):
@@ -34,7 +34,7 @@ class _LooseFeedParser(object):
         self.lang = baselang or None
         self.encoding = encoding or 'utf-8'  # character encoding
         self.entities = entities or {}
-        super(_LooseFeedParser, self).__init__()
+        super().__init__()
 
     @staticmethod
     def _normalize_attributes(kv):

View file

@@ -1,5 +1,5 @@
 # The strict feed parser that interfaces with an XML parsing library
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -29,7 +29,7 @@
 from ..exceptions import UndeclaredNamespace
 
-class _StrictFeedParser(object):
+class StrictXMLParser:
     def __init__(self, baseuri, baselang, encoding):
         self.bozo = 0
         self.exc = None
@@ -37,7 +37,7 @@ class _StrictFeedParser(object):
         self.baseuri = baseuri or ''
         self.lang = baselang
         self.encoding = encoding
-        super(_StrictFeedParser, self).__init__()
+        super(StrictXMLParser, self).__init__()
 
     @staticmethod
     def _normalize_attributes(kv):

View file

@@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -27,11 +27,11 @@
 import re
 
-from .html import _BaseHTMLProcessor
+from .html import BaseHTMLProcessor
 from .urls import make_safe_absolute_uri
 
-class _HTMLSanitizer(_BaseHTMLProcessor):
+class HTMLSanitizer(BaseHTMLProcessor):
     acceptable_elements = {
         'a',
         'abbr',
@@ -732,14 +732,14 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
     }
 
     def __init__(self, encoding=None, _type='application/xhtml+xml'):
-        super(_HTMLSanitizer, self).__init__(encoding, _type)
+        super().__init__(encoding, _type)
         self.unacceptablestack = 0
         self.mathmlOK = 0
         self.svgOK = 0
 
     def reset(self):
-        super(_HTMLSanitizer, self).reset()
+        super().reset()
         self.unacceptablestack = 0
         self.mathmlOK = 0
         self.svgOK = 0
@@ -805,7 +805,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
             if key == 'href':
                 value = make_safe_absolute_uri(value)
             clean_attrs.append((key, value))
-        super(_HTMLSanitizer, self).unknown_starttag(tag, clean_attrs)
+        super().unknown_starttag(tag, clean_attrs)
 
     def unknown_endtag(self, tag):
         if tag not in self.acceptable_elements:
@@ -820,7 +820,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
                 self.svgOK -= 1
             else:
                 return
-        super(_HTMLSanitizer, self).unknown_endtag(tag)
+        super().unknown_endtag(tag)
 
     def handle_pi(self, text):
         pass
@@ -830,7 +830,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
     def handle_data(self, text):
         if not self.unacceptablestack:
-            super(_HTMLSanitizer, self).handle_data(text)
+            super().handle_data(text)
 
     def sanitize_style(self, style):
         # disallow urls
@@ -865,7 +865,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
         return ' '.join(clean)
 
     def parse_comment(self, i, report=1):
-        ret = super(_HTMLSanitizer, self).parse_comment(i, report)
+        ret = super().parse_comment(i, report)
         if ret >= 0:
             return ret
         # if ret == -1, this may be a malicious attempt to circumvent
@@ -877,8 +877,8 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
         return len(self.rawdata)
 
-def _sanitize_html(html_source, encoding, _type):
-    p = _HTMLSanitizer(encoding, _type)
+def sanitize_html(html_source, encoding, _type):
+    p = HTMLSanitizer(encoding, _type)
     html_source = html_source.replace('<![CDATA[', '&lt;![CDATA[')
     p.feed(html_source)
     data = p.output()
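
The renamed public helper keeps the old behavior: disallowed elements and attributes are dropped. A rough example (the expected output is approximate, not an exact promise):

    from feedparser.sanitizer import sanitize_html

    dirty = '<p onclick="evil()">hi<script>x()</script></p>'
    print(sanitize_html(dirty, 'utf-8', 'text/html'))  # roughly: <p>hi</p>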

View file

@@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -27,7 +27,7 @@
 import re
 
-import sgmllib
+import sgmllib  # type: ignore[import]
 
 __all__ = [
     'sgmllib',
@@ -82,7 +82,7 @@ class _EndBracketRegEx:
         match = self.endbracket.match(target, index)
         if match is not None:
             # Returning a new object in the calling thread's context
-            # resolves a thread-safety.
+            # resolves a thread-safety issue.
             return EndBracketMatch(match)
         return None

View file

@@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -28,7 +28,7 @@
 import re
 import urllib.parse
 
-from .html import _BaseHTMLProcessor
+from .html import BaseHTMLProcessor
 
 # If you want feedparser to allow all URL schemes, set this to ()
 # List culled from Python's urlparse documentation at:
@@ -103,7 +103,7 @@ def make_safe_absolute_uri(base, rel=None):
     return uri
 
-class RelativeURIResolver(_BaseHTMLProcessor):
+class RelativeURIResolver(BaseHTMLProcessor):
     relative_uris = {
         ('a', 'href'),
         ('applet', 'codebase'),
@@ -137,7 +137,7 @@ class RelativeURIResolver(_BaseHTMLProcessor):
     }
 
     def __init__(self, baseuri, encoding, _type):
-        _BaseHTMLProcessor.__init__(self, encoding, _type)
+        BaseHTMLProcessor.__init__(self, encoding, _type)
         self.baseuri = baseuri
 
     def resolve_uri(self, uri):

View file

@@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -48,7 +48,7 @@ class FeedParserDict(dict):
         'tagline_detail': 'subtitle_detail',
     }
 
-    def __getitem__(self, key):
+    def __getitem__(self, key, _stacklevel=2):
         """
         :return: A :class:`FeedParserDict`.
         """
@@ -59,9 +59,8 @@
             except IndexError:
                 raise KeyError("object doesn't have key 'category'")
         elif key == 'enclosures':
-            norel = lambda link: FeedParserDict([(name, value) for (name, value) in link.items() if name != 'rel'])
             return [
-                norel(link)
+                FeedParserDict([(name, value) for (name, value) in link.items() if name != 'rel'])
                 for link in dict.__getitem__(self, 'links')
                 if link['rel'] == 'enclosure'
             ]
@@ -84,6 +83,7 @@
                     "exist. This fallback will be removed in a future version "
                     "of feedparser.",
                     DeprecationWarning,
+                    stacklevel=_stacklevel,
                 )
                 return dict.__getitem__(self, 'published')
             return dict.__getitem__(self, 'updated')
@@ -99,6 +99,7 @@
                     "`updated_parsed` doesn't exist. This fallback will be "
                     "removed in a future version of feedparser.",
                     DeprecationWarning,
+                    stacklevel=_stacklevel,
                 )
                 return dict.__getitem__(self, 'published_parsed')
             return dict.__getitem__(self, 'updated_parsed')
@@ -119,7 +120,7 @@
             # This fix was proposed in issue 328.
             return dict.__contains__(self, key)
         try:
-            self.__getitem__(key)
+            self.__getitem__(key, _stacklevel=3)
         except KeyError:
             return False
         else:
@@ -133,7 +134,7 @@
         """
         try:
-            return self.__getitem__(key)
+            return self.__getitem__(key, _stacklevel=3)
         except KeyError:
             return default
@@ -143,17 +144,11 @@
             key = key[0]
         return dict.__setitem__(self, key, value)
 
-    def setdefault(self, k, default):
-        if k not in self:
-            self[k] = default
-            return default
-        return self[k]
-
     def __getattr__(self, key):
         # __getattribute__() is called first; this will be called
         # only if an attribute was not already found
        try:
-            return self.__getitem__(key)
+            return self.__getitem__(key, _stacklevel=3)
         except KeyError:
             raise AttributeError("object has no attribute '%s'" % key)
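
The threaded-through _stacklevel makes the DeprecationWarning point at the caller's line instead of inside util.py. A small check of the fallback path:

    import warnings
    from feedparser.util import FeedParserDict

    d = FeedParserDict({'published': 'Mon, 06 Sep 2021 00:01:00 GMT'})
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        value = d.updated  # falls back to 'published' and warns
    print(value, caught[0].category.__name__)  # ... DeprecationWarning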