Merge branch 'feature/UpdateFeedparser' into dev

2024-11-24 13:55:16 +00:00 · 2023-02-09 14:37:12 +00:00 · 2023-02-09 14:37:12 +00:00 · b9cfd96e57
commit b9cfd96e57
parent 7343f9ac16 a65c40083f
32 changed files with 391 additions and 193 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@ -7,6 +7,7 @@
 * Update Msgpack 1.0.0 (fa7d744) to 1.0.4 (b5acfd5)
 * Update certifi 2022.09.24 to 2022.12.07
 * Update diskcache 5.1.0 (40ce0de) to 5.4.0 (1cb1425)
+* Update feedparser 6.0.1 (98d189fa) to 6.0.10 (5fcb3ae)
 * Update humanize 3.5.0 (b6b0ea5) to 4.0.0 (a1514eb)
 * Update profilehooks module 1.12.0 (3ee1f60) to 1.12.1 (c3fc078)
 * Update Rarfile 4.0 (55fe778) to 4.1a1 (8a72967)
--- a/lib/feedparser/__init__.py
+++ b/lib/feedparser/__init__.py
@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@ -32,7 +32,7 @@ from .util import FeedParserDict

 __author__ = 'Kurt McKee <contactme@kurtmckee.org>'
 __license__ = 'BSD 2-clause'
-__version__ = '6.0.1'
+__version__ = '6.0.10'

 # HTTP "User-Agent" header to send to servers when downloading feeds.
 # If you are embedding feedparser in a larger application, you should
--- a/lib/feedparser/api.py
+++ b/lib/feedparser/api.py
@ -1,5 +1,5 @@
 # The public API for feedparser
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@ -26,7 +26,11 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.

+import datetime
 import io
+import time
+from typing import Dict, List, Union
+import urllib.error
 import urllib.parse
 import xml.sax

@ -34,13 +38,12 @@ import sgmllib3k as sgmllib

 from .datetimes import registerDateHandler, _parse_date
 from .encodings import convert_to_utf8
-from .exceptions import *
-from .html import _BaseHTMLProcessor
+from .html import BaseHTMLProcessor
 from . import http
-from . import mixin
-from .mixin import _FeedParserMixin
-from .parsers.loose import _LooseFeedParser
-from .parsers.strict import _StrictFeedParser
+from .mixin import XMLParserMixin
+from .parsers.loose import LooseXMLParser
+from .parsers.strict import StrictXMLParser
+from .parsers.json import JSONParser
 from .sanitizer import replace_doctype
 from .urls import convert_to_idn, make_safe_absolute_uri
 from .util import FeedParserDict
@ -70,6 +73,7 @@ SUPPORTED_VERSIONS = {
    'atom10': 'Atom 1.0',
    'atom': 'Atom (unknown version)',
    'cdf': 'CDF',
+    'json1': 'JSON feed 1',
 }


@ -136,20 +140,25 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
    return url_file_stream_or_string


-LooseFeedParser = type(
-    'LooseFeedParser',
-    (_LooseFeedParser, _FeedParserMixin, _BaseHTMLProcessor, object),
-    {},
-)
+class LooseFeedParser(LooseXMLParser, XMLParserMixin, BaseHTMLProcessor):
+    pass

-StrictFeedParser = type(
-    'StrictFeedParser',
-    (_StrictFeedParser, _FeedParserMixin, xml.sax.handler.ContentHandler, object),
-    {},
-)
+class StrictFeedParser(StrictXMLParser, XMLParserMixin, xml.sax.handler.ContentHandler):
+    pass


-def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None, resolve_relative_uris=None, sanitize_html=None):
+def parse(
+        url_file_stream_or_string,
+        etag: str = None,
+        modified: Union[str, datetime.datetime, time.struct_time] = None,
+        agent: str = None,
+        referrer: str = None,
+        handlers: List = None,
+        request_headers: Dict[str, str] = None,
+        response_headers: Dict[str, str] = None,
+        resolve_relative_uris: bool = None,
+        sanitize_html: bool = None,
+) -> FeedParserDict:
    """Parse a feed from a URL, file, stream, or string.

    :param url_file_stream_or_string:
@ -165,45 +174,46 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
        When a URL is not passed the feed location to use in relative URL
        resolution should be passed in the ``Content-Location`` response header
        (see ``response_headers`` below).
-
-    :param str etag: HTTP ``ETag`` request header.
-    :param modified: HTTP ``Last-Modified`` request header.
-    :type modified: :class:`str`, :class:`time.struct_time` 9-tuple, or
-        :class:`datetime.datetime`
-    :param str agent: HTTP ``User-Agent`` request header, which defaults to
+    :param etag:
+        HTTP ``ETag`` request header.
+    :param modified:
+        HTTP ``Last-Modified`` request header.
+    :param agent:
+        HTTP ``User-Agent`` request header, which defaults to
        the value of :data:`feedparser.USER_AGENT`.
-    :param referrer: HTTP ``Referer`` [sic] request header.
+    :param referrer:
+        HTTP ``Referer`` [sic] request header.
+    :param handlers:
+        A list of handlers that will be passed to urllib2.
    :param request_headers:
        A mapping of HTTP header name to HTTP header value to add to the
        request, overriding internally generated values.
-    :type request_headers: :class:`dict` mapping :class:`str` to :class:`str`
    :param response_headers:
        A mapping of HTTP header name to HTTP header value. Multiple values may
        be joined with a comma. If a HTTP request was made, these headers
        override any matching headers in the response. Otherwise this specifies
        the entirety of the response headers.
-    :type response_headers: :class:`dict` mapping :class:`str` to :class:`str`
-
-    :param bool resolve_relative_uris:
+    :param resolve_relative_uris:
        Should feedparser attempt to resolve relative URIs absolute ones within
        HTML content?  Defaults to the value of
        :data:`feedparser.RESOLVE_RELATIVE_URIS`, which is ``True``.
-    :param bool sanitize_html:
+    :param sanitize_html:
        Should feedparser skip HTML sanitization? Only disable this if you know
        what you are doing!  Defaults to the value of
        :data:`feedparser.SANITIZE_HTML`, which is ``True``.

-    :return: A :class:`FeedParserDict`.
    """

-    if not agent or sanitize_html is None or resolve_relative_uris is None:
-        import feedparser
+    # Avoid a cyclic import.
    if not agent:
+        import feedparser
        agent = feedparser.USER_AGENT
    if sanitize_html is None:
-        sanitize_html = feedparser.SANITIZE_HTML
+        import feedparser
+        sanitize_html = bool(feedparser.SANITIZE_HTML)
    if resolve_relative_uris is None:
-        resolve_relative_uris = feedparser.RESOLVE_RELATIVE_URIS
+        import feedparser
+        resolve_relative_uris = bool(feedparser.RESOLVE_RELATIVE_URIS)

    result = FeedParserDict(
        bozo=False,
@ -212,7 +222,14 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
        headers={},
    )

+    try:
        data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
+    except urllib.error.URLError as error:
+        result.update({
+            'bozo': True,
+            'bozo_exception': error,
+        })
+        return result

    if not data:
        return result
@ -221,8 +238,10 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
    result['headers'].update(response_headers or {})

    data = convert_to_utf8(result['headers'], data, result)
+    use_json_parser = result['content-type'] == 'application/json'
    use_strict_parser = result['encoding'] and True or False

+    if not use_json_parser:
        result['version'], data, entities = replace_doctype(data)

    # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
@ -235,36 +254,52 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
        baselang = baselang.decode('utf-8', 'ignore')

    if not _XML_AVAILABLE:
-        use_strict_parser = 0
-    if use_strict_parser:
-        # initialize the SAX parser
-        feedparser = StrictFeedParser(baseuri, baselang, 'utf-8')
-        feedparser.resolve_relative_uris = resolve_relative_uris
-        feedparser.sanitize_html = sanitize_html
+        use_strict_parser = False
+    feed_parser: Union[JSONParser, StrictFeedParser, LooseFeedParser]
+    if use_json_parser:
+        result['version'] = None
+        feed_parser = JSONParser(baseuri, baselang, 'utf-8')
+        try:
+            feed_parser.feed(data)
+        except Exception as e:
+            result['bozo'] = 1
+            result['bozo_exception'] = e
+    elif use_strict_parser:
+        # Initialize the SAX parser.
+        feed_parser = StrictFeedParser(baseuri, baselang, 'utf-8')
+        feed_parser.resolve_relative_uris = resolve_relative_uris
+        feed_parser.sanitize_html = sanitize_html
        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        try:
-            # disable downloading external doctype references, if possible
+            # Disable downloading external doctype references, if possible.
            saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
        except xml.sax.SAXNotSupportedException:
            pass
-        saxparser.setContentHandler(feedparser)
-        saxparser.setErrorHandler(feedparser)
+        saxparser.setContentHandler(feed_parser)
+        saxparser.setErrorHandler(feed_parser)
        source = xml.sax.xmlreader.InputSource()
        source.setByteStream(io.BytesIO(data))
        try:
            saxparser.parse(source)
        except xml.sax.SAXException as e:
            result['bozo'] = 1
-            result['bozo_exception'] = feedparser.exc or e
-            use_strict_parser = 0
-    if not use_strict_parser:
-        feedparser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
-        feedparser.resolve_relative_uris = resolve_relative_uris
-        feedparser.sanitize_html = sanitize_html
-        feedparser.feed(data.decode('utf-8', 'replace'))
-    result['feed'] = feedparser.feeddata
-    result['entries'] = feedparser.entries
-    result['version'] = result['version'] or feedparser.version
-    result['namespaces'] = feedparser.namespaces_in_use
+            result['bozo_exception'] = feed_parser.exc or e
+            use_strict_parser = False
+
+    # The loose XML parser will be tried if the JSON parser was not used,
+    # and if the strict XML parser was not used (or if it failed).
+    if not use_json_parser and not use_strict_parser:
+        feed_parser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
+        feed_parser.resolve_relative_uris = resolve_relative_uris
+        feed_parser.sanitize_html = sanitize_html
+        feed_parser.feed(data.decode('utf-8', 'replace'))
+
+    result['feed'] = feed_parser.feeddata
+    result['entries'] = feed_parser.entries
+    result['version'] = result['version'] or feed_parser.version
+    if isinstance(feed_parser, JSONParser):
+        result['namespaces'] = {}
+    else:    
+        result['namespaces'] = feed_parser.namespaces_in_use
    return result
--- a/lib/feedparser/datetimes/__init__.py
+++ b/lib/feedparser/datetimes/__init__.py
@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@ -25,6 +25,8 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.

+from time import struct_time
+from typing import Callable, List, Optional
 from .asctime import _parse_date_asctime
 from .greek import _parse_date_greek
 from .hungarian import _parse_date_hungarian
@ -34,7 +36,7 @@ from .perforce import _parse_date_perforce
 from .rfc822 import _parse_date_rfc822
 from .w3dtf import _parse_date_w3dtf

-_date_handlers = []
+_date_handlers: List[Callable[[str], Optional[struct_time]]] = []


 def registerDateHandler(func):
--- a/lib/feedparser/datetimes/asctime.py
+++ b/lib/feedparser/datetimes/asctime.py
@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
--- a/lib/feedparser/datetimes/greek.py
+++ b/lib/feedparser/datetimes/greek.py
@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
--- a/lib/feedparser/datetimes/hungarian.py
+++ b/lib/feedparser/datetimes/hungarian.py
@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
--- a/lib/feedparser/datetimes/iso8601.py
+++ b/lib/feedparser/datetimes/iso8601.py
@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@ -68,15 +68,7 @@ _iso8601_re = [
    + r'(\.(?P<fracsecond>\d+))?'
    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
    for tmpl in _iso8601_tmpl]
-try:
-    del tmpl
-except NameError:
-    pass
 _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
-try:
-    del regex
-except NameError:
-    pass


 def _parse_date_iso8601(date_string):
--- a/lib/feedparser/datetimes/korean.py
+++ b/lib/feedparser/datetimes/korean.py
@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
--- a/lib/feedparser/datetimes/perforce.py
+++ b/lib/feedparser/datetimes/perforce.py
@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@ -25,7 +25,7 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.

-import email._parseaddr
+import email.utils
 import re
 import time

@ -41,6 +41,6 @@ def _parse_date_perforce(date_string):
    dow, year, month, day, hour, minute, second, tz = m.groups()
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    new_date_string = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
-    tm = email._parseaddr.parsedate_tz(new_date_string)
+    tm = email.utils.parsedate_tz(new_date_string)
    if tm:
-        return time.gmtime(email._parseaddr.mktime_tz(tm))
+        return time.gmtime(email.utils.mktime_tz(tm))
--- a/lib/feedparser/datetimes/rfc822.py
+++ b/lib/feedparser/datetimes/rfc822.py
@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
--- a/lib/feedparser/datetimes/w3dtf.py
+++ b/lib/feedparser/datetimes/w3dtf.py
@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
--- a/lib/feedparser/encodings.py
+++ b/lib/feedparser/encodings.py
@ -1,5 +1,5 @@
 # Character encoding routines
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@ -26,17 +26,16 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.

-import cgi
 import codecs
 import re
+import typing as t

 try:
    try:
-        import cchardet as chardet
+        import cchardet as chardet # type: ignore[import]
    except ImportError:
-        import chardet
+        import chardet # type: ignore[no-redef]
 except ImportError:
-    chardet = None
    lazy_chardet_encoding = None
 else:
    def lazy_chardet_encoding(data):
@ -68,6 +67,30 @@ RE_XML_DECLARATION = re.compile(r'^<\?xml[^>]*?>')
 RE_XML_PI_ENCODING = re.compile(br'^<\?.*encoding=[\'"](.*?)[\'"].*\?>')


+def parse_content_type(line: str) -> t.Tuple[str, str]:
+    """Parse an HTTP Content-Type header.
+
+    The return value will be a tuple of strings:
+    the MIME type, and the value of the "charset" (if any).
+
+    This is a custom replacement for Python's cgi.parse_header().
+    The cgi module will be removed in Python 3.13.
+    """
+
+    chunks = line.split(";")
+    if not chunks:
+        return "", ""
+
+    mime_type = chunks[0].strip()
+    charset_value = ""
+    for chunk in chunks[1:]:
+        key, _, value = chunk.partition("=")
+        if key.strip().lower() == "charset":
+            charset_value = value.strip().strip("\"'")
+
+    return mime_type, charset_value
+
+
 def convert_to_utf8(http_headers, data, result):
    """Detect and convert the character encoding to UTF-8.

@ -156,10 +179,7 @@ def convert_to_utf8(http_headers, data, result):
    try:
        if bom_encoding:
            tempdata = data.decode(bom_encoding).encode('utf-8')
-    except (UnicodeDecodeError, LookupError):
-        # feedparser recognizes UTF-32 encodings that aren't
-        # available in Python 2.4 and 2.5, so it's possible to
-        # encounter a LookupError during decoding.
+    except UnicodeDecodeError:
        xml_encoding_match = None
    else:
        xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
@ -181,15 +201,14 @@ def convert_to_utf8(http_headers, data, result):
    # XML declaration encoding, and HTTP encoding, following the
    # heuristic defined in RFC 3023.
    http_content_type = http_headers.get('content-type') or ''
-    http_content_type, params = cgi.parse_header(http_content_type)
-    http_encoding = params.get('charset', '').replace("'", "")
-    if isinstance(http_encoding, bytes):
-        http_encoding = http_encoding.decode('utf-8', 'ignore')
+    http_content_type, http_encoding = parse_content_type(http_content_type)

    acceptable_content_type = 0
    application_content_types = ('application/xml', 'application/xml-dtd',
                                 'application/xml-external-parsed-entity')
    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
+    json_content_types = ('application/feed+json', 'application/json')
+    json = False
    if (
            http_content_type in application_content_types
            or (
@ -208,6 +227,17 @@ def convert_to_utf8(http_headers, data, result):
    ):
        acceptable_content_type = 1
        rfc3023_encoding = http_encoding or 'us-ascii'
+    elif (
+            http_content_type in json_content_types
+            or (
+                    not http_content_type
+                    and data and data.lstrip()[0] == '{'
+            )
+    ):
+        http_content_type = json_content_types[0]
+        acceptable_content_type = 1
+        json = True
+        rfc3023_encoding = http_encoding or 'utf-8'  # RFC 7159, 8.1.
    elif http_content_type.startswith('text/'):
        rfc3023_encoding = http_encoding or 'us-ascii'
    elif http_headers and 'content-type' not in http_headers:
@ -230,7 +260,7 @@ def convert_to_utf8(http_headers, data, result):

    if http_headers and (not acceptable_content_type):
        if 'content-type' in http_headers:
-            msg = '%s is not an XML media type' % http_headers['content-type']
+            msg = '%s is not an accepted media type' % http_headers['content-type']
        else:
            msg = 'no Content-type specified'
        error = NonXMLContentType(msg)
@ -254,6 +284,7 @@ def convert_to_utf8(http_headers, data, result):
            pass
        else:
            known_encoding = 1
+            if not json:
                # Update the encoding in the opening XML processing instruction.
                new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
                if RE_XML_DECLARATION.search(data):
@ -275,6 +306,7 @@ def convert_to_utf8(http_headers, data, result):
            (rfc3023_encoding, proposed_encoding))
        rfc3023_encoding = proposed_encoding

+    result['content-type'] = http_content_type  # for selecting the parser
    result['encoding'] = rfc3023_encoding
    if error:
        result['bozo'] = True
--- a/lib/feedparser/exceptions.py
+++ b/lib/feedparser/exceptions.py
@ -1,5 +1,5 @@
 # Exceptions used throughout feedparser
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@ -27,7 +27,7 @@
 # POSSIBILITY OF SUCH DAMAGE.

 __all__ = [
-    'ThingsNobodyCaresAboutButMe',
+    'FeedparserError',
    'CharacterEncodingOverride',
    'CharacterEncodingUnknown',
    'NonXMLContentType',
@ -35,19 +35,19 @@ __all__ = [
 ]


-class ThingsNobodyCaresAboutButMe(Exception):
+class FeedparserError(Exception):
    pass


-class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe):
+class CharacterEncodingOverride(FeedparserError):
    pass


-class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe):
+class CharacterEncodingUnknown(FeedparserError):
    pass


-class NonXMLContentType(ThingsNobodyCaresAboutButMe):
+class NonXMLContentType(FeedparserError):
    pass


--- a/lib/feedparser/html.py
+++ b/lib/feedparser/html.py
@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@ -61,7 +61,7 @@ _cp1252 = {
 }


-class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
+class BaseHTMLProcessor(sgmllib.SGMLParser):
    special = re.compile("""[<>'"]""")
    bare_ampersand = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
    elements_no_end_tag = {
@ -91,11 +91,11 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
            self.encoding = encoding
        self._type = _type
        self.pieces = []
-        super(_BaseHTMLProcessor, self).__init__()
+        super().__init__()

    def reset(self):
        self.pieces = []
-        super(_BaseHTMLProcessor, self).reset()
+        super().reset()

    def _shorttag_replace(self, match):
        """
@ -118,23 +118,13 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
        raise NotImplementedError

    # Replace goahead with SGMLParser's goahead() code object.
-    try:
    goahead.__code__ = sgmllib.SGMLParser.goahead.__code__
-    except AttributeError:
-        # Python 2
-        # noinspection PyUnresolvedReferences
-        goahead.func_code = sgmllib.SGMLParser.goahead.func_code

    def __parse_starttag(self, i):
        raise NotImplementedError

    # Replace __parse_starttag with SGMLParser's parse_starttag() code object.
-    try:
    __parse_starttag.__code__ = sgmllib.SGMLParser.parse_starttag.__code__
-    except AttributeError:
-        # Python 2
-        # noinspection PyUnresolvedReferences
-        __parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code

    def parse_starttag(self, i):
        j = self.__parse_starttag(i)
@ -153,8 +143,8 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
        data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
-        super(_BaseHTMLProcessor, self).feed(data)
-        super(_BaseHTMLProcessor, self).close()
+        super().feed(data)
+        super().close()

    @staticmethod
    def normalize_attrs(attrs):
@ -315,8 +305,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
            # self.updatepos(declstartpos, i)
            return None, -1

-    @staticmethod
-    def convert_charref(name):
+    def convert_charref(self, name):
        """
        :type name: str
        :rtype: str
@ -324,8 +313,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):

        return '&#%s;' % name

-    @staticmethod
-    def convert_entityref(name):
+    def convert_entityref(self, name):
        """
        :type name: str
        :rtype: str
@ -349,7 +337,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):

        try:
            return sgmllib.SGMLParser.parse_declaration(self, i)
-        except sgmllib.SGMLParseError:
+        except (AssertionError, sgmllib.SGMLParseError):
            # Escape the doctype declaration and continue parsing.
            self.handle_data('&lt;')
            return i+1
--- a/lib/feedparser/http.py
+++ b/lib/feedparser/http.py
@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@ -44,7 +44,7 @@ from .urls import convert_to_idn
 ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"


-class _FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPRedirectHandler, urllib.request.HTTPDefaultErrorHandler):
+class URLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPRedirectHandler, urllib.request.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, headers):
        # The default implementation just raises HTTPError.
        # Forget that.
@ -53,6 +53,8 @@ class _FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPR

    def http_error_301(self, req, fp, code, msg, hdrs):
        result = urllib.request.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, hdrs)
+        if not result:
+            return fp
        result.status = code
        result.newurl = result.geturl()
        return result
@ -78,7 +80,7 @@ class _FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPR
        host = urllib.parse.urlparse(req.get_full_url())[1]
        if 'Authorization' not in req.headers or 'WWW-Authenticate' not in headers:
            return self.http_error_default(req, fp, code, msg, headers)
-        auth = base64.decodebytes(req.headers['Authorization'].split(' ')[1].encode('utf8'))
+        auth = base64.decodebytes(req.headers['Authorization'].split(' ')[1].encode()).decode()
        user, passw = auth.split(':')
        realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
        self.add_password(realm, host, user, passw)
@ -145,15 +147,26 @@ def get(url, etag=None, modified=None, agent=None, referrer=None, handlers=None,
            if url_pieces.port:
                new_pieces[1] = f'{url_pieces.hostname}:{url_pieces.port}'
            url = urllib.parse.urlunparse(new_pieces)
-            auth = base64.standard_b64encode(f'{url_pieces.username}:{url_pieces.password}').strip()
+            auth = base64.standard_b64encode(f'{url_pieces.username}:{url_pieces.password}'.encode()).decode()

    # iri support
    if not isinstance(url, bytes):
        url = convert_to_idn(url)

+    # Prevent UnicodeEncodeErrors caused by Unicode characters in the path.
+    bits = []
+    for c in url:
+        try:
+            c.encode('ascii')
+        except UnicodeEncodeError:
+            bits.append(urllib.parse.quote(c))
+        else:
+            bits.append(c)
+    url = ''.join(bits)
+
    # try to open with urllib2 (to use optional headers)
    request = _build_urllib2_request(url, agent, ACCEPT_HEADER, etag, modified, referrer, auth, request_headers)
-    opener = urllib.request.build_opener(*tuple(handlers + [_FeedURLHandler()]))
+    opener = urllib.request.build_opener(*tuple(handlers + [URLHandler()]))
    opener.addheaders = []  # RMK - must clear so we only send our custom User-Agent
    f = opener.open(request)
    data = f.read()
@ -203,7 +216,7 @@ def get(url, etag=None, modified=None, agent=None, referrer=None, handlers=None,
        result['href'] = f.url.decode('utf-8', 'ignore')
    else:
        result['href'] = f.url
-    result['status'] = getattr(f, 'status', 200)
+    result['status'] = getattr(f, 'status', None) or 200

    # Stop processing if the server sent HTTP 304 Not Modified.
    if getattr(f, 'code', 0) == 304:
--- a/lib/feedparser/mixin.py
+++ b/lib/feedparser/mixin.py
@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@ -30,16 +30,17 @@ import binascii
 import copy
 import html.entities
 import re
+from typing import Dict
 import xml.sax.saxutils

 from .html import _cp1252
 from .namespaces import _base, cc, dc, georss, itunes, mediarss, psc
-from .sanitizer import _sanitize_html, _HTMLSanitizer
+from .sanitizer import sanitize_html, HTMLSanitizer
 from .util import FeedParserDict
 from .urls import _urljoin, make_safe_absolute_uri, resolve_relative_uris


-class _FeedParserMixin(
+class XMLParserMixin(
        _base.Namespace,
        cc.Namespace,
        dc.Namespace,
@ -118,7 +119,7 @@ class _FeedParserMixin(
        'http://www.w3.org/XML/1998/namespace':                  'xml',
        'http://podlove.org/simple-chapters':                    'psc',
    }
-    _matchnamespaces = {}
+    _matchnamespaces: Dict[str, str] = {}

    can_be_relative_uri = {
        'comments',
@ -170,6 +171,8 @@ class _FeedParserMixin(
        self.entries = []  # list of entry-level data
        self.version = ''  # feed type/version, see SUPPORTED_VERSIONS
        self.namespaces_in_use = {}  # dictionary of namespaces defined by the feed
+        self.resolve_relative_uris = False
+        self.sanitize_html = False

        # the following are used internally to track state;
        # this is really out of control and should be refactored
@ -193,6 +196,7 @@ class _FeedParserMixin(
        self.svgOK = 0
        self.title_depth = -1
        self.depth = 0
+        self.hasContent = 0
        if self.lang:
            self.feeddata['language'] = self.lang.replace('_', '-')

@ -204,7 +208,7 @@ class _FeedParserMixin(
        #         },
        #     }
        self.property_depth_map = {}
-        super(_FeedParserMixin, self).__init__()
+        super(XMLParserMixin, self).__init__()

    def _normalize_attributes(self, kv):
        raise NotImplementedError
@ -506,9 +510,7 @@ class _FeedParserMixin(
        if base64 and self.contentparams.get('base64', 0):
            try:
                output = base64.decodebytes(output.encode('utf8')).decode('utf8')
-            except binascii.Error:
-                pass
-            except binascii.Incomplete:
+            except (binascii.Error, binascii.Incomplete, UnicodeDecodeError):
                pass

        # resolve relative URIs
@ -546,7 +548,7 @@ class _FeedParserMixin(
        # sanitize embedded markup
        if is_htmlish and self.sanitize_html:
            if element in self.can_contain_dangerous_markup:
-                output = _sanitize_html(output, self.encoding, self.contentparams.get('type', 'text/html'))
+                output = sanitize_html(output, self.encoding, self.contentparams.get('type', 'text/html'))

        if self.encoding and isinstance(output, bytes):
            output = output.decode(self.encoding, 'ignore')
@ -648,7 +650,7 @@ class _FeedParserMixin(
            return False

        # all tags must be in a restricted subset of valid HTML tags
-        if any((t for t in re.findall(r'</?(\w+)', s) if t.lower() not in _HTMLSanitizer.acceptable_elements)):
+        if any((t for t in re.findall(r'</?(\w+)', s) if t.lower() not in HTMLSanitizer.acceptable_elements)):
            return False

        # all entities must have been defined as valid HTML entities
@ -744,7 +746,7 @@ class _FeedParserMixin(
            author, email = context.get(key), None
            if not author:
                return
-            emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
+            emailmatch = re.search(r"(([a-zA-Z0-9_.+-]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(]?))(\?subject=\S+)?", author)
            if emailmatch:
                email = emailmatch.group(0)
                # probably a better way to do the following, but it passes
--- a/lib/feedparser/namespaces/_base.py
+++ b/lib/feedparser/namespaces/_base.py
@ -1,5 +1,5 @@
 # Support for the Atom, RSS, RDF, and CDF feed formats
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@ -259,6 +259,7 @@ class Namespace(object):
    def _end_item(self):
        self.pop('item')
        self.inentry = 0
+        self.hasContent = 0
    _end_entry = _end_item

    def _start_language(self, attrs_d):
@ -388,7 +389,7 @@ class Namespace(object):

    def _start_description(self, attrs_d):
        context = self._get_context()
-        if 'summary' in context:
+        if 'summary' in context and not self.hasContent:
            self._summaryKey = 'content'
            self._start_content(attrs_d)
        else:
@ -429,7 +430,7 @@ class Namespace(object):

    def _start_summary(self, attrs_d):
        context = self._get_context()
-        if 'summary' in context:
+        if 'summary' in context and not self.hasContent:
            self._summaryKey = 'content'
            self._start_content(attrs_d)
        else:
@ -466,6 +467,7 @@ class Namespace(object):
        self.sourcedata.clear()

    def _start_content(self, attrs_d):
+        self.hasContent = 1
        self.push_content('content', attrs_d, 'text/plain', 1)
        src = attrs_d.get('src')
        if src:
@ -477,6 +479,7 @@ class Namespace(object):
    _start_xhtml_body = _start_body

    def _start_content_encoded(self, attrs_d):
+        self.hasContent = 1
        self.push_content('content', attrs_d, 'text/html', 1)
    _start_fullitem = _start_content_encoded

--- a/lib/feedparser/namespaces/admin.py
+++ b/lib/feedparser/namespaces/admin.py
@ -1,5 +1,5 @@
 # Support for the administrative elements extension
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
--- a/lib/feedparser/namespaces/cc.py
+++ b/lib/feedparser/namespaces/cc.py
@ -1,5 +1,5 @@
 # Support for the Creative Commons licensing extensions
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
--- a/lib/feedparser/namespaces/dc.py
+++ b/lib/feedparser/namespaces/dc.py
@ -1,5 +1,5 @@
 # Support for the Dublin Core metadata extensions
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
--- a/lib/feedparser/namespaces/georss.py
+++ b/lib/feedparser/namespaces/georss.py
@ -1,5 +1,5 @@
 # Support for the GeoRSS format
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@ -91,6 +91,8 @@ class Namespace(object):
        except ValueError:
            srs_dimension = 2
        context = self._get_context()
+        if 'where' not in context:
+            context['where'] = {}
        context['where']['srsName'] = srs_name
        context['where']['srsDimension'] = srs_dimension

--- a/lib/feedparser/namespaces/itunes.py
+++ b/lib/feedparser/namespaces/itunes.py
@ -1,5 +1,5 @@
 # Support for the iTunes format
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
--- a/lib/feedparser/namespaces/mediarss.py
+++ b/lib/feedparser/namespaces/mediarss.py
@ -1,5 +1,5 @@
 # Support for the Media RSS format
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
--- a/lib/feedparser/namespaces/psc.py
+++ b/lib/feedparser/namespaces/psc.py
@ -1,5 +1,5 @@
 # Support for the Podlove Simple Chapters format
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
--- a/lib/feedparser/parsers/json.py
+++ b/lib/feedparser/parsers/json.py
@ -0,0 +1,133 @@
+# The JSON feed parser
+# Copyright 2017 Beat Bolli
+# All rights reserved.
+#
+# This file is a part of feedparser.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+import json
+
+from ..datetimes import _parse_date
+from ..sanitizer import sanitize_html
+from ..util import FeedParserDict
+
+
+class JSONParser:
+    """Parse a JSON Feed document into feed-level data and a list of entries.
+
+    After :meth:`feed` has run, ``version`` holds the short name of the
+    detected format, ``feeddata`` the feed-level fields, and ``entries``
+    one :class:`FeedParserDict` per item in the document.
+    """
+
+    # Maps the document's ``version`` URL to the name stored in ``self.version``.
+    VERSIONS = {
+        'https://jsonfeed.org/version/1': 'json1',
+        'https://jsonfeed.org/version/1.1': 'json11',
+    }
+    # (key in the JSON document, key written to ``self.feeddata``)
+    FEED_FIELDS = (
+        ('title', 'title'),
+        ('icon', 'image'),
+        ('home_page_url', 'link'),
+        ('description', 'description'),
+    )
+    # (key in a JSON item, key written to the resulting entry)
+    ITEM_FIELDS = (
+        ('title', 'title'),
+        ('id', 'guid'),
+        ('url', 'link'),
+        ('summary', 'summary'),
+        ('external_url', 'source'),
+    )
+
+    def __init__(self, baseuri=None, baselang=None, encoding=None):
+        """Store the parsing context and initialise empty result containers.
+
+        :param baseuri: Base URI of the feed; falls back to ``''``.
+        :param baselang: Language of the feed; falls back to ``None``.
+        :param encoding: Character encoding; falls back to ``'utf-8'``.
+        """
+        self.baseuri = baseuri or ''
+        self.lang = baselang or None
+        self.encoding = encoding or 'utf-8'  # character encoding
+
+        self.version = None
+        self.feeddata = FeedParserDict()
+        self.namespacesInUse = []
+        self.entries = []
+
+    def feed(self, data):
+        """Decode *data* as JSON and populate ``version``, ``feeddata`` and ``entries``.
+
+        :param data: The JSON document, in any form ``json.loads`` accepts.
+        :raises ValueError: If the ``version`` value is not a key of ``VERSIONS``.
+        :raises KeyError: If the document has no ``items`` member.
+
+        Errors raised by ``json.loads`` for malformed input propagate unchanged.
+        """
+        data = json.loads(data)
+
+        v = data.get('version', '')
+        try:
+            self.version = self.VERSIONS[v]
+        except KeyError:
+            raise ValueError("Unrecognized JSONFeed version '%s'" % v)
+
+        for src, dst in self.FEED_FIELDS:
+            if src in data:
+                self.feeddata[dst] = data[src]
+        if 'author' in data:
+            self.parse_author(data['author'], self.feeddata)
+        # TODO: hubs; expired has no RSS equivalent
+
+        self.entries = [self.parse_entry(e) for e in data['items']]
+
+    def parse_entry(self, e):
+        """Convert one JSON item *e* into a :class:`FeedParserDict` entry.
+
+        Only keys present in *e* are copied. ``content_text`` takes
+        precedence over ``content_html`` when both are present.
+        """
+        entry = FeedParserDict()
+        for src, dst in self.ITEM_FIELDS:
+            if src in e:
+                entry[dst] = e[src]
+
+        if 'content_text' in e:
+            entry['content'] = c = FeedParserDict()
+            c['value'] = e['content_text']
+            c['type'] = 'text'
+        elif 'content_html' in e:
+            entry['content'] = c = FeedParserDict()
+            c['value'] = sanitize_html(e['content_html'], self.encoding, 'application/json')
+            c['type'] = 'html'
+
+        if 'date_published' in e:
+            entry['published'] = e['date_published']
+            entry['published_parsed'] = _parse_date(e['date_published'])
+        # JSON Feed names this field ``date_modified``; test for the key that
+        # is actually read so conforming items get ``updated`` and an item
+        # without it cannot trigger a KeyError.
+        if 'date_modified' in e:
+            entry['updated'] = e['date_modified']
+            entry['updated_parsed'] = _parse_date(e['date_modified'])
+
+        if 'tags' in e:
+            entry['category'] = e['tags']
+
+        if 'author' in e:
+            self.parse_author(e['author'], entry)
+
+        if 'attachments' in e:
+            entry['enclosures'] = [self.parse_attachment(a) for a in e['attachments']]
+
+        return entry
+
+    @staticmethod
+    def parse_author(parent, dest):
+        """Copy the author object *parent* into *dest*.
+
+        ``dest['author_detail']`` is always created, even when *parent* has
+        neither ``name`` nor ``url``. A ``mailto:`` URL is stored as
+        ``email`` with the scheme stripped; any other URL is stored as ``href``.
+        """
+        dest['author_detail'] = detail = FeedParserDict()
+        if 'name' in parent:
+            dest['author'] = detail['name'] = parent['name']
+        if 'url' in parent:
+            if parent['url'].startswith('mailto:'):
+                # Drop the 7-character 'mailto:' prefix.
+                detail['email'] = parent['url'][7:]
+            else:
+                detail['href'] = parent['url']
+
+    @staticmethod
+    def parse_attachment(attachment):
+        """Convert one attachment object into an enclosure dict.
+
+        :raises KeyError: If *attachment* lacks ``url`` or ``mime_type``.
+        :return: A :class:`FeedParserDict` with ``href``, ``type`` and,
+            when ``size_in_bytes`` is present, ``length``.
+        """
+        enc = FeedParserDict()
+        enc['href'] = attachment['url']
+        enc['type'] = attachment['mime_type']
+        if 'size_in_bytes' in attachment:
+            enc['length'] = attachment['size_in_bytes']
+        return enc
--- a/lib/feedparser/parsers/loose.py
+++ b/lib/feedparser/parsers/loose.py
@ -1,5 +1,5 @@
 # The loose feed parser that interfaces with an SGML parsing library
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@ -26,7 +26,7 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.

-class _LooseFeedParser(object):
+class LooseXMLParser:
    contentparams = None

    def __init__(self, baseuri=None, baselang=None, encoding=None, entities=None):
@ -34,7 +34,7 @@ class _LooseFeedParser(object):
        self.lang = baselang or None
        self.encoding = encoding or 'utf-8'  # character encoding
        self.entities = entities or {}
-        super(_LooseFeedParser, self).__init__()
+        super().__init__()

    @staticmethod
    def _normalize_attributes(kv):
--- a/lib/feedparser/parsers/strict.py
+++ b/lib/feedparser/parsers/strict.py
@ -1,5 +1,5 @@
 # The strict feed parser that interfaces with an XML parsing library
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@ -29,7 +29,7 @@
 from ..exceptions import UndeclaredNamespace


-class _StrictFeedParser(object):
+class StrictXMLParser:
    def __init__(self, baseuri, baselang, encoding):
        self.bozo = 0
        self.exc = None
@ -37,7 +37,7 @@ class _StrictFeedParser(object):
        self.baseuri = baseuri or ''
        self.lang = baselang
        self.encoding = encoding
-        super(_StrictFeedParser, self).__init__()
+        super(StrictXMLParser, self).__init__()

    @staticmethod
    def _normalize_attributes(kv):
--- a/lib/feedparser/sanitizer.py
+++ b/lib/feedparser/sanitizer.py
@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@ -27,11 +27,11 @@

 import re

-from .html import _BaseHTMLProcessor
+from .html import BaseHTMLProcessor
 from .urls import make_safe_absolute_uri


-class _HTMLSanitizer(_BaseHTMLProcessor):
+class HTMLSanitizer(BaseHTMLProcessor):
    acceptable_elements = {
        'a',
        'abbr',
@ -732,14 +732,14 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
    }

    def __init__(self, encoding=None, _type='application/xhtml+xml'):
-        super(_HTMLSanitizer, self).__init__(encoding, _type)
+        super().__init__(encoding, _type)

        self.unacceptablestack = 0
        self.mathmlOK = 0
        self.svgOK = 0

    def reset(self):
-        super(_HTMLSanitizer, self).reset()
+        super().reset()
        self.unacceptablestack = 0
        self.mathmlOK = 0
        self.svgOK = 0
@ -805,7 +805,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
                if key == 'href':
                    value = make_safe_absolute_uri(value)
                clean_attrs.append((key, value))
-        super(_HTMLSanitizer, self).unknown_starttag(tag, clean_attrs)
+        super().unknown_starttag(tag, clean_attrs)

    def unknown_endtag(self, tag):
        if tag not in self.acceptable_elements:
@ -820,7 +820,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
                    self.svgOK -= 1
            else:
                return
-        super(_HTMLSanitizer, self).unknown_endtag(tag)
+        super().unknown_endtag(tag)

    def handle_pi(self, text):
        pass
@ -830,7 +830,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):

    def handle_data(self, text):
        if not self.unacceptablestack:
-            super(_HTMLSanitizer, self).handle_data(text)
+            super().handle_data(text)

    def sanitize_style(self, style):
        # disallow urls
@ -865,7 +865,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
        return ' '.join(clean)

    def parse_comment(self, i, report=1):
-        ret = super(_HTMLSanitizer, self).parse_comment(i, report)
+        ret = super().parse_comment(i, report)
        if ret >= 0:
            return ret
        # if ret == -1, this may be a malicious attempt to circumvent
@ -877,8 +877,8 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
        return len(self.rawdata)


-def _sanitize_html(html_source, encoding, _type):
-    p = _HTMLSanitizer(encoding, _type)
+def sanitize_html(html_source, encoding, _type):
+    p = HTMLSanitizer(encoding, _type)
    html_source = html_source.replace('<![CDATA[', '&lt;![CDATA[')
    p.feed(html_source)
    data = p.output()
--- a/lib/feedparser/sgml.py
+++ b/lib/feedparser/sgml.py
@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@ -27,7 +27,7 @@

 import re

-import sgmllib
+import sgmllib # type: ignore[import]

 __all__ = [
    'sgmllib',
@ -82,7 +82,7 @@ class _EndBracketRegEx:
        match = self.endbracket.match(target, index)
        if match is not None:
            # Returning a new object in the calling thread's context
-            # resolves a thread-safety.
+            # resolves a thread-safety issue.
            return EndBracketMatch(match)
        return None

--- a/lib/feedparser/urls.py
+++ b/lib/feedparser/urls.py
@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@ -28,7 +28,7 @@
 import re
 import urllib.parse

-from .html import _BaseHTMLProcessor
+from .html import BaseHTMLProcessor

 # If you want feedparser to allow all URL schemes, set this to ()
 # List culled from Python's urlparse documentation at:
@ -103,7 +103,7 @@ def make_safe_absolute_uri(base, rel=None):
    return uri


-class RelativeURIResolver(_BaseHTMLProcessor):
+class RelativeURIResolver(BaseHTMLProcessor):
    relative_uris = {
        ('a', 'href'),
        ('applet', 'codebase'),
@ -137,7 +137,7 @@ class RelativeURIResolver(_BaseHTMLProcessor):
    }

    def __init__(self, baseuri, encoding, _type):
-        _BaseHTMLProcessor.__init__(self, encoding, _type)
+        BaseHTMLProcessor.__init__(self, encoding, _type)
        self.baseuri = baseuri

    def resolve_uri(self, uri):
--- a/lib/feedparser/util.py
+++ b/lib/feedparser/util.py
@ -1,4 +1,4 @@
-# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
+# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@ -48,7 +48,7 @@ class FeedParserDict(dict):
        'tagline_detail': 'subtitle_detail',
    }

-    def __getitem__(self, key):
+    def __getitem__(self, key, _stacklevel=2):
        """
        :return: A :class:`FeedParserDict`.
        """
@ -59,9 +59,8 @@ class FeedParserDict(dict):
            except IndexError:
                raise KeyError("object doesn't have key 'category'")
        elif key == 'enclosures':
-            norel = lambda link: FeedParserDict([(name, value) for (name, value) in link.items() if name != 'rel'])
            return [
-                norel(link)
+                FeedParserDict([(name, value) for (name, value) in link.items() if name != 'rel'])
                for link in dict.__getitem__(self, 'links')
                if link['rel'] == 'enclosure'
            ]
@ -84,6 +83,7 @@ class FeedParserDict(dict):
                    "exist. This fallback will be removed in a future version "
                    "of feedparser.",
                    DeprecationWarning,
+                    stacklevel=_stacklevel,
                )
                return dict.__getitem__(self, 'published')
            return dict.__getitem__(self, 'updated')
@ -99,6 +99,7 @@ class FeedParserDict(dict):
                    "`updated_parsed` doesn't exist. This fallback will be "
                    "removed in a future version of feedparser.",
                    DeprecationWarning,
+                    stacklevel=_stacklevel,
                )
                return dict.__getitem__(self, 'published_parsed')
            return dict.__getitem__(self, 'updated_parsed')
@ -119,7 +120,7 @@ class FeedParserDict(dict):
            # This fix was proposed in issue 328.
            return dict.__contains__(self, key)
        try:
-            self.__getitem__(key)
+            self.__getitem__(key, _stacklevel=3)
        except KeyError:
            return False
        else:
@ -133,7 +134,7 @@ class FeedParserDict(dict):
        """

        try:
-            return self.__getitem__(key)
+            return self.__getitem__(key, _stacklevel=3)
        except KeyError:
            return default

@ -143,17 +144,11 @@ class FeedParserDict(dict):
            key = key[0]
        return dict.__setitem__(self, key, value)

-    def setdefault(self, k, default):
-        if k not in self:
-            self[k] = default
-            return default
-        return self[k]
-
    def __getattr__(self, key):
        # __getattribute__() is called first; this will be called
        # only if an attribute was not already found
        try:
-            return self.__getitem__(key)
+            return self.__getitem__(key, _stacklevel=3)
        except KeyError:
            raise AttributeError("object has no attribute '%s'" % key)