From a65c40083ff4357cff126e3562b51aaaf3b53801 Mon Sep 17 00:00:00 2001 From: JackDandy Date: Fri, 13 Jan 2023 20:16:45 +0000 Subject: [PATCH] =?UTF-8?q?Update=20feedparser=206.0.1=20(98d189fa)=20?= =?UTF-8?q?=E2=86=92=206.0.10=20(5fcb3ae).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGES.md | 1 + lib/feedparser/__init__.py | 4 +- lib/feedparser/api.py | 149 ++++++++++++++++---------- lib/feedparser/datetimes/__init__.py | 6 +- lib/feedparser/datetimes/asctime.py | 2 +- lib/feedparser/datetimes/greek.py | 2 +- lib/feedparser/datetimes/hungarian.py | 2 +- lib/feedparser/datetimes/iso8601.py | 10 +- lib/feedparser/datetimes/korean.py | 2 +- lib/feedparser/datetimes/perforce.py | 8 +- lib/feedparser/datetimes/rfc822.py | 2 +- lib/feedparser/datetimes/w3dtf.py | 2 +- lib/feedparser/encodings.py | 72 +++++++++---- lib/feedparser/exceptions.py | 12 +-- lib/feedparser/html.py | 34 ++---- lib/feedparser/http.py | 25 +++-- lib/feedparser/mixin.py | 24 +++-- lib/feedparser/namespaces/_base.py | 9 +- lib/feedparser/namespaces/admin.py | 2 +- lib/feedparser/namespaces/cc.py | 2 +- lib/feedparser/namespaces/dc.py | 2 +- lib/feedparser/namespaces/georss.py | 4 +- lib/feedparser/namespaces/itunes.py | 2 +- lib/feedparser/namespaces/mediarss.py | 2 +- lib/feedparser/namespaces/psc.py | 2 +- lib/feedparser/parsers/json.py | 133 +++++++++++++++++++++++ lib/feedparser/parsers/loose.py | 6 +- lib/feedparser/parsers/strict.py | 6 +- lib/feedparser/sanitizer.py | 22 ++-- lib/feedparser/sgml.py | 6 +- lib/feedparser/urls.py | 8 +- lib/feedparser/util.py | 21 ++-- 32 files changed, 391 insertions(+), 193 deletions(-) create mode 100644 lib/feedparser/parsers/json.py diff --git a/CHANGES.md b/CHANGES.md index 0a25ef74..62161156 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -7,6 +7,7 @@ * Update Msgpack 1.0.0 (fa7d744) to 1.0.4 (b5acfd5) * Update certifi 2022.09.24 to 2022.12.07 * Update diskcache 5.1.0 (40ce0de) to 5.4.0 (1cb1425) +* Update feedparser 6.0.1 (98d189fa) to 6.0.10 (5fcb3ae) * Update humanize 3.5.0 (b6b0ea5) to 4.0.0 (a1514eb) * Update profilehooks module 1.12.0 (3ee1f60) to 1.12.1 (c3fc078) * Update Rarfile 4.0 (55fe778) to 4.1a1 (8a72967) diff --git a/lib/feedparser/__init__.py b/lib/feedparser/__init__.py index 529340e7..1e8877c0 100644 --- a/lib/feedparser/__init__.py +++ b/lib/feedparser/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -32,7 +32,7 @@ from .util import FeedParserDict __author__ = 'Kurt McKee ' __license__ = 'BSD 2-clause' -__version__ = '6.0.1' +__version__ = '6.0.10' # HTTP "User-Agent" header to send to servers when downloading feeds. # If you are embedding feedparser in a larger application, you should diff --git a/lib/feedparser/api.py b/lib/feedparser/api.py index fbc96f6e..c56237be 100644 --- a/lib/feedparser/api.py +++ b/lib/feedparser/api.py @@ -1,5 +1,5 @@ # The public API for feedparser -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -26,7 +26,11 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
+import datetime import io +import time +from typing import Dict, List, Union +import urllib.error import urllib.parse import xml.sax @@ -34,13 +38,12 @@ import sgmllib3k as sgmllib from .datetimes import registerDateHandler, _parse_date from .encodings import convert_to_utf8 -from .exceptions import * -from .html import _BaseHTMLProcessor +from .html import BaseHTMLProcessor from . import http -from . import mixin -from .mixin import _FeedParserMixin -from .parsers.loose import _LooseFeedParser -from .parsers.strict import _StrictFeedParser +from .mixin import XMLParserMixin +from .parsers.loose import LooseXMLParser +from .parsers.strict import StrictXMLParser +from .parsers.json import JSONParser from .sanitizer import replace_doctype from .urls import convert_to_idn, make_safe_absolute_uri from .util import FeedParserDict @@ -70,6 +73,7 @@ SUPPORTED_VERSIONS = { 'atom10': 'Atom 1.0', 'atom': 'Atom (unknown version)', 'cdf': 'CDF', + 'json1': 'JSON feed 1', } @@ -136,20 +140,25 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h return url_file_stream_or_string -LooseFeedParser = type( - 'LooseFeedParser', - (_LooseFeedParser, _FeedParserMixin, _BaseHTMLProcessor, object), - {}, -) +class LooseFeedParser(LooseXMLParser, XMLParserMixin, BaseHTMLProcessor): + pass -StrictFeedParser = type( - 'StrictFeedParser', - (_StrictFeedParser, _FeedParserMixin, xml.sax.handler.ContentHandler, object), - {}, -) +class StrictFeedParser(StrictXMLParser, XMLParserMixin, xml.sax.handler.ContentHandler): + pass -def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None, resolve_relative_uris=None, sanitize_html=None): +def parse( + url_file_stream_or_string, + etag: str = None, + modified: Union[str, datetime.datetime, time.struct_time] = None, + agent: str = None, + referrer: str = None, + handlers: List = None, + request_headers: Dict[str, str] = None, + response_headers: Dict[str, str] = None, + resolve_relative_uris: bool = None, + sanitize_html: bool = None, +) -> FeedParserDict: """Parse a feed from a URL, file, stream, or string. :param url_file_stream_or_string: @@ -165,45 +174,46 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer When a URL is not passed the feed location to use in relative URL resolution should be passed in the ``Content-Location`` response header (see ``response_headers`` below). - - :param str etag: HTTP ``ETag`` request header. - :param modified: HTTP ``Last-Modified`` request header. - :type modified: :class:`str`, :class:`time.struct_time` 9-tuple, or - :class:`datetime.datetime` - :param str agent: HTTP ``User-Agent`` request header, which defaults to + :param etag: + HTTP ``ETag`` request header. + :param modified: + HTTP ``Last-Modified`` request header. + :param agent: + HTTP ``User-Agent`` request header, which defaults to the value of :data:`feedparser.USER_AGENT`. - :param referrer: HTTP ``Referer`` [sic] request header. + :param referrer: + HTTP ``Referer`` [sic] request header. + :param handlers: + A list of handlers that will be passed to urllib2. :param request_headers: A mapping of HTTP header name to HTTP header value to add to the request, overriding internally generated values. - :type request_headers: :class:`dict` mapping :class:`str` to :class:`str` :param response_headers: A mapping of HTTP header name to HTTP header value. Multiple values may be joined with a comma. 
If an HTTP request was made, these headers override any matching headers in the response. Otherwise this specifies the entirety of the response headers. - :type response_headers: :class:`dict` mapping :class:`str` to :class:`str` - - :param bool resolve_relative_uris: + :param resolve_relative_uris: Should feedparser attempt to resolve relative URIs to absolute ones within HTML content? Defaults to the value of :data:`feedparser.RESOLVE_RELATIVE_URIS`, which is ``True``. - :param bool sanitize_html: + :param sanitize_html: Should feedparser sanitize HTML content? Only disable this if you know what you are doing! Defaults to the value of :data:`feedparser.SANITIZE_HTML`, which is ``True``. - :return: A :class:`FeedParserDict`. """ - if not agent or sanitize_html is None or resolve_relative_uris is None: - import feedparser + # Avoid a cyclic import. if not agent: + import feedparser agent = feedparser.USER_AGENT if sanitize_html is None: - sanitize_html = feedparser.SANITIZE_HTML + import feedparser + sanitize_html = bool(feedparser.SANITIZE_HTML) if resolve_relative_uris is None: - resolve_relative_uris = feedparser.RESOLVE_RELATIVE_URIS + import feedparser + resolve_relative_uris = bool(feedparser.RESOLVE_RELATIVE_URIS) result = FeedParserDict( bozo=False, @@ -212,7 +222,14 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer headers={}, ) - data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result) + try: + data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result) + except urllib.error.URLError as error: + result.update({ + 'bozo': True, + 'bozo_exception': error, + }) + return result if not data: return result @@ -221,9 +238,11 @@ result['headers'].update(response_headers or {}) data = convert_to_utf8(result['headers'], data, result) + use_json_parser = result['content-type'] in ('application/json', 'application/feed+json') use_strict_parser = result['encoding'] and True or False - result['version'], data, entities = replace_doctype(data) + if not use_json_parser: + result['version'], data, entities = replace_doctype(data) # Ensure that baseuri is an absolute URI using an acceptable URI scheme. contentloc = result['headers'].get('content-location', '') @@ -235,36 +254,52 @@ baselang = baselang.decode('utf-8', 'ignore') if not _XML_AVAILABLE: - use_strict_parser = 0 - if use_strict_parser: - # initialize the SAX parser - feedparser = StrictFeedParser(baseuri, baselang, 'utf-8') - feedparser.resolve_relative_uris = resolve_relative_uris - feedparser.sanitize_html = sanitize_html + use_strict_parser = False + feed_parser: Union[JSONParser, StrictFeedParser, LooseFeedParser] + if use_json_parser: + result['version'] = None + feed_parser = JSONParser(baseuri, baselang, 'utf-8') + try: + feed_parser.feed(data) + except Exception as e: + result['bozo'] = 1 + result['bozo_exception'] = e + elif use_strict_parser: + # Initialize the SAX parser.
+ feed_parser = StrictFeedParser(baseuri, baselang, 'utf-8') + feed_parser.resolve_relative_uris = resolve_relative_uris + feed_parser.sanitize_html = sanitize_html saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS) saxparser.setFeature(xml.sax.handler.feature_namespaces, 1) try: - # disable downloading external doctype references, if possible + # Disable downloading external doctype references, if possible. saxparser.setFeature(xml.sax.handler.feature_external_ges, 0) except xml.sax.SAXNotSupportedException: pass - saxparser.setContentHandler(feedparser) - saxparser.setErrorHandler(feedparser) + saxparser.setContentHandler(feed_parser) + saxparser.setErrorHandler(feed_parser) source = xml.sax.xmlreader.InputSource() source.setByteStream(io.BytesIO(data)) try: saxparser.parse(source) except xml.sax.SAXException as e: result['bozo'] = 1 - result['bozo_exception'] = feedparser.exc or e - use_strict_parser = 0 - if not use_strict_parser: - feedparser = LooseFeedParser(baseuri, baselang, 'utf-8', entities) - feedparser.resolve_relative_uris = resolve_relative_uris - feedparser.sanitize_html = sanitize_html - feedparser.feed(data.decode('utf-8', 'replace')) - result['feed'] = feedparser.feeddata - result['entries'] = feedparser.entries - result['version'] = result['version'] or feedparser.version - result['namespaces'] = feedparser.namespaces_in_use + result['bozo_exception'] = feed_parser.exc or e + use_strict_parser = False + + # The loose XML parser will be tried if the JSON parser was not used, + # and if the strict XML parser was not used (or if it failed). + if not use_json_parser and not use_strict_parser: + feed_parser = LooseFeedParser(baseuri, baselang, 'utf-8', entities) + feed_parser.resolve_relative_uris = resolve_relative_uris + feed_parser.sanitize_html = sanitize_html + feed_parser.feed(data.decode('utf-8', 'replace')) + + result['feed'] = feed_parser.feeddata + result['entries'] = feed_parser.entries + result['version'] = result['version'] or feed_parser.version + if isinstance(feed_parser, JSONParser): + result['namespaces'] = {} + else: + result['namespaces'] = feed_parser.namespaces_in_use return result diff --git a/lib/feedparser/datetimes/__init__.py b/lib/feedparser/datetimes/__init__.py index 01b96f4e..9e09ec27 100644 --- a/lib/feedparser/datetimes/__init__.py +++ b/lib/feedparser/datetimes/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -25,6 +25,8 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. +from time import struct_time +from typing import Callable, List, Optional from .asctime import _parse_date_asctime from .greek import _parse_date_greek from .hungarian import _parse_date_hungarian @@ -34,7 +36,7 @@ from .perforce import _parse_date_perforce from .rfc822 import _parse_date_rfc822 from .w3dtf import _parse_date_w3dtf -_date_handlers = [] +_date_handlers: List[Callable[[str], Optional[struct_time]]] = [] def registerDateHandler(func): diff --git a/lib/feedparser/datetimes/asctime.py b/lib/feedparser/datetimes/asctime.py index e4432631..c4b16249 100644 --- a/lib/feedparser/datetimes/asctime.py +++ b/lib/feedparser/datetimes/asctime.py @@ -1,4 +1,4 @@ -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. 
# diff --git a/lib/feedparser/datetimes/greek.py b/lib/feedparser/datetimes/greek.py index 91d0e1da..7f433fed 100644 --- a/lib/feedparser/datetimes/greek.py +++ b/lib/feedparser/datetimes/greek.py @@ -1,4 +1,4 @@ -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # diff --git a/lib/feedparser/datetimes/hungarian.py b/lib/feedparser/datetimes/hungarian.py index 266c0c1f..691a6ebc 100644 --- a/lib/feedparser/datetimes/hungarian.py +++ b/lib/feedparser/datetimes/hungarian.py @@ -1,4 +1,4 @@ -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # diff --git a/lib/feedparser/datetimes/iso8601.py b/lib/feedparser/datetimes/iso8601.py index 65b35b58..3d3b3f96 100644 --- a/lib/feedparser/datetimes/iso8601.py +++ b/lib/feedparser/datetimes/iso8601.py @@ -1,4 +1,4 @@ -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -68,15 +68,7 @@ _iso8601_re = [ + r'(\.(?P<fracsecond>\d+))?' + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?' for tmpl in _iso8601_tmpl] -try: - del tmpl -except NameError: - pass _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re] -try: - del regex -except NameError: - pass def _parse_date_iso8601(date_string): diff --git a/lib/feedparser/datetimes/korean.py b/lib/feedparser/datetimes/korean.py index 1ad638d9..788d4666 100644 --- a/lib/feedparser/datetimes/korean.py +++ b/lib/feedparser/datetimes/korean.py @@ -1,4 +1,4 @@ -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # diff --git a/lib/feedparser/datetimes/perforce.py b/lib/feedparser/datetimes/perforce.py index aac23bbd..d62d722f 100644 --- a/lib/feedparser/datetimes/perforce.py +++ b/lib/feedparser/datetimes/perforce.py @@ -1,4 +1,4 @@ -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -25,7 +25,7 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. -import email._parseaddr +import email.utils import re import time @@ -41,6 +41,6 @@ def _parse_date_perforce(date_string): dow, year, month, day, hour, minute, second, tz = m.groups() months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] new_date_string = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz) - tm = email._parseaddr.parsedate_tz(new_date_string) + tm = email.utils.parsedate_tz(new_date_string) if tm: - return time.gmtime(email._parseaddr.mktime_tz(tm)) + return time.gmtime(email.utils.mktime_tz(tm)) diff --git a/lib/feedparser/datetimes/rfc822.py b/lib/feedparser/datetimes/rfc822.py index 15b95ddb..871e18fd 100644 --- a/lib/feedparser/datetimes/rfc822.py +++ b/lib/feedparser/datetimes/rfc822.py @@ -1,4 +1,4 @@ -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # diff --git a/lib/feedparser/datetimes/w3dtf.py b/lib/feedparser/datetimes/w3dtf.py index 1cadca3c..6fb2c545 100644 --- a/lib/feedparser/datetimes/w3dtf.py +++ b/lib/feedparser/datetimes/w3dtf.py @@ -1,4 +1,4 @@ -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved.
# diff --git a/lib/feedparser/encodings.py b/lib/feedparser/encodings.py index 02dfcbe0..73251fc1 100644 --- a/lib/feedparser/encodings.py +++ b/lib/feedparser/encodings.py @@ -1,5 +1,5 @@ # Character encoding routines -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -26,17 +26,16 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. -import cgi import codecs import re +import typing as t try: try: - import cchardet as chardet + import cchardet as chardet # type: ignore[import] except ImportError: - import chardet + import chardet # type: ignore[no-redef] except ImportError: - chardet = None lazy_chardet_encoding = None else: def lazy_chardet_encoding(data): @@ -68,6 +67,30 @@ RE_XML_DECLARATION = re.compile(r'^<\?xml[^>]*?>') RE_XML_PI_ENCODING = re.compile(br'^<\?.*encoding=[\'"](.*?)[\'"].*\?>') +def parse_content_type(line: str) -> t.Tuple[str, str]: + """Parse an HTTP Content-Type header. + + The return value will be a tuple of strings: + the MIME type, and the value of the "charset" (if any). + + This is a custom replacement for Python's cgi.parse_header(). + The cgi module will be removed in Python 3.13. + """ + + chunks = line.split(";") + if not chunks: + return "", "" + + mime_type = chunks[0].strip() + charset_value = "" + for chunk in chunks[1:]: + key, _, value = chunk.partition("=") + if key.strip().lower() == "charset": + charset_value = value.strip().strip("\"'") + + return mime_type, charset_value + + def convert_to_utf8(http_headers, data, result): """Detect and convert the character encoding to UTF-8. @@ -156,10 +179,7 @@ def convert_to_utf8(http_headers, data, result): try: if bom_encoding: tempdata = data.decode(bom_encoding).encode('utf-8') - except (UnicodeDecodeError, LookupError): - # feedparser recognizes UTF-32 encodings that aren't - # available in Python 2.4 and 2.5, so it's possible to - # encounter a LookupError during decoding. + except UnicodeDecodeError: xml_encoding_match = None else: xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata) @@ -181,15 +201,14 @@ def convert_to_utf8(http_headers, data, result): # XML declaration encoding, and HTTP encoding, following the # heuristic defined in RFC 3023. http_content_type = http_headers.get('content-type') or '' - http_content_type, params = cgi.parse_header(http_content_type) - http_encoding = params.get('charset', '').replace("'", "") - if isinstance(http_encoding, bytes): - http_encoding = http_encoding.decode('utf-8', 'ignore') + http_content_type, http_encoding = parse_content_type(http_content_type) acceptable_content_type = 0 application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity') text_content_types = ('text/xml', 'text/xml-external-parsed-entity') + json_content_types = ('application/feed+json', 'application/json') + json = False if ( http_content_type in application_content_types or ( @@ -208,6 +227,17 @@ ): acceptable_content_type = 1 rfc3023_encoding = http_encoding or 'us-ascii' + elif ( + http_content_type in json_content_types + or ( + not http_content_type + and data and data.lstrip()[:1] == b'{' + ) + ): + http_content_type = json_content_types[0] + acceptable_content_type = 1 + json = True + rfc3023_encoding = http_encoding or 'utf-8' # RFC 7159, 8.1.
elif http_content_type.startswith('text/'): rfc3023_encoding = http_encoding or 'us-ascii' elif http_headers and 'content-type' not in http_headers: @@ -230,7 +260,7 @@ if http_headers and (not acceptable_content_type): if 'content-type' in http_headers: - msg = '%s is not an XML media type' % http_headers['content-type'] + msg = '%s is not an accepted media type' % http_headers['content-type'] else: msg = 'no Content-type specified' error = NonXMLContentType(msg) @@ -254,12 +284,13 @@ pass else: known_encoding = 1 - # Update the encoding in the opening XML processing instruction. - new_declaration = '''<?xml version='1.0' encoding='utf-8'?>''' - if RE_XML_DECLARATION.search(data): - data = RE_XML_DECLARATION.sub(new_declaration, data) - else: - data = new_declaration + '\n' + data + if not json: + # Update the encoding in the opening XML processing instruction. + new_declaration = '''<?xml version='1.0' encoding='utf-8'?>''' + if RE_XML_DECLARATION.search(data): + data = RE_XML_DECLARATION.sub(new_declaration, data) + else: + data = new_declaration + '\n' + data data = data.encode('utf-8') break # if still no luck, give up @@ -275,6 +306,7 @@ (rfc3023_encoding, proposed_encoding)) rfc3023_encoding = proposed_encoding + result['content-type'] = http_content_type # for selecting the parser result['encoding'] = rfc3023_encoding if error: result['bozo'] = True diff --git a/lib/feedparser/exceptions.py b/lib/feedparser/exceptions.py index 7fa28a1b..0ddb0024 100644 --- a/lib/feedparser/exceptions.py +++ b/lib/feedparser/exceptions.py @@ -1,5 +1,5 @@ # Exceptions used throughout feedparser -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -27,7 +27,7 @@ # POSSIBILITY OF SUCH DAMAGE. __all__ = [ - 'ThingsNobodyCaresAboutButMe', + 'FeedparserError', 'CharacterEncodingOverride', 'CharacterEncodingUnknown', 'NonXMLContentType', ] -class ThingsNobodyCaresAboutButMe(Exception): +class FeedparserError(Exception): pass -class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): +class CharacterEncodingOverride(FeedparserError): pass -class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): +class CharacterEncodingUnknown(FeedparserError): pass -class NonXMLContentType(ThingsNobodyCaresAboutButMe): +class NonXMLContentType(FeedparserError): pass diff --git a/lib/feedparser/html.py b/lib/feedparser/html.py index de1c3f6c..48ddb924 100644 --- a/lib/feedparser/html.py +++ b/lib/feedparser/html.py @@ -1,4 +1,4 @@ -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -61,7 +61,7 @@ _cp1252 = { } -class _BaseHTMLProcessor(sgmllib.SGMLParser, object): +class BaseHTMLProcessor(sgmllib.SGMLParser): special = re.compile("""[<>'"]""") bare_ampersand = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)") elements_no_end_tag = { @@ -91,11 +91,11 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object): self.encoding = encoding self._type = _type self.pieces = [] - super(_BaseHTMLProcessor, self).__init__() + super().__init__() def reset(self): self.pieces = [] - super(_BaseHTMLProcessor, self).reset() + super().reset() def _shorttag_replace(self, match): """ @@ -118,23 +118,13 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object): raise NotImplementedError # Replace goahead with SGMLParser's goahead() code object.
- try: - goahead.__code__ = sgmllib.SGMLParser.goahead.__code__ - except AttributeError: - # Python 2 - # noinspection PyUnresolvedReferences - goahead.func_code = sgmllib.SGMLParser.goahead.func_code + goahead.__code__ = sgmllib.SGMLParser.goahead.__code__ def __parse_starttag(self, i): raise NotImplementedError # Replace __parse_starttag with SGMLParser's parse_starttag() code object. - try: - __parse_starttag.__code__ = sgmllib.SGMLParser.parse_starttag.__code__ - except AttributeError: - # Python 2 - # noinspection PyUnresolvedReferences - __parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code + __parse_starttag.__code__ = sgmllib.SGMLParser.parse_starttag.__code__ def parse_starttag(self, i): j = self.__parse_starttag(i) @@ -153,8 +143,8 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object): data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data) data = data.replace(''', "'") data = data.replace('"', '"') - super(_BaseHTMLProcessor, self).feed(data) - super(_BaseHTMLProcessor, self).close() + super().feed(data) + super().close() @staticmethod def normalize_attrs(attrs): @@ -315,8 +305,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object): # self.updatepos(declstartpos, i) return None, -1 - @staticmethod - def convert_charref(name): + def convert_charref(self, name): """ :type name: str :rtype: str @@ -324,8 +313,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object): return '&#%s;' % name - @staticmethod - def convert_entityref(name): + def convert_entityref(self, name): """ :type name: str :rtype: str @@ -349,7 +337,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object): try: return sgmllib.SGMLParser.parse_declaration(self, i) - except sgmllib.SGMLParseError: + except (AssertionError, sgmllib.SGMLParseError): # Escape the doctype declaration and continue parsing. self.handle_data('<') return i+1 diff --git a/lib/feedparser/http.py b/lib/feedparser/http.py index 97d67cbd..a7fee361 100644 --- a/lib/feedparser/http.py +++ b/lib/feedparser/http.py @@ -1,4 +1,4 @@ -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -44,7 +44,7 @@ from .urls import convert_to_idn ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1" -class _FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPRedirectHandler, urllib.request.HTTPDefaultErrorHandler): +class URLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPRedirectHandler, urllib.request.HTTPDefaultErrorHandler): def http_error_default(self, req, fp, code, msg, headers): # The default implementation just raises HTTPError. # Forget that. 
@@ -53,6 +53,8 @@ class _FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPR def http_error_301(self, req, fp, code, msg, hdrs): result = urllib.request.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, hdrs) + if not result: + return fp result.status = code result.newurl = result.geturl() return result @@ -78,7 +80,7 @@ class _FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPR host = urllib.parse.urlparse(req.get_full_url())[1] if 'Authorization' not in req.headers or 'WWW-Authenticate' not in headers: return self.http_error_default(req, fp, code, msg, headers) - auth = base64.decodebytes(req.headers['Authorization'].split(' ')[1].encode('utf8')) + auth = base64.decodebytes(req.headers['Authorization'].split(' ')[1].encode()).decode() user, passw = auth.split(':') realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0] self.add_password(realm, host, user, passw) @@ -145,15 +147,26 @@ def get(url, etag=None, modified=None, agent=None, referrer=None, handlers=None, if url_pieces.port: new_pieces[1] = f'{url_pieces.hostname}:{url_pieces.port}' url = urllib.parse.urlunparse(new_pieces) - auth = base64.standard_b64encode(f'{url_pieces.username}:{url_pieces.password}').strip() + auth = base64.standard_b64encode(f'{url_pieces.username}:{url_pieces.password}'.encode()).decode() # iri support if not isinstance(url, bytes): url = convert_to_idn(url) + # Prevent UnicodeEncodeErrors caused by Unicode characters in the path. + bits = [] + for c in url: + try: + c.encode('ascii') + except UnicodeEncodeError: + bits.append(urllib.parse.quote(c)) + else: + bits.append(c) + url = ''.join(bits) + # try to open with urllib2 (to use optional headers) request = _build_urllib2_request(url, agent, ACCEPT_HEADER, etag, modified, referrer, auth, request_headers) - opener = urllib.request.build_opener(*tuple(handlers + [_FeedURLHandler()])) + opener = urllib.request.build_opener(*tuple(handlers + [URLHandler()])) opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent f = opener.open(request) data = f.read() @@ -203,7 +216,7 @@ def get(url, etag=None, modified=None, agent=None, referrer=None, handlers=None, result['href'] = f.url.decode('utf-8', 'ignore') else: result['href'] = f.url - result['status'] = getattr(f, 'status', 200) + result['status'] = getattr(f, 'status', None) or 200 # Stop processing if the server sent HTTP 304 Not Modified. if getattr(f, 'code', 0) == 304: diff --git a/lib/feedparser/mixin.py b/lib/feedparser/mixin.py index f305015b..8309e723 100644 --- a/lib/feedparser/mixin.py +++ b/lib/feedparser/mixin.py @@ -1,4 +1,4 @@ -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. 
# @@ -30,16 +30,17 @@ import binascii import copy import html.entities import re +from typing import Dict import xml.sax.saxutils from .html import _cp1252 from .namespaces import _base, cc, dc, georss, itunes, mediarss, psc -from .sanitizer import _sanitize_html, _HTMLSanitizer +from .sanitizer import sanitize_html, HTMLSanitizer from .util import FeedParserDict from .urls import _urljoin, make_safe_absolute_uri, resolve_relative_uris -class _FeedParserMixin( +class XMLParserMixin( _base.Namespace, cc.Namespace, dc.Namespace, @@ -118,7 +119,7 @@ 'http://www.w3.org/XML/1998/namespace': 'xml', 'http://podlove.org/simple-chapters': 'psc', } - _matchnamespaces = {} + _matchnamespaces: Dict[str, str] = {} can_be_relative_uri = { 'comments', @@ -170,6 +171,8 @@ self.entries = [] # list of entry-level data self.version = '' # feed type/version, see SUPPORTED_VERSIONS self.namespaces_in_use = {} # dictionary of namespaces defined by the feed + self.resolve_relative_uris = False + self.sanitize_html = False # the following are used internally to track state; # this is really out of control and should be refactored @@ -193,6 +196,7 @@ self.svgOK = 0 self.title_depth = -1 self.depth = 0 + self.hasContent = 0 if self.lang: self.feeddata['language'] = self.lang.replace('_', '-') @@ -204,7 +208,7 @@ # }, # } self.property_depth_map = {} - super(_FeedParserMixin, self).__init__() + super(XMLParserMixin, self).__init__() def _normalize_attributes(self, kv): raise NotImplementedError @@ -506,9 +510,7 @@ if base64 and self.contentparams.get('base64', 0): try: output = base64.decodebytes(output.encode('utf8')).decode('utf8') - except binascii.Error: - pass - except binascii.Incomplete: + except (binascii.Error, binascii.Incomplete, UnicodeDecodeError): pass # resolve relative URIs @@ -546,7 +548,7 @@ # sanitize embedded markup if is_htmlish and self.sanitize_html: if element in self.can_contain_dangerous_markup: - output = _sanitize_html(output, self.encoding, self.contentparams.get('type', 'text/html')) + output = sanitize_html(output, self.encoding, self.contentparams.get('type', 'text/html')) if self.encoding and isinstance(output, bytes): output = output.decode(self.encoding, 'ignore') @@ -648,7 +650,7 @@ return False # all tags must be in a restricted subset of valid HTML tags - if any((t for t in re.findall(r'</?(\w+)', s) if t.lower() not in _HTMLSanitizer.acceptable_elements)): + if any((t for t in re.findall(r'</?(\w+)', s) if t.lower() not in HTMLSanitizer.acceptable_elements)): return False diff --git a/lib/feedparser/namespaces/_base.py b/lib/feedparser/namespaces/_base.py --- a/lib/feedparser/namespaces/_base.py +++ b/lib/feedparser/namespaces/_base.py @@ -1,5 +1,5 @@ # Support for the Atom, RSS, RDF, and CDF feed formats -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved.
# @@ -259,6 +259,7 @@ class Namespace(object): def _end_item(self): self.pop('item') self.inentry = 0 + self.hasContent = 0 _end_entry = _end_item def _start_language(self, attrs_d): @@ -388,7 +389,7 @@ class Namespace(object): def _start_description(self, attrs_d): context = self._get_context() - if 'summary' in context: + if 'summary' in context and not self.hasContent: self._summaryKey = 'content' self._start_content(attrs_d) else: @@ -429,7 +430,7 @@ class Namespace(object): def _start_summary(self, attrs_d): context = self._get_context() - if 'summary' in context: + if 'summary' in context and not self.hasContent: self._summaryKey = 'content' self._start_content(attrs_d) else: @@ -466,6 +467,7 @@ class Namespace(object): self.sourcedata.clear() def _start_content(self, attrs_d): + self.hasContent = 1 self.push_content('content', attrs_d, 'text/plain', 1) src = attrs_d.get('src') if src: @@ -477,6 +479,7 @@ class Namespace(object): _start_xhtml_body = _start_body def _start_content_encoded(self, attrs_d): + self.hasContent = 1 self.push_content('content', attrs_d, 'text/html', 1) _start_fullitem = _start_content_encoded diff --git a/lib/feedparser/namespaces/admin.py b/lib/feedparser/namespaces/admin.py index 2dca7a02..74218348 100644 --- a/lib/feedparser/namespaces/admin.py +++ b/lib/feedparser/namespaces/admin.py @@ -1,5 +1,5 @@ # Support for the administrative elements extension -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # diff --git a/lib/feedparser/namespaces/cc.py b/lib/feedparser/namespaces/cc.py index da1a4cee..6735c5fe 100644 --- a/lib/feedparser/namespaces/cc.py +++ b/lib/feedparser/namespaces/cc.py @@ -1,5 +1,5 @@ # Support for the Creative Commons licensing extensions -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # diff --git a/lib/feedparser/namespaces/dc.py b/lib/feedparser/namespaces/dc.py index feabdd0a..a89221d2 100644 --- a/lib/feedparser/namespaces/dc.py +++ b/lib/feedparser/namespaces/dc.py @@ -1,5 +1,5 @@ # Support for the Dublin Core metadata extensions -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # diff --git a/lib/feedparser/namespaces/georss.py b/lib/feedparser/namespaces/georss.py index 8d8b253b..786a926f 100644 --- a/lib/feedparser/namespaces/georss.py +++ b/lib/feedparser/namespaces/georss.py @@ -1,5 +1,5 @@ # Support for the GeoRSS format -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -91,6 +91,8 @@ class Namespace(object): except ValueError: srs_dimension = 2 context = self._get_context() + if 'where' not in context: + context['where'] = {} context['where']['srsName'] = srs_name context['where']['srsDimension'] = srs_dimension diff --git a/lib/feedparser/namespaces/itunes.py b/lib/feedparser/namespaces/itunes.py index abcfc243..a50a0ea8 100644 --- a/lib/feedparser/namespaces/itunes.py +++ b/lib/feedparser/namespaces/itunes.py @@ -1,5 +1,5 @@ # Support for the iTunes format -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. 
# diff --git a/lib/feedparser/namespaces/mediarss.py b/lib/feedparser/namespaces/mediarss.py index b6374387..2298ad2f 100644 --- a/lib/feedparser/namespaces/mediarss.py +++ b/lib/feedparser/namespaces/mediarss.py @@ -1,5 +1,5 @@ # Support for the Media RSS format -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # diff --git a/lib/feedparser/namespaces/psc.py b/lib/feedparser/namespaces/psc.py index c7f565e0..a440bd68 100644 --- a/lib/feedparser/namespaces/psc.py +++ b/lib/feedparser/namespaces/psc.py @@ -1,5 +1,5 @@ # Support for the Podlove Simple Chapters format -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # diff --git a/lib/feedparser/parsers/json.py b/lib/feedparser/parsers/json.py new file mode 100644 index 00000000..ae43163c --- /dev/null +++ b/lib/feedparser/parsers/json.py @@ -0,0 +1,133 @@ +# The JSON feed parser +# Copyright 2017 Beat Bolli +# All rights reserved. +# +# This file is a part of feedparser. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ +import json + +from ..datetimes import _parse_date +from ..sanitizer import sanitize_html +from ..util import FeedParserDict + + +class JSONParser: + VERSIONS = { + 'https://jsonfeed.org/version/1': 'json1', + 'https://jsonfeed.org/version/1.1': 'json11', + } + FEED_FIELDS = ( + ('title', 'title'), + ('icon', 'image'), + ('home_page_url', 'link'), + ('description', 'description'), + ) + ITEM_FIELDS = ( + ('title', 'title'), + ('id', 'guid'), + ('url', 'link'), + ('summary', 'summary'), + ('external_url', 'source'), + ) + + def __init__(self, baseuri=None, baselang=None, encoding=None): + self.baseuri = baseuri or '' + self.lang = baselang or None + self.encoding = encoding or 'utf-8' # character encoding + + self.version = None + self.feeddata = FeedParserDict() + self.namespacesInUse = [] + self.entries = [] + + def feed(self, data): + data = json.loads(data) + + v = data.get('version', '') + try: + self.version = self.VERSIONS[v] + except KeyError: + raise ValueError("Unrecognized JSONFeed version '%s'" % v) + + for src, dst in self.FEED_FIELDS: + if src in data: + self.feeddata[dst] = data[src] + if 'author' in data: + self.parse_author(data['author'], self.feeddata) + # TODO: hubs; expired has no RSS equivalent + + self.entries = [self.parse_entry(e) for e in data['items']] + + def parse_entry(self, e): + entry = FeedParserDict() + for src, dst in self.ITEM_FIELDS: + if src in e: + entry[dst] = e[src] + + if 'content_text' in e: + entry['content'] = c = FeedParserDict() + c['value'] = e['content_text'] + c['type'] = 'text' + elif 'content_html' in e: + entry['content'] = c = FeedParserDict() + c['value'] = sanitize_html(e['content_html'], self.encoding, 'application/json') + c['type'] = 'html' + + if 'date_published' in e: + entry['published'] = e['date_published'] + entry['published_parsed'] = _parse_date(e['date_published']) + if 'date_modified' in e: + entry['updated'] = e['date_modified'] + entry['updated_parsed'] = _parse_date(e['date_modified']) + + if 'tags' in e: + entry['category'] = e['tags'] + + if 'author' in e: + self.parse_author(e['author'], entry) + + if 'attachments' in e: + entry['enclosures'] = [self.parse_attachment(a) for a in e['attachments']] + + return entry + + @staticmethod + def parse_author(parent, dest): + dest['author_detail'] = detail = FeedParserDict() + if 'name' in parent: + dest['author'] = detail['name'] = parent['name'] + if 'url' in parent: + if parent['url'].startswith('mailto:'): + detail['email'] = parent['url'][7:] + else: + detail['href'] = parent['url'] + + @staticmethod + def parse_attachment(attachment): + enc = FeedParserDict() + enc['href'] = attachment['url'] + enc['type'] = attachment['mime_type'] + if 'size_in_bytes' in attachment: + enc['length'] = attachment['size_in_bytes'] + return enc diff --git a/lib/feedparser/parsers/loose.py b/lib/feedparser/parsers/loose.py index 52467d0f..3f22bfb4 100644 --- a/lib/feedparser/parsers/loose.py +++ b/lib/feedparser/parsers/loose.py @@ -1,5 +1,5 @@ # The loose feed parser that interfaces with an SGML parsing library -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -26,7 +26,7 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE.
-class _LooseFeedParser(object): +class LooseXMLParser: contentparams = None def __init__(self, baseuri=None, baselang=None, encoding=None, entities=None): @@ -34,7 +34,7 @@ class _LooseFeedParser(object): self.lang = baselang or None self.encoding = encoding or 'utf-8' # character encoding self.entities = entities or {} - super(_LooseFeedParser, self).__init__() + super().__init__() @staticmethod def _normalize_attributes(kv): diff --git a/lib/feedparser/parsers/strict.py b/lib/feedparser/parsers/strict.py index 23759f93..7b0386e5 100644 --- a/lib/feedparser/parsers/strict.py +++ b/lib/feedparser/parsers/strict.py @@ -1,5 +1,5 @@ # The strict feed parser that interfaces with an XML parsing library -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -29,7 +29,7 @@ from ..exceptions import UndeclaredNamespace -class _StrictFeedParser(object): +class StrictXMLParser: def __init__(self, baseuri, baselang, encoding): self.bozo = 0 self.exc = None @@ -37,7 +37,7 @@ class _StrictFeedParser(object): self.baseuri = baseuri or '' self.lang = baselang self.encoding = encoding - super(_StrictFeedParser, self).__init__() + super(StrictXMLParser, self).__init__() @staticmethod def _normalize_attributes(kv): diff --git a/lib/feedparser/sanitizer.py b/lib/feedparser/sanitizer.py index 308db7c3..5b729830 100644 --- a/lib/feedparser/sanitizer.py +++ b/lib/feedparser/sanitizer.py @@ -1,4 +1,4 @@ -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -27,11 +27,11 @@ import re -from .html import _BaseHTMLProcessor +from .html import BaseHTMLProcessor from .urls import make_safe_absolute_uri -class _HTMLSanitizer(_BaseHTMLProcessor): +class HTMLSanitizer(BaseHTMLProcessor): acceptable_elements = { 'a', 'abbr', @@ -732,14 +732,14 @@ class _HTMLSanitizer(_BaseHTMLProcessor): } def __init__(self, encoding=None, _type='application/xhtml+xml'): - super(_HTMLSanitizer, self).__init__(encoding, _type) + super().__init__(encoding, _type) self.unacceptablestack = 0 self.mathmlOK = 0 self.svgOK = 0 def reset(self): - super(_HTMLSanitizer, self).reset() + super().reset() self.unacceptablestack = 0 self.mathmlOK = 0 self.svgOK = 0 @@ -805,7 +805,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor): if key == 'href': value = make_safe_absolute_uri(value) clean_attrs.append((key, value)) - super(_HTMLSanitizer, self).unknown_starttag(tag, clean_attrs) + super().unknown_starttag(tag, clean_attrs) def unknown_endtag(self, tag): if tag not in self.acceptable_elements: @@ -820,7 +820,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor): self.svgOK -= 1 else: return - super(_HTMLSanitizer, self).unknown_endtag(tag) + super().unknown_endtag(tag) def handle_pi(self, text): pass @@ -830,7 +830,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor): def handle_data(self, text): if not self.unacceptablestack: - super(_HTMLSanitizer, self).handle_data(text) + super().handle_data(text) def sanitize_style(self, style): # disallow urls @@ -865,7 +865,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor): return ' '.join(clean) def parse_comment(self, i, report=1): - ret = super(_HTMLSanitizer, self).parse_comment(i, report) + ret = super().parse_comment(i, report) if ret >= 0: return ret # if ret == -1, this may be a malicious attempt to circumvent @@ -877,8 +877,8 @@ class _HTMLSanitizer(_BaseHTMLProcessor): return len(self.rawdata) -def _sanitize_html(html_source, encoding, _type): - p = _HTMLSanitizer(encoding, 
_type) +def sanitize_html(html_source, encoding, _type): + p = HTMLSanitizer(encoding, _type) html_source = html_source.replace('<![CDATA[', '&lt;![CDATA[') p.feed(html_source) data = p.output() diff --git a/lib/feedparser/sgml.py b/lib/feedparser/sgml.py --- a/lib/feedparser/sgml.py +++ b/lib/feedparser/sgml.py @@ -1,4 +1,4 @@ -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -27,7 +27,7 @@ import re -import sgmllib +import sgmllib # type: ignore[import] __all__ = [ 'sgmllib', ] @@ -82,7 +82,7 @@ class _EndBracketRegEx: match = self.endbracket.match(target, index) if match is not None: # Returning a new object in the calling thread's context - # resolves a thread-safety. + # resolves a thread-safety issue. return EndBracketMatch(match) return None diff --git a/lib/feedparser/urls.py b/lib/feedparser/urls.py index c27fdd12..623f030a 100644 --- a/lib/feedparser/urls.py +++ b/lib/feedparser/urls.py @@ -1,4 +1,4 @@ -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -28,7 +28,7 @@ import re import urllib.parse -from .html import _BaseHTMLProcessor +from .html import BaseHTMLProcessor # If you want feedparser to allow all URL schemes, set this to () # List culled from Python's urlparse documentation at: @@ -103,7 +103,7 @@ def make_safe_absolute_uri(base, rel=None): return uri -class RelativeURIResolver(_BaseHTMLProcessor): +class RelativeURIResolver(BaseHTMLProcessor): relative_uris = { ('a', 'href'), ('applet', 'codebase'), @@ -137,7 +137,7 @@ } def __init__(self, baseuri, encoding, _type): - _BaseHTMLProcessor.__init__(self, encoding, _type) + BaseHTMLProcessor.__init__(self, encoding, _type) self.baseuri = baseuri def resolve_uri(self, uri): diff --git a/lib/feedparser/util.py b/lib/feedparser/util.py index 5b731307..9e1516cf 100644 --- a/lib/feedparser/util.py +++ b/lib/feedparser/util.py @@ -1,4 +1,4 @@ -# Copyright 2010-2020 Kurt McKee +# Copyright 2010-2022 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -48,7 +48,7 @@ class FeedParserDict(dict): 'tagline_detail': 'subtitle_detail', } - def __getitem__(self, key): + def __getitem__(self, key, _stacklevel=2): """ :return: A :class:`FeedParserDict`. """ @@ -59,9 +59,8 @@ except IndexError: raise KeyError("object doesn't have key 'category'") elif key == 'enclosures': - norel = lambda link: FeedParserDict([(name, value) for (name, value) in link.items() if name != 'rel']) return [ - norel(link) + FeedParserDict([(name, value) for (name, value) in link.items() if name != 'rel']) for link in dict.__getitem__(self, 'links') if link['rel'] == 'enclosure' ] @@ -84,6 +83,7 @@ "exist. This fallback will be removed in a future version " "of feedparser.", DeprecationWarning, + stacklevel=_stacklevel, ) return dict.__getitem__(self, 'published') return dict.__getitem__(self, 'updated') @@ -99,6 +99,7 @@ "`updated_parsed` doesn't exist. This fallback will be " "removed in a future version of feedparser.", DeprecationWarning, + stacklevel=_stacklevel, ) return dict.__getitem__(self, 'published_parsed') return dict.__getitem__(self, 'updated_parsed') @@ -119,7 +120,7 @@ # This fix was proposed in issue 328.
return dict.__contains__(self, key) try: - self.__getitem__(key) + self.__getitem__(key, _stacklevel=3) except KeyError: return False else: @@ -133,7 +134,7 @@ class FeedParserDict(dict): """ try: - return self.__getitem__(key) + return self.__getitem__(key, _stacklevel=3) except KeyError: return default @@ -143,17 +144,11 @@ class FeedParserDict(dict): key = key[0] return dict.__setitem__(self, key, value) - def setdefault(self, k, default): - if k not in self: - self[k] = default - return default - return self[k] - def __getattr__(self, key): # __getattribute__() is called first; this will be called # only if an attribute was not already found try: - return self.__getitem__(key) + return self.__getitem__(key, _stacklevel=3) except KeyError: raise AttributeError("object has no attribute '%s'" % key)
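Note on the JSON Feed support introduced above: encodings.py now tags a payload as JSON either from the Content-Type header or by sniffing a leading '{', and parse() hands such documents to the new JSONParser instead of the SAX/SGML parsers. A minimal sketch of the resulting behavior, assuming the vendored package is importable as `feedparser` (in SickGear it lives under lib/) and that the sniffing path works for string input as the patch intends; the feed below is made up:

    import feedparser

    # A tiny JSON Feed document; field names follow the JSON Feed spec.
    json_feed = '''{
        "version": "https://jsonfeed.org/version/1",
        "title": "Example Feed",
        "home_page_url": "https://example.org/",
        "items": [
            {"id": "1", "url": "https://example.org/first", "content_text": "Hello"}
        ]
    }'''

    result = feedparser.parse(json_feed)
    print(result.version)          # expected: 'json1'
    print(result.feed.title)       # expected: 'Example Feed'
    print(result.entries[0].link)  # expected: 'https://example.org/first'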
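The cgi module is deprecated and scheduled for removal in Python 3.13, so encodings.py replaces cgi.parse_header() with the local parse_content_type() helper shown in the diff. Roughly, it behaves as below; the header values are illustrative only:

    from feedparser.encodings import parse_content_type

    # Splits the MIME type from the charset parameter, dropping surrounding quotes.
    assert parse_content_type('text/xml; charset="utf-8"') == ('text/xml', 'utf-8')
    # Without a charset parameter, the second element is an empty string.
    assert parse_content_type('application/feed+json') == ('application/feed+json', '')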
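parse() now traps urllib.error.URLError from _open_resource() and reports network failures through the usual bozo mechanism instead of raising. A sketch, using a deliberately unresolvable host ('.invalid' is a reserved TLD, so the lookup should always fail):

    import feedparser

    result = feedparser.parse('http://feed.invalid/rss.xml')
    if result.bozo:
        # bozo_exception should be the urllib.error.URLError that was caught.
        print('fetch failed:', result.bozo_exception)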
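exceptions.py renames the base class ThingsNobodyCaresAboutButMe to FeedparserError without touching the concrete subclasses, so existing isinstance() checks against CharacterEncodingOverride and friends keep working. For example (hypothetical URL):

    import feedparser
    from feedparser.exceptions import CharacterEncodingOverride

    result = feedparser.parse('https://example.org/feed.xml')
    if result.bozo and isinstance(result.bozo_exception, CharacterEncodingOverride):
        # The server declared one encoding but the document used another;
        # result.encoding records what feedparser actually decoded with.
        print('recovered with encoding:', result.encoding)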
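util.py threads a _stacklevel argument through FeedParserDict.__getitem__() so that the DeprecationWarning emitted by the updated/published fallback points at the caller's line rather than at feedparser internals (get(), __contains__() and attribute access pass 3, since they add a frame). The fallback behavior itself is unchanged; a sketch:

    import warnings

    from feedparser.util import FeedParserDict

    entry = FeedParserDict({'published': 'Mon, 02 Jan 2023 00:00:00 GMT'})
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        value = entry['updated']  # falls back to 'published' and warns

    print(value)               # the 'published' string
    print(caught[0].category)  # DeprecationWarning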