From 864d8fffac0c0294908a5997b7d8a684ab900c60 Mon Sep 17 00:00:00 2001 From: JackDandy Date: Thu, 13 Apr 2023 08:04:58 +0100 Subject: [PATCH] =?UTF-8?q?Update=20feedparser=206.0.10=20(6d032b8)=20?= =?UTF-8?q?=E2=86=92=206.0.10=20(859ac57).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGES.md | 1 + lib/feedparser/__init__.py | 33 +- lib/feedparser/api.py | 360 +++--- lib/feedparser/datetimes/__init__.py | 5 +- lib/feedparser/datetimes/asctime.py | 43 +- lib/feedparser/datetimes/greek.py | 82 +- lib/feedparser/datetimes/hungarian.py | 44 +- lib/feedparser/datetimes/iso8601.py | 114 +- lib/feedparser/datetimes/korean.py | 59 +- lib/feedparser/datetimes/perforce.py | 25 +- lib/feedparser/datetimes/rfc822.py | 71 +- lib/feedparser/datetimes/w3dtf.py | 58 +- lib/feedparser/encodings.py | 558 ++++++++-- lib/feedparser/exceptions.py | 12 +- lib/feedparser/html.py | 177 +-- lib/feedparser/http.py | 233 +--- lib/feedparser/mixin.py | 598 +++++----- lib/feedparser/namespaces/_base.py | 366 ++++--- lib/feedparser/namespaces/admin.py | 20 +- lib/feedparser/namespaces/cc.py | 39 +- lib/feedparser/namespaces/dc.py | 38 +- lib/feedparser/namespaces/georss.py | 577 ++++++++-- lib/feedparser/namespaces/itunes.py | 46 +- lib/feedparser/namespaces/mediarss.py | 73 +- lib/feedparser/namespaces/psc.py | 26 +- lib/feedparser/parsers/json.py | 102 +- lib/feedparser/parsers/loose.py | 56 +- lib/feedparser/parsers/strict.py | 62 +- lib/feedparser/py.typed | 0 lib/feedparser/sanitizer.py | 1462 +++++++++++++------------ lib/feedparser/sgml.py | 46 +- lib/feedparser/urls.py | 163 +-- lib/feedparser/util.py | 80 +- 33 files changed, 3280 insertions(+), 2349 deletions(-) create mode 100644 lib/feedparser/py.typed diff --git a/CHANGES.md b/CHANGES.md index 9efccead..50c5764c 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,7 @@ * Update attr 22.2.0 (a9960de) to 22.2.0 (683d056) * Update diskcache 5.4.0 (1cb1425) to 5.6.1 (4d30686) +* Update feedparser 6.0.10 (5fcb3ae) to 6.0.10 (6d032b8) * Update filelock 3.9.0 (ce3e891) to 3.11.0 (d3241b9) * Update Msgpack 1.0.4 (b5acfd5) to 1.0.5 (0516c2c) * Update Requests library 2.28.1 (ec553c2) to 2.29.0 (87d63de) diff --git a/lib/feedparser/__init__.py b/lib/feedparser/__init__.py index 1e8877c0..bdf8060c 100644 --- a/lib/feedparser/__init__.py +++ b/lib/feedparser/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -27,12 +27,18 @@ from .api import parse from .datetimes import registerDateHandler -from .exceptions import * +from .exceptions import ( + CharacterEncodingOverride, + CharacterEncodingUnknown, + FeedparserError, + NonXMLContentType, + UndeclaredNamespace, +) from .util import FeedParserDict -__author__ = 'Kurt McKee ' -__license__ = 'BSD 2-clause' -__version__ = '6.0.10' +__author__ = "Kurt McKee " +__license__ = "BSD 2-clause" +__version__ = "6.0.10" # HTTP "User-Agent" header to send to servers when downloading feeds. # If you are embedding feedparser in a larger application, you should @@ -46,3 +52,20 @@ RESOLVE_RELATIVE_URIS = 1 # If you want feedparser to automatically sanitize all potentially unsafe # HTML content, set this to 1. SANITIZE_HTML = 1 + + +# If you want feedparser to use only a prefix of the feed to detect encodings +# (uses less memory), set this to 1. 
+OPTIMISTIC_ENCODING_DETECTION = 1 + + +__all__ = ( + "parse", + "registerDateHandler", + "FeedParserDict", + "FeedparserError", + "CharacterEncodingOverride", + "CharacterEncodingUnknown", + "NonXMLContentType", + "UndeclaredNamespace", +) diff --git a/lib/feedparser/api.py b/lib/feedparser/api.py index c56237be..fe49c56d 100644 --- a/lib/feedparser/api.py +++ b/lib/feedparser/api.py @@ -1,5 +1,5 @@ # The public API for feedparser -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -26,29 +26,23 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. -import datetime import io -import time -from typing import Dict, List, Union import urllib.error import urllib.parse import xml.sax +from typing import IO, Dict, Optional, Union -import sgmllib3k as sgmllib - -from .datetimes import registerDateHandler, _parse_date -from .encodings import convert_to_utf8 -from .html import BaseHTMLProcessor from . import http +from .encodings import MissingEncoding, convert_file_to_utf8 +from .html import BaseHTMLProcessor from .mixin import XMLParserMixin +from .parsers.json import JSONParser from .parsers.loose import LooseXMLParser from .parsers.strict import StrictXMLParser -from .parsers.json import JSONParser from .sanitizer import replace_doctype -from .urls import convert_to_idn, make_safe_absolute_uri +from .urls import make_safe_absolute_uri from .util import FeedParserDict - # List of preferred XML parsers, by SAX driver name. These will be tried first, # but if they're not installed, Python will keep searching through its own list # of pre-installed parsers until it finds one that supports everything we need. @@ -57,27 +51,30 @@ PREFERRED_XML_PARSERS = ["drv_libxml2"] _XML_AVAILABLE = True SUPPORTED_VERSIONS = { - '': 'unknown', - 'rss090': 'RSS 0.90', - 'rss091n': 'RSS 0.91 (Netscape)', - 'rss091u': 'RSS 0.91 (Userland)', - 'rss092': 'RSS 0.92', - 'rss093': 'RSS 0.93', - 'rss094': 'RSS 0.94', - 'rss20': 'RSS 2.0', - 'rss10': 'RSS 1.0', - 'rss': 'RSS (unknown version)', - 'atom01': 'Atom 0.1', - 'atom02': 'Atom 0.2', - 'atom03': 'Atom 0.3', - 'atom10': 'Atom 1.0', - 'atom': 'Atom (unknown version)', - 'cdf': 'CDF', - 'json1': 'JSON feed 1', + "": "unknown", + "rss090": "RSS 0.90", + "rss091n": "RSS 0.91 (Netscape)", + "rss091u": "RSS 0.91 (Userland)", + "rss092": "RSS 0.92", + "rss093": "RSS 0.93", + "rss094": "RSS 0.94", + "rss20": "RSS 2.0", + "rss10": "RSS 1.0", + "rss": "RSS (unknown version)", + "atom01": "Atom 0.1", + "atom02": "Atom 0.2", + "atom03": "Atom 0.3", + "atom10": "Atom 1.0", + "atom": "Atom (unknown version)", + "cdf": "CDF", + "json1": "JSON feed 1", } -def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result): +def _open_resource( + url_file_stream_or_string, + result, +): """URL, filename, or string --> stream This function lets you define parsers that take any input source @@ -86,43 +83,44 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h to have all the basic stdio read methods (read, readline, readlines). Just .close() the object when you're done with it. - If the etag argument is supplied, it will be used as the value of an - If-None-Match request header. - - If the modified argument is supplied, it can be a tuple of 9 integers - (as returned by gmtime() in the standard Python time module) or a date - string in any format supported by feedparser. 
Regardless, it MUST - be in GMT (Greenwich Mean Time). It will be reformatted into an - RFC 1123-compliant date and used as the value of an If-Modified-Since - request header. - - If the agent argument is supplied, it will be used as the value of a - User-Agent request header. - - If the referrer argument is supplied, it will be used as the value of a - Referer[sic] request header. - - If handlers is supplied, it is a list of handlers used to build a - urllib2 opener. - - if request_headers is supplied it is a dictionary of HTTP request headers - that will override the values generated by FeedParser. - - :return: A bytes object. + :return: A seekable, readable file object. """ - if hasattr(url_file_stream_or_string, 'read'): - return url_file_stream_or_string.read() + # Some notes on the history of the implementation of _open_resource(). + # + # parse() might need to go over the feed content twice: + # if the strict parser fails, it tries again with the loose parser. + # + # In 5.2.0, this returned an open file, to be read() by parse(). + # By 6.0.8, this returned bytes directly. + # + # Since #296 (>6.0.8), this once again returns an open file + # (to reduce memory usage, see convert_file_to_utf8() for details). + # However, to accommodate parse() needing the content twice, + # the returned file is guaranteed to be seekable. + # (If the underlying resource is not seekable, + # the content is read and wrapped in a io.BytesIO/StringIO.) - if isinstance(url_file_stream_or_string, str) \ - and urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'): - return http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result) + if callable(getattr(url_file_stream_or_string, "read", None)): + if callable(getattr(url_file_stream_or_string, "seekable", None)): + if url_file_stream_or_string.seekable(): + return url_file_stream_or_string + return _to_in_memory_file(url_file_stream_or_string.read()) + + looks_like_url = isinstance( + url_file_stream_or_string, str + ) and urllib.parse.urlparse(url_file_stream_or_string)[0] in ( + "http", + "https", + ) + if looks_like_url: + data = http.get(url_file_stream_or_string, result) + return io.BytesIO(data) # try to open with native open function (if url_file_stream_or_string is a filename) try: - with open(url_file_stream_or_string, 'rb') as f: - data = f.read() - except (IOError, UnicodeEncodeError, TypeError, ValueError): + return open(url_file_stream_or_string, "rb") + except (OSError, TypeError, ValueError): # if url_file_stream_or_string is a str object that # cannot be converted to the encoding returned by # sys.getfilesystemencoding(), a UnicodeEncodeError @@ -131,33 +129,32 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h # (such as an XML document encoded in UTF-32), TypeError will # be thrown. 
pass - else: - return data - # treat url_file_stream_or_string as string - if not isinstance(url_file_stream_or_string, bytes): - return url_file_stream_or_string.encode('utf-8') - return url_file_stream_or_string + # treat url_file_stream_or_string as bytes/string + return _to_in_memory_file(url_file_stream_or_string) + + +def _to_in_memory_file(data): + if isinstance(data, str): + return io.StringIO(data) + else: + return io.BytesIO(data) class LooseFeedParser(LooseXMLParser, XMLParserMixin, BaseHTMLProcessor): pass + class StrictFeedParser(StrictXMLParser, XMLParserMixin, xml.sax.handler.ContentHandler): pass def parse( - url_file_stream_or_string, - etag: str = None, - modified: Union[str, datetime.datetime, time.struct_time] = None, - agent: str = None, - referrer: str = None, - handlers: List = None, - request_headers: Dict[str, str] = None, - response_headers: Dict[str, str] = None, - resolve_relative_uris: bool = None, - sanitize_html: bool = None, + url_file_stream_or_string, + response_headers: Optional[Dict[str, str]] = None, + resolve_relative_uris: Optional[bool] = None, + sanitize_html: Optional[bool] = None, + optimistic_encoding_detection: Optional[bool] = None, ) -> FeedParserDict: """Parse a feed from a URL, file, stream, or string. @@ -174,20 +171,6 @@ def parse( When a URL is not passed the feed location to use in relative URL resolution should be passed in the ``Content-Location`` response header (see ``response_headers`` below). - :param etag: - HTTP ``ETag`` request header. - :param modified: - HTTP ``Last-Modified`` request header. - :param agent: - HTTP ``User-Agent`` request header, which defaults to - the value of :data:`feedparser.USER_AGENT`. - :param referrer: - HTTP ``Referer`` [sic] request header. - :param handlers: - A list of handlers that will be passed to urllib2. - :param request_headers: - A mapping of HTTP header name to HTTP header value to add to the - request, overriding internally generated values. :param response_headers: A mapping of HTTP header name to HTTP header value. Multiple values may be joined with a comma. If a HTTP request was made, these headers @@ -201,20 +184,14 @@ def parse( Should feedparser skip HTML sanitization? Only disable this if you know what you are doing! Defaults to the value of :data:`feedparser.SANITIZE_HTML`, which is ``True``. + :param optimistic_encoding_detection: + Should feedparser use only a prefix of the feed to detect encodings + (uses less memory, but the wrong encoding may be detected in rare cases). + Defaults to the value of + :data:`feedparser.OPTIMISTIC_ENCODING_DETECTION`, which is ``True``. """ - # Avoid a cyclic import. 
- if not agent: - import feedparser - agent = feedparser.USER_AGENT - if sanitize_html is None: - import feedparser - sanitize_html = bool(feedparser.SANITIZE_HTML) - if resolve_relative_uris is None: - import feedparser - resolve_relative_uris = bool(feedparser.RESOLVE_RELATIVE_URIS) - result = FeedParserDict( bozo=False, entries=[], @@ -223,50 +200,110 @@ def parse( ) try: - data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result) + file = _open_resource( + url_file_stream_or_string, + result, + ) except urllib.error.URLError as error: - result.update({ - 'bozo': True, - 'bozo_exception': error, - }) + result.update( + { + "bozo": True, + "bozo_exception": error, + } + ) return result - if not data: + # at this point, the file is guaranteed to be seekable; + # we read 1 byte/character to see if it's empty and return early + # (this preserves the behavior in 6.0.8) + initial_file_offset = file.tell() + if not file.read(1): return result + file.seek(initial_file_offset) # overwrite existing headers using response_headers - result['headers'].update(response_headers or {}) + result["headers"].update(response_headers or {}) - data = convert_to_utf8(result['headers'], data, result) - use_json_parser = result['content-type'] == 'application/json' - use_strict_parser = result['encoding'] and True or False + try: + _parse_file_inplace( + file, + result, + resolve_relative_uris=resolve_relative_uris, + sanitize_html=sanitize_html, + optimistic_encoding_detection=optimistic_encoding_detection, + ) + finally: + if not hasattr(url_file_stream_or_string, "read"): + # the file does not come from the user, close it + file.close() - if not use_json_parser: - result['version'], data, entities = replace_doctype(data) + return result + + +def _parse_file_inplace( + file: Union[IO[bytes], IO[str]], + result: dict, + *, + resolve_relative_uris: Optional[bool] = None, + sanitize_html: Optional[bool] = None, + optimistic_encoding_detection: Optional[bool] = None, +) -> None: + # Avoid a cyclic import. + import feedparser + + if sanitize_html is None: + sanitize_html = bool(feedparser.SANITIZE_HTML) + if resolve_relative_uris is None: + resolve_relative_uris = bool(feedparser.RESOLVE_RELATIVE_URIS) + if optimistic_encoding_detection is None: + optimistic_encoding_detection = bool(feedparser.OPTIMISTIC_ENCODING_DETECTION) + + stream_factory = convert_file_to_utf8( + result["headers"], file, result, optimistic_encoding_detection + ) + # We're done with file, all access must happen through stream_factory. + del file + + # Some notes about the stream_factory.get_{text,binary}_file() methods: + # + # Calling them a second time will raise io.UnsupportedOperation + # if the underlying file was not seekable. + # + # Calling close() on the returned file is ignored + # (that is, the underlying file is *not* closed), + # because the SAX parser closes the file when done; + # we don't want that, since we might try again with the loose parser. + + use_json_parser = False + if result["content-type"] in {"application/json", "application/feed+json"}: + use_json_parser = True + use_strict_parser = bool(result["encoding"]) + + result["version"], stream_factory.prefix, entities = replace_doctype( + stream_factory.prefix + ) # Ensure that baseuri is an absolute URI using an acceptable URI scheme. 
- contentloc = result['headers'].get('content-location', '') - href = result.get('href', '') - baseuri = make_safe_absolute_uri(href, contentloc) or make_safe_absolute_uri(contentloc) or href + contentloc = result["headers"].get("content-location", "") + href = result.get("href", "") + baseuri = ( + make_safe_absolute_uri(href, contentloc) + or make_safe_absolute_uri(contentloc) + or href + ) - baselang = result['headers'].get('content-language', None) + baselang = result["headers"].get("content-language", None) if isinstance(baselang, bytes) and baselang is not None: - baselang = baselang.decode('utf-8', 'ignore') + baselang = baselang.decode("utf-8", "ignore") if not _XML_AVAILABLE: use_strict_parser = False + feed_parser: Union[JSONParser, StrictFeedParser, LooseFeedParser] - if use_json_parser: - result['version'] = None - feed_parser = JSONParser(baseuri, baselang, 'utf-8') - try: - feed_parser.feed(data) - except Exception as e: - result['bozo'] = 1 - result['bozo_exception'] = e - elif use_strict_parser: + + if use_strict_parser and not use_json_parser: # Initialize the SAX parser. - feed_parser = StrictFeedParser(baseuri, baselang, 'utf-8') + feed_parser = StrictFeedParser(baseuri, baselang, "utf-8") feed_parser.resolve_relative_uris = resolve_relative_uris feed_parser.sanitize_html = sanitize_html saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS) @@ -279,27 +316,62 @@ def parse( saxparser.setContentHandler(feed_parser) saxparser.setErrorHandler(feed_parser) source = xml.sax.xmlreader.InputSource() - source.setByteStream(io.BytesIO(data)) + + # If an encoding was detected, decode the file on the fly; + # otherwise, pass it as-is and let the SAX parser deal with it. + try: + source.setCharacterStream(stream_factory.get_text_file()) + except MissingEncoding: + source.setByteStream(stream_factory.get_binary_file()) + try: saxparser.parse(source) except xml.sax.SAXException as e: - result['bozo'] = 1 - result['bozo_exception'] = feed_parser.exc or e + result["bozo"] = 1 + result["bozo_exception"] = feed_parser.exc or e use_strict_parser = False - # The loose XML parser will be tried if the JSON parser was not used, - # and if the strict XML parser was not used (or if it failed). - if not use_json_parser and not use_strict_parser: - feed_parser = LooseFeedParser(baseuri, baselang, 'utf-8', entities) + # The loose XML parser will be tried if the strict XML parser was not used + # (or if it failed to parse the feed). + if not use_strict_parser and not use_json_parser: + feed_parser = LooseFeedParser(baseuri, baselang, "utf-8", entities) feed_parser.resolve_relative_uris = resolve_relative_uris feed_parser.sanitize_html = sanitize_html - feed_parser.feed(data.decode('utf-8', 'replace')) - result['feed'] = feed_parser.feeddata - result['entries'] = feed_parser.entries - result['version'] = result['version'] or feed_parser.version + # If an encoding was detected, use it; otherwise, assume utf-8 and do your best. + # Will raise io.UnsupportedOperation if the underlying file is not seekable. + data = stream_factory.get_text_file("utf-8", "replace").read() + + # As of 6.0.8, LooseFeedParser.feed() can be called exactly once + # with the entire data (it does some re.sub() and str.replace() on it). + # + # SGMLParser (of which LooseFeedParser is a subclass) + # *can* be fed in a streaming fashion, + # by calling feed() repeatedly with chunks of text. + # + # When/if LooseFeedParser will support being fed chunks, + # replace the read() call above with read(size)/feed() calls in a loop. 
+ + feed_parser.feed(data) + + # If parsing with the loose XML parser resulted in no information, + # flag that the JSON parser should be tried. + if not (feed_parser.entries or feed_parser.feeddata or feed_parser.version): + use_json_parser = True + + if use_json_parser: + result["version"] = None + feed_parser = JSONParser(baseuri, baselang, "utf-8") + try: + feed_parser.feed(stream_factory.get_file()) + except Exception as e: + result["bozo"] = 1 + result["bozo_exception"] = e + + result["feed"] = feed_parser.feeddata + result["entries"] = feed_parser.entries + result["version"] = result["version"] or feed_parser.version if isinstance(feed_parser, JSONParser): - result['namespaces'] = {} - else: - result['namespaces'] = feed_parser.namespaces_in_use - return result + result["namespaces"] = {} + else: + result["namespaces"] = feed_parser.namespaces_in_use diff --git a/lib/feedparser/datetimes/__init__.py b/lib/feedparser/datetimes/__init__.py index 9e09ec27..36b4857d 100644 --- a/lib/feedparser/datetimes/__init__.py +++ b/lib/feedparser/datetimes/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -27,11 +27,12 @@ from time import struct_time from typing import Callable, List, Optional + from .asctime import _parse_date_asctime from .greek import _parse_date_greek from .hungarian import _parse_date_hungarian from .iso8601 import _parse_date_iso8601 -from .korean import _parse_date_onblog, _parse_date_nate +from .korean import _parse_date_nate, _parse_date_onblog from .perforce import _parse_date_perforce from .rfc822 import _parse_date_rfc822 from .w3dtf import _parse_date_w3dtf diff --git a/lib/feedparser/datetimes/asctime.py b/lib/feedparser/datetimes/asctime.py index c4b16249..5b75ca54 100644 --- a/lib/feedparser/datetimes/asctime.py +++ b/lib/feedparser/datetimes/asctime.py @@ -1,4 +1,4 @@ -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -28,18 +28,18 @@ from .rfc822 import _parse_date_rfc822 _months = [ - 'jan', - 'feb', - 'mar', - 'apr', - 'may', - 'jun', - 'jul', - 'aug', - 'sep', - 'oct', - 'nov', - 'dec', + "jan", + "feb", + "mar", + "apr", + "may", + "jun", + "jul", + "aug", + "sep", + "oct", + "nov", + "dec", ] @@ -59,13 +59,22 @@ def _parse_date_asctime(dt): # Insert a GMT timezone, if needed. if len(parts) == 5: - parts.insert(4, '+0000') + parts.insert(4, "+0000") # Exit if there are not six parts. if len(parts) != 6: return None # Reassemble the parts in an RFC822-compatible order and parse them. - return _parse_date_rfc822(' '.join([ - parts[0], parts[2], parts[1], parts[5], parts[3], parts[4], - ])) + return _parse_date_rfc822( + " ".join( + [ + parts[0], + parts[2], + parts[1], + parts[5], + parts[3], + parts[4], + ] + ) + ) diff --git a/lib/feedparser/datetimes/greek.py b/lib/feedparser/datetimes/greek.py index 7f433fed..a6a54da7 100644 --- a/lib/feedparser/datetimes/greek.py +++ b/lib/feedparser/datetimes/greek.py @@ -1,4 +1,4 @@ -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. 
# @@ -31,38 +31,40 @@ from .rfc822 import _parse_date_rfc822 # Unicode strings for Greek date strings _greek_months = { - '\u0399\u03b1\u03bd': 'Jan', # c9e1ed in iso-8859-7 - '\u03a6\u03b5\u03b2': 'Feb', # d6e5e2 in iso-8859-7 - '\u039c\u03ac\u03ce': 'Mar', # ccdcfe in iso-8859-7 - '\u039c\u03b1\u03ce': 'Mar', # cce1fe in iso-8859-7 - '\u0391\u03c0\u03c1': 'Apr', # c1f0f1 in iso-8859-7 - '\u039c\u03ac\u03b9': 'May', # ccdce9 in iso-8859-7 - '\u039c\u03b1\u03ca': 'May', # cce1fa in iso-8859-7 - '\u039c\u03b1\u03b9': 'May', # cce1e9 in iso-8859-7 - '\u0399\u03bf\u03cd\u03bd': 'Jun', # c9effded in iso-8859-7 - '\u0399\u03bf\u03bd': 'Jun', # c9efed in iso-8859-7 - '\u0399\u03bf\u03cd\u03bb': 'Jul', # c9effdeb in iso-8859-7 - '\u0399\u03bf\u03bb': 'Jul', # c9f9eb in iso-8859-7 - '\u0391\u03cd\u03b3': 'Aug', # c1fde3 in iso-8859-7 - '\u0391\u03c5\u03b3': 'Aug', # c1f5e3 in iso-8859-7 - '\u03a3\u03b5\u03c0': 'Sep', # d3e5f0 in iso-8859-7 - '\u039f\u03ba\u03c4': 'Oct', # cfeaf4 in iso-8859-7 - '\u039d\u03bf\u03ad': 'Nov', # cdefdd in iso-8859-7 - '\u039d\u03bf\u03b5': 'Nov', # cdefe5 in iso-8859-7 - '\u0394\u03b5\u03ba': 'Dec', # c4e5ea in iso-8859-7 + "\u0399\u03b1\u03bd": "Jan", # c9e1ed in iso-8859-7 + "\u03a6\u03b5\u03b2": "Feb", # d6e5e2 in iso-8859-7 + "\u039c\u03ac\u03ce": "Mar", # ccdcfe in iso-8859-7 + "\u039c\u03b1\u03ce": "Mar", # cce1fe in iso-8859-7 + "\u0391\u03c0\u03c1": "Apr", # c1f0f1 in iso-8859-7 + "\u039c\u03ac\u03b9": "May", # ccdce9 in iso-8859-7 + "\u039c\u03b1\u03ca": "May", # cce1fa in iso-8859-7 + "\u039c\u03b1\u03b9": "May", # cce1e9 in iso-8859-7 + "\u0399\u03bf\u03cd\u03bd": "Jun", # c9effded in iso-8859-7 + "\u0399\u03bf\u03bd": "Jun", # c9efed in iso-8859-7 + "\u0399\u03bf\u03cd\u03bb": "Jul", # c9effdeb in iso-8859-7 + "\u0399\u03bf\u03bb": "Jul", # c9f9eb in iso-8859-7 + "\u0391\u03cd\u03b3": "Aug", # c1fde3 in iso-8859-7 + "\u0391\u03c5\u03b3": "Aug", # c1f5e3 in iso-8859-7 + "\u03a3\u03b5\u03c0": "Sep", # d3e5f0 in iso-8859-7 + "\u039f\u03ba\u03c4": "Oct", # cfeaf4 in iso-8859-7 + "\u039d\u03bf\u03ad": "Nov", # cdefdd in iso-8859-7 + "\u039d\u03bf\u03b5": "Nov", # cdefe5 in iso-8859-7 + "\u0394\u03b5\u03ba": "Dec", # c4e5ea in iso-8859-7 } _greek_wdays = { - '\u039a\u03c5\u03c1': 'Sun', # caf5f1 in iso-8859-7 - '\u0394\u03b5\u03c5': 'Mon', # c4e5f5 in iso-8859-7 - '\u03a4\u03c1\u03b9': 'Tue', # d4f1e9 in iso-8859-7 - '\u03a4\u03b5\u03c4': 'Wed', # d4e5f4 in iso-8859-7 - '\u03a0\u03b5\u03bc': 'Thu', # d0e5ec in iso-8859-7 - '\u03a0\u03b1\u03c1': 'Fri', # d0e1f1 in iso-8859-7 - '\u03a3\u03b1\u03b2': 'Sat', # d3e1e2 in iso-8859-7 + "\u039a\u03c5\u03c1": "Sun", # caf5f1 in iso-8859-7 + "\u0394\u03b5\u03c5": "Mon", # c4e5f5 in iso-8859-7 + "\u03a4\u03c1\u03b9": "Tue", # d4f1e9 in iso-8859-7 + "\u03a4\u03b5\u03c4": "Wed", # d4e5f4 in iso-8859-7 + "\u03a0\u03b5\u03bc": "Thu", # d0e5ec in iso-8859-7 + "\u03a0\u03b1\u03c1": "Fri", # d0e1f1 in iso-8859-7 + "\u03a3\u03b1\u03b2": "Sat", # d3e1e2 in iso-8859-7 } -_greek_date_format_re = re.compile(r'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)') +_greek_date_format_re = re.compile( + r"([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)" +) def _parse_date_greek(date_string): @@ -72,15 +74,17 @@ def _parse_date_greek(date_string): return wday = _greek_wdays[m.group(1)] month = _greek_months[m.group(3)] - rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \ - { - 'wday': wday, - 'day': m.group(2), - 'month': month, - 
'year': m.group(4), - 'hour': m.group(5), - 'minute': m.group(6), - 'second': m.group(7), - 'zonediff': m.group(8), - } + rfc822date = ( + "%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(offset)s" + % { + "wday": wday, + "day": m.group(2), + "month": month, + "year": m.group(4), + "hour": m.group(5), + "minute": m.group(6), + "second": m.group(7), + "offset": m.group(8), + } + ) return _parse_date_rfc822(rfc822date) diff --git a/lib/feedparser/datetimes/hungarian.py b/lib/feedparser/datetimes/hungarian.py index 691a6ebc..b4fa8436 100644 --- a/lib/feedparser/datetimes/hungarian.py +++ b/lib/feedparser/datetimes/hungarian.py @@ -1,4 +1,4 @@ -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -31,21 +31,23 @@ from .w3dtf import _parse_date_w3dtf # Unicode strings for Hungarian date strings _hungarian_months = { - 'janu\u00e1r': '01', # e1 in iso-8859-2 - 'febru\u00e1ri': '02', # e1 in iso-8859-2 - 'm\u00e1rcius': '03', # e1 in iso-8859-2 - '\u00e1prilis': '04', # e1 in iso-8859-2 - 'm\u00e1ujus': '05', # e1 in iso-8859-2 - 'j\u00fanius': '06', # fa in iso-8859-2 - 'j\u00falius': '07', # fa in iso-8859-2 - 'augusztus': '08', - 'szeptember': '09', - 'okt\u00f3ber': '10', # f3 in iso-8859-2 - 'november': '11', - 'december': '12', + "janu\u00e1r": "01", # e1 in iso-8859-2 + "febru\u00e1ri": "02", # e1 in iso-8859-2 + "m\u00e1rcius": "03", # e1 in iso-8859-2 + "\u00e1prilis": "04", # e1 in iso-8859-2 + "m\u00e1ujus": "05", # e1 in iso-8859-2 + "j\u00fanius": "06", # fa in iso-8859-2 + "j\u00falius": "07", # fa in iso-8859-2 + "augusztus": "08", + "szeptember": "09", + "okt\u00f3ber": "10", # f3 in iso-8859-2 + "november": "11", + "december": "12", } -_hungarian_date_format_re = re.compile(r'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})([+-](\d{,2}:\d{2}))') +_hungarian_date_format_re = re.compile( + r"(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})([+-](\d{,2}:\d{2}))" +) def _parse_date_hungarian(date_string): @@ -56,17 +58,9 @@ def _parse_date_hungarian(date_string): month = _hungarian_months[m.group(2)] day = m.group(3) if len(day) == 1: - day = '0' + day + day = "0" + day hour = m.group(4) if len(hour) == 1: - hour = '0' + hour - w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \ - { - 'year': m.group(1), - 'month': month, - 'day': day, - 'hour': hour, - 'minute': m.group(5), - 'zonediff': m.group(6), - } + hour = "0" + hour + w3dtfdate = f"{m.group(1)}-{month}-{day}T{hour}:{m.group(5)}{m.group(6)}" return _parse_date_w3dtf(w3dtfdate) diff --git a/lib/feedparser/datetimes/iso8601.py b/lib/feedparser/datetimes/iso8601.py index 3d3b3f96..8130cc9b 100644 --- a/lib/feedparser/datetimes/iso8601.py +++ b/lib/feedparser/datetimes/iso8601.py @@ -1,4 +1,4 @@ -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -38,36 +38,36 @@ import time # Please note the order in templates is significant because we need a # greedy match. 
_iso8601_tmpl = [ - 'YYYY-?MM-?DD', - 'YYYY-0MM?-?DD', - 'YYYY-MM', - 'YYYY-?OOO', - 'YY-?MM-?DD', - 'YY-?OOO', - 'YYYY', - '-YY-?MM', - '-OOO', - '-YY', - '--MM-?DD', - '--MM', - '---DD', - 'CC', - '', + "YYYY-?MM-?DD", + "YYYY-0MM?-?DD", + "YYYY-MM", + "YYYY-?OOO", + "YY-?MM-?DD", + "YY-?OOO", + "YYYY", + "-YY-?MM", + "-OOO", + "-YY", + "--MM-?DD", + "--MM", + "---DD", + "CC", + "", ] _iso8601_re = [ - tmpl.replace( - 'YYYY', r'(?P\d{4})').replace( - 'YY', r'(?P\d\d)').replace( - 'MM', r'(?P[01]\d)').replace( - 'DD', r'(?P[0123]\d)').replace( - 'OOO', r'(?P[0123]\d\d)').replace( - 'CC', r'(?P\d\d$)') - + r'(T?(?P\d{2}):(?P\d{2})' - + r'(:(?P\d{2}))?' - + r'(\.(?P\d+))?' - + r'(?P[+-](?P\d{2})(:(?P\d{2}))?|Z)?)?' - for tmpl in _iso8601_tmpl] + tmpl.replace("YYYY", r"(?P\d{4})") + .replace("YY", r"(?P\d\d)") + .replace("MM", r"(?P[01]\d)") + .replace("DD", r"(?P[0123]\d)") + .replace("OOO", r"(?P[0123]\d\d)") + .replace("CC", r"(?P\d\d$)") + + r"(T?(?P\d{2}):(?P\d{2})" + + r"(:(?P\d{2}))?" + + r"(\.(?P\d+))?" + + r"(?P[+-](?P\d{2})(:(?P\d{2}))?|Z)?)?" + for tmpl in _iso8601_tmpl +] _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re] @@ -83,21 +83,21 @@ def _parse_date_iso8601(date_string): if m.span() == (0, 0): return params = m.groupdict() - ordinal = params.get('ordinal', 0) + ordinal = params.get("ordinal", 0) if ordinal: ordinal = int(ordinal) else: ordinal = 0 - year = params.get('year', '--') - if not year or year == '--': + year = params.get("year", "--") + if not year or year == "--": year = time.gmtime()[0] elif len(year) == 2: # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993 year = 100 * int(time.gmtime()[0] / 100) + int(year) else: year = int(year) - month = params.get('month', '-') - if not month or month == '-': + month = params.get("month", "-") + if not month or month == "-": # ordinals are NOT normalized by mktime, we simulate them # by setting month=1, day=ordinal if ordinal: @@ -105,13 +105,14 @@ def _parse_date_iso8601(date_string): else: month = time.gmtime()[1] month = int(month) - day = params.get('day', 0) + day = params.get("day", 0) if not day: # see above if ordinal: day = ordinal - elif params.get('century', 0) or \ - params.get('year', 0) or params.get('month', 0): + elif ( + params.get("century", 0) or params.get("year", 0) or params.get("month", 0) + ): day = 1 else: day = time.gmtime()[2] @@ -119,29 +120,38 @@ def _parse_date_iso8601(date_string): day = int(day) # special case of the century - is the first year of the 21st century # 2000 or 2001 ? The debate goes on... 
- if 'century' in params: - year = (int(params['century']) - 1) * 100 + 1 + if "century" in params: + year = (int(params["century"]) - 1) * 100 + 1 # in ISO 8601 most fields are optional - for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']: + for field in ["hour", "minute", "second", "tzhour", "tzmin"]: if not params.get(field, None): params[field] = 0 - hour = int(params.get('hour', 0)) - minute = int(params.get('minute', 0)) - second = int(float(params.get('second', 0))) + hour = int(params.get("hour", 0)) + minute = int(params.get("minute", 0)) + second = int(float(params.get("second", 0))) # weekday is normalized by mktime(), we can ignore it weekday = 0 daylight_savings_flag = -1 - tm = [year, month, day, hour, minute, second, weekday, - ordinal, daylight_savings_flag] + tm = [ + year, + month, + day, + hour, + minute, + second, + weekday, + ordinal, + daylight_savings_flag, + ] # ISO 8601 time zone adjustments - tz = params.get('tz') - if tz and tz != 'Z': - if tz[0] == '-': - tm[3] += int(params.get('tzhour', 0)) - tm[4] += int(params.get('tzmin', 0)) - elif tz[0] == '+': - tm[3] -= int(params.get('tzhour', 0)) - tm[4] -= int(params.get('tzmin', 0)) + tz = params.get("tz") + if tz and tz != "Z": + if tz[0] == "-": + tm[3] += int(params.get("tzhour", 0)) + tm[4] += int(params.get("tzmin", 0)) + elif tz[0] == "+": + tm[3] -= int(params.get("tzhour", 0)) + tm[4] -= int(params.get("tzmin", 0)) else: return None # Python's time.mktime() is a wrapper around the ANSI C mktime(3c) diff --git a/lib/feedparser/datetimes/korean.py b/lib/feedparser/datetimes/korean.py index 788d4666..4c5494f2 100644 --- a/lib/feedparser/datetimes/korean.py +++ b/lib/feedparser/datetimes/korean.py @@ -1,4 +1,4 @@ -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -30,20 +30,21 @@ import re from .w3dtf import _parse_date_w3dtf # 8-bit date handling routines written by ytrewq1. 
-_korean_year = '\ub144' # b3e2 in euc-kr -_korean_month = '\uc6d4' # bff9 in euc-kr -_korean_day = '\uc77c' # c0cf in euc-kr -_korean_am = '\uc624\uc804' # bfc0 c0fc in euc-kr -_korean_pm = '\uc624\ud6c4' # bfc0 c8c4 in euc-kr +_korean_year = "\ub144" # b3e2 in euc-kr +_korean_month = "\uc6d4" # bff9 in euc-kr +_korean_day = "\uc77c" # c0cf in euc-kr +_korean_am = "\uc624\uc804" # bfc0 c0fc in euc-kr +_korean_pm = "\uc624\ud6c4" # bfc0 c8c4 in euc-kr _korean_onblog_date_re = re.compile( - r'(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' + r"(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})" % (_korean_year, _korean_month, _korean_day) ) _korean_nate_date_re = re.compile( - r'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' - % (_korean_am, _korean_pm)) + r"(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})" + % (_korean_am, _korean_pm) +) def _parse_date_onblog(dateString): @@ -51,10 +52,18 @@ def _parse_date_onblog(dateString): m = _korean_onblog_date_re.match(dateString) if not m: return - w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ - {'year': m.group(1), 'month': m.group(2), 'day': m.group(3), - 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6), - 'zonediff': '+09:00'} + w3dtfdate = ( + "%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s" + % { + "year": m.group(1), + "month": m.group(2), + "day": m.group(3), + "hour": m.group(4), + "minute": m.group(5), + "second": m.group(6), + "zonediff": "+09:00", + } + ) return _parse_date_w3dtf(w3dtfdate) @@ -69,15 +78,17 @@ def _parse_date_nate(dateString): hour += 12 hour = str(hour) if len(hour) == 1: - hour = '0' + hour - w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ - { - 'year': m.group(1), - 'month': m.group(2), - 'day': m.group(3), - 'hour': hour, - 'minute': m.group(6), - 'second': m.group(7), - 'zonediff': '+09:00', - } + hour = "0" + hour + w3dtfdate = ( + "%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s" + % { + "year": m.group(1), + "month": m.group(2), + "day": m.group(3), + "hour": hour, + "minute": m.group(6), + "second": m.group(7), + "zonediff": "+09:00", + } + ) return _parse_date_w3dtf(w3dtfdate) diff --git a/lib/feedparser/datetimes/perforce.py b/lib/feedparser/datetimes/perforce.py index d62d722f..8d75eb5b 100644 --- a/lib/feedparser/datetimes/perforce.py +++ b/lib/feedparser/datetimes/perforce.py @@ -1,4 +1,4 @@ -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. 
# @@ -33,14 +33,31 @@ import time def _parse_date_perforce(date_string): """parse a date in yyyy/mm/dd hh:mm:ss TTT format""" # Fri, 2006/09/15 08:19:53 EDT - _my_date_pattern = re.compile(r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})') + _my_date_pattern = re.compile( + r"(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})" + ) m = _my_date_pattern.search(date_string) if m is None: return None dow, year, month, day, hour, minute, second, tz = m.groups() - months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] - new_date_string = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz) + months = [ + "Jan", + "Feb", + "Mar", + "Apr", + "May", + "Jun", + "Jul", + "Aug", + "Sep", + "Oct", + "Nov", + "Dec", + ] + new_date_string = ( + f"{dow}, {day} {months[int(month) - 1]} {year} {hour}:{minute}:{second} {tz}" + ) tm = email.utils.parsedate_tz(new_date_string) if tm: return time.gmtime(email.utils.mktime_tz(tm)) diff --git a/lib/feedparser/datetimes/rfc822.py b/lib/feedparser/datetimes/rfc822.py index 871e18fd..5f300f1d 100644 --- a/lib/feedparser/datetimes/rfc822.py +++ b/lib/feedparser/datetimes/rfc822.py @@ -1,4 +1,4 @@ -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -28,20 +28,45 @@ import datetime timezone_names = { - 'ut': 0, 'gmt': 0, 'z': 0, - 'adt': -3, 'ast': -4, 'at': -4, - 'edt': -4, 'est': -5, 'et': -5, - 'cdt': -5, 'cst': -6, 'ct': -6, - 'mdt': -6, 'mst': -7, 'mt': -7, - 'pdt': -7, 'pst': -8, 'pt': -8, - 'a': -1, 'n': 1, - 'm': -12, 'y': 12, - 'met': 1, 'mest': 2, + "ut": 0, + "gmt": 0, + "z": 0, + "adt": -3, + "ast": -4, + "at": -4, + "edt": -4, + "est": -5, + "et": -5, + "cdt": -5, + "cst": -6, + "ct": -6, + "mdt": -6, + "mst": -7, + "mt": -7, + "pdt": -7, + "pst": -8, + "pt": -8, + "a": -1, + "n": 1, + "m": -12, + "y": 12, + "met": 1, + "mest": 2, } -day_names = {'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'} +day_names = {"mon", "tue", "wed", "thu", "fri", "sat", "sun"} months = { - 'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, - 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12, + "jan": 1, + "feb": 2, + "mar": 3, + "apr": 4, + "may": 5, + "jun": 6, + "jul": 7, + "aug": 8, + "sep": 9, + "oct": 10, + "nov": 11, + "dec": 12, } @@ -63,7 +88,7 @@ def _parse_date_rfc822(date): parts = date.lower().split() if len(parts) < 5: # Assume that the time and timezone are missing - parts.extend(('00:00:00', '0000')) + parts.extend(("00:00:00", "0000")) # Remove the day name if parts[0][:3] in day_names: parts = parts[1:] @@ -101,26 +126,26 @@ def _parse_date_rfc822(date): year += (1900, 2000)[year < 90] # Handle the time (default to 00:00:00). - time_parts = parts[3].split(':') - time_parts.extend(('0',) * (3 - len(time_parts))) + time_parts = parts[3].split(":") + time_parts.extend(("0",) * (3 - len(time_parts))) try: - (hour, minute, second) = [int(i) for i in time_parts] + (hour, minute, second) = (int(i) for i in time_parts) except ValueError: return None # Handle the timezone information, if any (default to +0000). # Strip 'Etc/' from the timezone. 
- if parts[4].startswith('etc/'): + if parts[4].startswith("etc/"): parts[4] = parts[4][4:] # Normalize timezones that start with 'gmt': # GMT-05:00 => -0500 # GMT => GMT - if parts[4].startswith('gmt'): - parts[4] = ''.join(parts[4][3:].split(':')) or 'gmt' + if parts[4].startswith("gmt"): + parts[4] = "".join(parts[4][3:].split(":")) or "gmt" # Handle timezones like '-0500', '+0500', and 'EST' - if parts[4] and parts[4][0] in ('-', '+'): + if parts[4] and parts[4][0] in ("-", "+"): try: - if ':' in parts[4]: + if ":" in parts[4]: timezone_hours = int(parts[4][1:3]) timezone_minutes = int(parts[4][4:]) else: @@ -128,7 +153,7 @@ def _parse_date_rfc822(date): timezone_minutes = int(parts[4][3:]) except ValueError: return None - if parts[4].startswith('-'): + if parts[4].startswith("-"): timezone_hours *= -1 timezone_minutes *= -1 else: diff --git a/lib/feedparser/datetimes/w3dtf.py b/lib/feedparser/datetimes/w3dtf.py index 6fb2c545..977005ca 100644 --- a/lib/feedparser/datetimes/w3dtf.py +++ b/lib/feedparser/datetimes/w3dtf.py @@ -1,4 +1,4 @@ -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -28,14 +28,28 @@ import datetime timezonenames = { - 'ut': 0, 'gmt': 0, 'z': 0, - 'adt': -3, 'ast': -4, 'at': -4, - 'edt': -4, 'est': -5, 'et': -5, - 'cdt': -5, 'cst': -6, 'ct': -6, - 'mdt': -6, 'mst': -7, 'mt': -7, - 'pdt': -7, 'pst': -8, 'pt': -8, - 'a': -1, 'n': 1, - 'm': -12, 'y': 12, + "ut": 0, + "gmt": 0, + "z": 0, + "adt": -3, + "ast": -4, + "at": -4, + "edt": -4, + "est": -5, + "et": -5, + "cdt": -5, + "cst": -6, + "ct": -6, + "mdt": -6, + "mst": -7, + "mt": -7, + "pdt": -7, + "pst": -8, + "pt": -8, + "a": -1, + "n": 1, + "m": -12, + "y": 12, } # W3 date and time format parser # http://www.w3.org/TR/NOTE-datetime @@ -47,57 +61,57 @@ timezonenames = { def _parse_date_w3dtf(datestr): if not datestr.strip(): return None - parts = datestr.lower().split('t') + parts = datestr.lower().split("t") if len(parts) == 1: # This may be a date only, or may be an MSSQL-style date parts = parts[0].split() if len(parts) == 1: # Treat this as a date only - parts.append('00:00:00z') + parts.append("00:00:00z") elif len(parts) > 2: return None - date = parts[0].split('-', 2) + date = parts[0].split("-", 2) if not date or len(date[0]) != 4: return None # Ensure that `date` has 3 elements. Using '1' sets the default # month to January and the default day to the 1st of the month. - date.extend(['1'] * (3 - len(date))) + date.extend(["1"] * (3 - len(date))) try: - year, month, day = [int(i) for i in date] + year, month, day = (int(i) for i in date) except ValueError: # `date` may have more than 3 elements or may contain # non-integer strings. return None - if parts[1].endswith('z'): + if parts[1].endswith("z"): parts[1] = parts[1][:-1] - parts.append('z') + parts.append("z") # Append the numeric timezone offset, if any, to parts. # If this is an MSSQL-style date then parts[2] already contains # the timezone information, so `append()` will not affect it. # Add 1 to each value so that if `find()` returns -1 it will be # treated as False. - loc = parts[1].find('-') + 1 or parts[1].find('+') + 1 or len(parts[1]) + 1 + loc = parts[1].find("-") + 1 or parts[1].find("+") + 1 or len(parts[1]) + 1 loc = loc - 1 parts.append(parts[1][loc:]) parts[1] = parts[1][:loc] - time = parts[1].split(':', 2) + time = parts[1].split(":", 2) # Ensure that time has 3 elements. Using '0' means that the # minutes and seconds, if missing, will default to 0. 
- time.extend(['0'] * (3 - len(time))) - if parts[2][:1] in ('-', '+'): + time.extend(["0"] * (3 - len(time))) + if parts[2][:1] in ("-", "+"): try: tzhour = int(parts[2][1:3]) tzmin = int(parts[2][4:]) except ValueError: return None - if parts[2].startswith('-'): + if parts[2].startswith("-"): tzhour = tzhour * -1 tzmin = tzmin * -1 else: tzhour = timezonenames.get(parts[2], 0) tzmin = 0 try: - hour, minute, second = [int(float(i)) for i in time] + hour, minute, second = (int(float(i)) for i in time) except ValueError: return None # Create the datetime object and timezone delta objects diff --git a/lib/feedparser/encodings.py b/lib/feedparser/encodings.py index 73251fc1..a7be68ae 100644 --- a/lib/feedparser/encodings.py +++ b/lib/feedparser/encodings.py @@ -1,5 +1,5 @@ # Character encoding routines -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -26,48 +26,53 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. +from __future__ import annotations + import codecs +import io import re -import typing as t +import typing try: try: - import cchardet as chardet # type: ignore[import] + import cchardet as chardet # type: ignore[import] except ImportError: - import chardet # type: ignore[no-redef] + import chardet # type: ignore[no-redef] except ImportError: lazy_chardet_encoding = None else: + def lazy_chardet_encoding(data): - return chardet.detect(data)['encoding'] or '' + return chardet.detect(data)["encoding"] or "" + from .exceptions import ( CharacterEncodingOverride, CharacterEncodingUnknown, + FeedparserError, NonXMLContentType, ) - # Each marker represents some of the characters of the opening XML # processing instruction (' -RE_XML_DECLARATION = re.compile(r'^<\?xml[^>]*?>') +RE_XML_DECLARATION = re.compile(r"^<\?xml[^>]*?>") # Capture the value of the XML processing instruction's encoding attribute. # Example: -RE_XML_PI_ENCODING = re.compile(br'^<\?.*encoding=[\'"](.*?)[\'"].*\?>') +RE_XML_PI_ENCODING = re.compile(rb'^<\?.*encoding=[\'"](.*?)[\'"].*\?>') -def parse_content_type(line: str) -> t.Tuple[str, str]: +def parse_content_type(line: str) -> tuple[str, str]: """Parse an HTTP Content-Type header. The return value will be a tuple of strings: @@ -91,11 +96,10 @@ def parse_content_type(line: str) -> t.Tuple[str, str]: return mime_type, charset_value -def convert_to_utf8(http_headers, data, result): - """Detect and convert the character encoding to UTF-8. - - http_headers is a dictionary - data is a raw string (not Unicode)""" +def convert_to_utf8( + http_headers: dict[str, str], data: bytes, result: dict[str, typing.Any] +) -> bytes: + """Detect and convert the character encoding to UTF-8.""" # This is so much trickier than it sounds, it's not even funny. # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type @@ -134,12 +138,10 @@ def convert_to_utf8(http_headers, data, result): # Of course, none of this guarantees that we will be able to parse the # feed in the declared character encoding (assuming it was declared - # correctly, which many are not). iconv_codec can help a lot; - # you should definitely install it if you can. - # http://cjkpython.i18n.org/ + # correctly, which many are not). - bom_encoding = '' - xml_encoding = '' + bom_encoding = "" + xml_encoding = "" # Look at the first few bytes of the document to guess what # its encoding may be. 
We only need to decode enough of the @@ -149,50 +151,63 @@ def convert_to_utf8(http_headers, data, result): # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info # Check for BOMs first. if data[:4] == codecs.BOM_UTF32_BE: - bom_encoding = 'utf-32be' + bom_encoding = "utf-32be" data = data[4:] elif data[:4] == codecs.BOM_UTF32_LE: - bom_encoding = 'utf-32le' + bom_encoding = "utf-32le" data = data[4:] elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES: - bom_encoding = 'utf-16be' + bom_encoding = "utf-16be" data = data[2:] elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES: - bom_encoding = 'utf-16le' + bom_encoding = "utf-16le" data = data[2:] elif data[:3] == codecs.BOM_UTF8: - bom_encoding = 'utf-8' + bom_encoding = "utf-8" data = data[3:] # Check for the characters '''' - if RE_XML_DECLARATION.search(data): - data = RE_XML_DECLARATION.sub(new_declaration, data) - else: - data = new_declaration + '\n' + data - data = data.encode('utf-8') - break + continue + + known_encoding = True + if not json: + # Update the encoding in the opening XML processing instruction. + new_declaration = """""" + if RE_XML_DECLARATION.search(text): + text = RE_XML_DECLARATION.sub(new_declaration, text) + else: + text = new_declaration + "\n" + text + data = text.encode("utf-8") + break + # if still no luck, give up if not known_encoding: error = CharacterEncodingUnknown( - 'document encoding unknown, I tried ' + - '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' % - (rfc3023_encoding, xml_encoding)) - rfc3023_encoding = '' + "document encoding unknown, I tried " + + "%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked" + % (rfc3023_encoding, xml_encoding) + ) + rfc3023_encoding = "" elif proposed_encoding != rfc3023_encoding: error = CharacterEncodingOverride( - 'document declared as %s, but parsed as %s' % - (rfc3023_encoding, proposed_encoding)) + "document declared as %s, but parsed as %s" + % (rfc3023_encoding, proposed_encoding) + ) rfc3023_encoding = proposed_encoding - result['content-type'] = http_content_type # for selecting the parser - result['encoding'] = rfc3023_encoding + result["content-type"] = http_content_type # for selecting the parser + result["encoding"] = rfc3023_encoding if error: - result['bozo'] = True - result['bozo_exception'] = error + result["bozo"] = True + result["bozo_exception"] = error return data + + +# How much to read from a binary file in order to detect encoding. +# In inital tests, 4k was enough for ~160 mostly-English feeds; +# 64k seems like a safe margin. +CONVERT_FILE_PREFIX_LEN = 2**16 + +# How much to read from a text file, and use as an utf-8 bytes prefix. +# Note that no encoding detection is needed in this case. +CONVERT_FILE_STR_PREFIX_LEN = 2**13 + +CONVERT_FILE_TEST_CHUNK_LEN = 2**16 + + +def convert_file_to_utf8( + http_headers, file, result, optimistic_encoding_detection=True +): + """Like convert_to_utf8(), but for a stream. + + Unlike convert_to_utf8(), do not read the entire file in memory; + instead, return a text stream that decodes it on the fly. + This should consume significantly less memory, + because it avoids (repeatedly) converting the entire file contents + from bytes to str and back. + + To detect the encoding, only a prefix of the file contents is used. + In rare cases, the wrong encoding may be detected for this prefix; + use optimistic_encoding_detection=False to use the entire file contents + (equivalent to a plain convert_to_utf8() call). 
+ + Args: + http_headers (dict): The response headers. + file (IO[bytes] or IO[str]): A read()-able (binary) stream. + result (dict): The result dictionary. + optimistic_encoding_detection (bool): + If true, use only a prefix of the file content to detect encoding. + + Returns: + StreamFactory: a stream factory, with the detected encoding set, if any + + """ + # Currently, this wraps convert_to_utf8(), because the logic is simply + # too complicated to ensure it's re-implemented correctly for a stream. + # That said, it should be possible to change the implementation + # transparently (not sure it's worth it, though). + + # If file is a text stream, we don't need to detect encoding; + # we still need a bytes prefix to run functions on for side effects: + # convert_to_utf8() to sniff / set result['content-type'], and + # replace_doctype() to extract safe_entities. + + if isinstance(file.read(0), str): + prefix = file.read(CONVERT_FILE_STR_PREFIX_LEN).encode("utf-8") + prefix = convert_to_utf8(http_headers, prefix, result) + result["encoding"] = "utf-8" + return StreamFactory(prefix, file, "utf-8") + + if optimistic_encoding_detection: + prefix = convert_file_prefix_to_utf8(http_headers, file, result) + factory = StreamFactory(prefix, file, result.get("encoding")) + + # Before returning factory, ensure the entire file can be decoded; + # if it cannot, fall back to convert_to_utf8(). + # + # Not doing this means feedparser.parse() may raise UnicodeDecodeError + # instead of setting bozo_exception to CharacterEncodingOverride, + # breaking the 6.x API. + + try: + text_file = factory.get_text_file() + except MissingEncoding: + return factory + try: + # read in chunks to limit memory usage + while text_file.read(CONVERT_FILE_TEST_CHUNK_LEN): + pass + except UnicodeDecodeError: + # fall back to convert_to_utf8() + file = factory.get_binary_file() + else: + return factory + + # this shouldn't increase memory usage if file is BytesIO, + # since BytesIO does copy-on-write; https://bugs.python.org/issue22003 + data = convert_to_utf8(http_headers, file.read(), result) + + # note that data *is* the prefix + return StreamFactory(data, io.BytesIO(b""), result.get("encoding")) + + +def convert_file_prefix_to_utf8( + http_headers, + file: typing.IO[bytes], + result, + *, + prefix_len: int = CONVERT_FILE_PREFIX_LEN, + read_to_ascii_len: int = 2**8, +) -> bytes: + """Like convert_to_utf8(), but only use the prefix of a binary file. + + Set result like convert_to_utf8() would. + + Return the updated prefix, as bytes. 
+ + """ + # This is complicated by convert_to_utf8() detecting the wrong encoding + # if we have only part of the bytes that make a code-point: + # + # '😀'.encode('utf-8') -> utf-8 + # '😀'.encode('utf-8')[:-1] -> windows-1252 + bozo + + prefix = file.read(prefix_len - 1) + + # reading up to after an ASCII byte increases + # the likelihood of being on a code point boundary + prefix += read_to_after_ascii_byte(file, read_to_ascii_len) + + # call convert_to_utf8() up to 4 times, + # to make sure we eventually land on a code point boundary + candidates = [] + for attempt in range(4): + byte = file.read(1) + + # we're at the end of the file, and the loop already ran once + if not byte and attempt != 0: + break + + prefix += byte + + fake_result: typing.Any = {} + converted_prefix = convert_to_utf8(http_headers, prefix, fake_result) + + # an encoding was detected successfully, keep it + if not fake_result.get("bozo"): + break + + candidates.append((file.tell(), converted_prefix, fake_result)) + + # no encoding was detected successfully, pick the "best" one + else: + + def key(candidate): + *_, result = candidate + + exc = result.get("bozo_exception") + exc_score = 0 + if isinstance(exc, NonXMLContentType): + exc_score = 20 + elif isinstance(exc, CharacterEncodingOverride): + exc_score = 10 + + return ( + exc_score, + # prefer utf- encodings to anything else + result.get("encoding").startswith("utf-"), + ) + + candidates.sort(key=key) + offset, converted_prefix, fake_result = candidates[-1] + + file.seek(offset) + + result.update(fake_result) + return converted_prefix + + +def read_to_after_ascii_byte(file: typing.IO[bytes], max_len: int) -> bytes: + offset = file.tell() + buffer = b"" + + for _ in range(max_len): + byte = file.read(1) + + # end of file, nothing to do + if not byte: + break + + buffer += byte + + # we stop after a ASCII character + if byte < b"\x80": + break + + # couldn't find an ASCII character, reset the file to the original offset + else: + file.seek(offset) + return b"" + + return buffer + + +class MissingEncoding(io.UnsupportedOperation): + pass + + +class StreamFactory: + + """Decode on the fly a binary stream that *may* have a known encoding. + + If the underlying stream is seekable, it is possible to call + the get_{text,binary}_file() methods more than once. 
+ + """ + + def __init__(self, prefix: bytes, file, encoding=None): + self.prefix = prefix + self.file = ResetFileWrapper(file) + self.encoding = encoding + self.should_reset = False + + def get_text_file(self, fallback_encoding=None, errors="strict"): + encoding = self.encoding or fallback_encoding + if encoding is None: + raise MissingEncoding("cannot create text stream without encoding") + + if isinstance(self.file.read(0), str): + file = PrefixFileWrapper(self.prefix.decode(encoding), self.file) + else: + file = PrefixFileWrapper( + self.prefix.decode("utf-8", errors), + codecs.getreader(encoding)(self.file, errors), + ) + + self.reset() + return file + + def get_binary_file(self): + if isinstance(self.file.read(0), str): + raise io.UnsupportedOperation( + "underlying stream is text, not binary" + ) from None + + file = PrefixFileWrapper(self.prefix, self.file) + + self.reset() + return file + + def get_file(self): + try: + return self.get_text_file() + except MissingEncoding: + return self.get_binary_file() + + def reset(self): + if self.should_reset: + self.file.reset() + self.should_reset = True + + +class ResetFileWrapper: + """Given a seekable file, allow reading its content again + (from the current position) by calling reset(). + + """ + + def __init__(self, file): + self.file = file + try: + self.file_initial_offset = file.tell() + except OSError: + self.file_initial_offset = None + + def read(self, size=-1): + return self.file.read(size) + + def reset(self): + # raises io.UnsupportedOperation if the underlying stream is not seekable + self.file.seek(self.file_initial_offset) + + +class PrefixFileWrapper: + """Stitch a (possibly modified) prefix and a file into a new file object. + + >>> file = io.StringIO('abcdef') + >>> file.read(2) + 'ab' + >>> wrapped = PrefixFileWrapper(file.read(2).upper(), file) + >>> wrapped.read() + 'CDef' + + """ + + def __init__(self, prefix, file): + self.prefix = prefix + self.file = file + self.offset = 0 + + def read(self, size=-1): + buffer = self.file.read(0) + + if self.offset < len(self.prefix): + if size < 0: + chunk = self.prefix + else: + chunk = self.prefix[self.offset : self.offset + size] + size -= len(chunk) + buffer += chunk + self.offset += len(chunk) + + while True: + chunk = self.file.read(size) + if not chunk: + break + buffer += chunk + self.offset += len(chunk) + + if size <= 0: + break + + size -= len(chunk) + + return buffer + + def close(self): + # do not touch the underlying stream + pass diff --git a/lib/feedparser/exceptions.py b/lib/feedparser/exceptions.py index 0ddb0024..49ca2858 100644 --- a/lib/feedparser/exceptions.py +++ b/lib/feedparser/exceptions.py @@ -1,5 +1,5 @@ # Exceptions used throughout feedparser -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -27,11 +27,11 @@ # POSSIBILITY OF SUCH DAMAGE. __all__ = [ - 'FeedparserError', - 'CharacterEncodingOverride', - 'CharacterEncodingUnknown', - 'NonXMLContentType', - 'UndeclaredNamespace', + "FeedparserError", + "CharacterEncodingOverride", + "CharacterEncodingUnknown", + "NonXMLContentType", + "UndeclaredNamespace", ] diff --git a/lib/feedparser/html.py b/lib/feedparser/html.py index 48ddb924..bbb90389 100644 --- a/lib/feedparser/html.py +++ b/lib/feedparser/html.py @@ -1,4 +1,4 @@ -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. 
# @@ -28,36 +28,49 @@ import html.entities import re -import sgmllib3k as sgmllib +# These items must all be imported into this module due to .__code__ replacements. +from .sgml import ( # noqa: F401 + attrfind, + charref, + endbracket, + entityref, + incomplete, + interesting, + sgmllib, + shorttag, + shorttagopen, + starttagopen, + tagfind, +) _cp1252 = { - 128: '\u20ac', # euro sign - 130: '\u201a', # single low-9 quotation mark - 131: '\u0192', # latin small letter f with hook - 132: '\u201e', # double low-9 quotation mark - 133: '\u2026', # horizontal ellipsis - 134: '\u2020', # dagger - 135: '\u2021', # double dagger - 136: '\u02c6', # modifier letter circumflex accent - 137: '\u2030', # per mille sign - 138: '\u0160', # latin capital letter s with caron - 139: '\u2039', # single left-pointing angle quotation mark - 140: '\u0152', # latin capital ligature oe - 142: '\u017d', # latin capital letter z with caron - 145: '\u2018', # left single quotation mark - 146: '\u2019', # right single quotation mark - 147: '\u201c', # left double quotation mark - 148: '\u201d', # right double quotation mark - 149: '\u2022', # bullet - 150: '\u2013', # en dash - 151: '\u2014', # em dash - 152: '\u02dc', # small tilde - 153: '\u2122', # trade mark sign - 154: '\u0161', # latin small letter s with caron - 155: '\u203a', # single right-pointing angle quotation mark - 156: '\u0153', # latin small ligature oe - 158: '\u017e', # latin small letter z with caron - 159: '\u0178', # latin capital letter y with diaeresis + 128: "\u20ac", # euro sign + 130: "\u201a", # single low-9 quotation mark + 131: "\u0192", # latin small letter f with hook + 132: "\u201e", # double low-9 quotation mark + 133: "\u2026", # horizontal ellipsis + 134: "\u2020", # dagger + 135: "\u2021", # double dagger + 136: "\u02c6", # modifier letter circumflex accent + 137: "\u2030", # per mille sign + 138: "\u0160", # latin capital letter s with caron + 139: "\u2039", # single left-pointing angle quotation mark + 140: "\u0152", # latin capital ligature oe + 142: "\u017d", # latin capital letter z with caron + 145: "\u2018", # left single quotation mark + 146: "\u2019", # right single quotation mark + 147: "\u201c", # left double quotation mark + 148: "\u201d", # right double quotation mark + 149: "\u2022", # bullet + 150: "\u2013", # en dash + 151: "\u2014", # em dash + 152: "\u02dc", # small tilde + 153: "\u2122", # trade mark sign + 154: "\u0161", # latin small letter s with caron + 155: "\u203a", # single right-pointing angle quotation mark + 156: "\u0153", # latin small ligature oe + 158: "\u017e", # latin small letter z with caron + 159: "\u0178", # latin capital letter y with diaeresis } @@ -65,28 +78,28 @@ class BaseHTMLProcessor(sgmllib.SGMLParser): special = re.compile("""[<>'"]""") bare_ampersand = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)") elements_no_end_tag = { - 'area', - 'base', - 'basefont', - 'br', - 'col', - 'command', - 'embed', - 'frame', - 'hr', - 'img', - 'input', - 'isindex', - 'keygen', - 'link', - 'meta', - 'param', - 'source', - 'track', - 'wbr', + "area", + "base", + "basefont", + "br", + "col", + "command", + "embed", + "frame", + "hr", + "img", + "input", + "isindex", + "keygen", + "link", + "meta", + "param", + "source", + "track", + "wbr", } - def __init__(self, encoding=None, _type='application/xhtml+xml'): + def __init__(self, encoding=None, _type="application/xhtml+xml"): if encoding: self.encoding = encoding self._type = _type @@ -105,9 +118,9 @@ class BaseHTMLProcessor(sgmllib.SGMLParser): tag = 
match.group(1) if tag in self.elements_no_end_tag: - return '<' + tag + ' />' + return "<" + tag + " />" else: - return '<' + tag + '>' + return "<" + tag + ">" # By declaring these methods and overriding their compiled code # with the code from sgmllib, the original code will execute in @@ -128,8 +141,8 @@ class BaseHTMLProcessor(sgmllib.SGMLParser): def parse_starttag(self, i): j = self.__parse_starttag(i) - if self._type == 'application/xhtml+xml': - if j > 2 and self.rawdata[j-2:j] == '/>': + if self._type == "application/xhtml+xml": + if j > 2 and self.rawdata[j - 2 : j] == "/>": self.unknown_endtag(self.lasttag) return j @@ -139,10 +152,10 @@ class BaseHTMLProcessor(sgmllib.SGMLParser): :rtype: None """ - data = re.sub(r'\s]+?)\s*/>', self._shorttag_replace, data) - data = data.replace(''', "'") - data = data.replace('"', '"') + data = re.sub(r"\s]+?)\s*/>", self._shorttag_replace, data) + data = data.replace("'", "'") + data = data.replace(""", '"') super().feed(data) super().close() @@ -160,8 +173,7 @@ class BaseHTMLProcessor(sgmllib.SGMLParser): # *attrs* into a dictionary, then convert it back to a list. attrs_d = {k.lower(): v for k, v in attrs} attrs = [ - (k, k in ('rel', 'type') and v.lower() or v) - for k, v in attrs_d.items() + (k, k in ("rel", "type") and v.lower() or v) for k, v in attrs_d.items() ] attrs.sort() return attrs @@ -177,22 +189,19 @@ class BaseHTMLProcessor(sgmllib.SGMLParser): # attrs is a list of (attr, value) tuples # e.g. for
<pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
         uattrs = []
-        strattrs = ''
+        strattrs = ""
         if attrs:
             for key, value in attrs:
-                value = value.replace('>', '&gt;')
-                value = value.replace('<', '&lt;')
-                value = value.replace('"', '&quot;')
+                value = value.replace(">", "&gt;")
+                value = value.replace("<", "&lt;")
+                value = value.replace('"', "&quot;")
                 value = self.bare_ampersand.sub("&amp;", value)
                 uattrs.append((key, value))
-            strattrs = ''.join(
-                ' %s="%s"' % (key, value)
-                for key, value in uattrs
-            )
+            strattrs = "".join(f' {key}="{value}"' for key, value in uattrs)
         if tag in self.elements_no_end_tag:
-            self.pieces.append('<%s%s />' % (tag, strattrs))
+            self.pieces.append(f"<{tag}{strattrs} />")
         else:
-            self.pieces.append('<%s%s>' % (tag, strattrs))
+            self.pieces.append(f"<{tag}{strattrs}>")
 
     def unknown_endtag(self, tag):
         """
@@ -214,15 +223,15 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
         # Called for each character reference, e.g. '&#160;' will extract '160'
         # Reconstruct the original character reference.
         ref = ref.lower()
-        if ref.startswith('x'):
+        if ref.startswith("x"):
             value = int(ref[1:], 16)
         else:
             value = int(ref)
 
         if value in _cp1252:
-            self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
+            self.pieces.append("&#%s;" % hex(ord(_cp1252[value]))[1:])
         else:
-            self.pieces.append('&#%s;' % ref)
+            self.pieces.append("&#%s;" % ref)
 
     def handle_entityref(self, ref):
         """
@@ -232,10 +241,10 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
 
         # Called for each entity reference, e.g. '&copy;' will extract 'copy'
         # Reconstruct the original entity reference.
-        if ref in html.entities.name2codepoint or ref == 'apos':
-            self.pieces.append('&%s;' % ref)
+        if ref in html.entities.name2codepoint or ref == "apos":
+            self.pieces.append("&%s;" % ref)
         else:
-            self.pieces.append('&amp;%s' % ref)
+            self.pieces.append("&amp;%s" % ref)
 
     def handle_data(self, text):
         """
@@ -256,7 +265,7 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
 
         # Called for HTML comments, e.g. 
         # Reconstruct the original comment.
-        self.pieces.append('<!--%s-->' % text)
+        self.pieces.append("<!--%s-->" % text)
 
     def handle_pi(self, text):
         """
@@ -266,7 +275,7 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
 
         # Called for each processing instruction, e.g. 
         # Reconstruct original processing instruction.
-        self.pieces.append('<?%s>' % text)
+        self.pieces.append("<?%s>" % text)
 
     def handle_decl(self, text):
         """
@@ -278,9 +287,9 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
         # 
         # Reconstruct original DOCTYPE
-        self.pieces.append('<!%s>' % text)
+        self.pieces.append("<!%s>" % text)
 
-    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
+    _new_declname_match = re.compile(r"[a-zA-Z][-_.a-zA-Z0-9:]*\s*").match
 
     def _scan_name(self, i, declstartpos):
         """
@@ -311,7 +320,7 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
         :rtype: str
         """
 
-        return '&#%s;' % name
+        return "&#%s;" % name
 
     def convert_entityref(self, name):
         """
@@ -319,7 +328,7 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
         :rtype: str
         """
 
-        return '&%s;' % name
+        return "&%s;" % name
 
     def output(self):
         """Return processed HTML as a single string.
@@ -327,7 +336,7 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
         :rtype: str
         """
 
-        return ''.join(self.pieces)
+        return "".join(self.pieces)
 
     def parse_declaration(self, i):
         """
@@ -339,5 +348,5 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
             return sgmllib.SGMLParser.parse_declaration(self, i)
         except (AssertionError, sgmllib.SGMLParseError):
             # Escape the doctype declaration and continue parsing.
-            self.handle_data('&lt;')
-            return i+1
+            self.handle_data("&lt;")
+            return i + 1
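A minimal usage sketch of the BaseHTMLProcessor updated above, assuming the vendored packages under lib/ are importable; the sample markup is invented and the output comment describes expected behaviour, not a verified run:

    import sys

    sys.path.insert(0, "lib")  # assumption: the bundled third-party packages live under lib/

    from feedparser.html import BaseHTMLProcessor

    processor = BaseHTMLProcessor(encoding="utf-8", _type="text/html")
    # feed() applies the shorttag/entity fix-ups and the handlers shown above;
    # output() re-joins the collected pieces.
    processor.feed("<p class='intro'>Tom &amp; Jerry<br/></p>")
    print(processor.output())  # attribute quoting is normalized, e.g. class="intro"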
diff --git a/lib/feedparser/http.py b/lib/feedparser/http.py
index a7fee361..d94ade2e 100644
--- a/lib/feedparser/http.py
+++ b/lib/feedparser/http.py
@@ -1,4 +1,4 @@
-# Copyright 2010-2022 Kurt McKee 
+# Copyright 2010-2023 Kurt McKee 
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -25,203 +25,54 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 
-import base64
-import datetime
-import gzip
-import io
-import re
-import struct
-import urllib.parse
-import urllib.request
-import zlib
+from __future__ import annotations
+
+import typing
+
+import requests
 
 from .datetimes import _parse_date
-from .urls import convert_to_idn
+
+# HTTP "Accept" header to send to servers when downloading feeds.
+ACCEPT_HEADER: str = (
+    "application/atom+xml"
+    ",application/rdf+xml"
+    ",application/rss+xml"
+    ",application/x-netcdf"
+    ",application/xml"
+    ";q=0.9,text/xml"
+    ";q=0.2,*/*"
+    ";q=0.1"
+)
 
 
-# HTTP "Accept" header to send to servers when downloading feeds.  If you don't
-# want to send an Accept header, set this to None.
-ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
+def get(url: str, result: dict[str, typing.Any]) -> bytes:
+    from . import USER_AGENT
 
+    agent = USER_AGENT
 
-class URLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPRedirectHandler, urllib.request.HTTPDefaultErrorHandler):
-    def http_error_default(self, req, fp, code, msg, headers):
-        # The default implementation just raises HTTPError.
-        # Forget that.
-        fp.status = code
-        return fp
+    try:
+        response = requests.get(
+            url,
+            headers={"User-Agent": agent, "Accept": ACCEPT_HEADER},
+            timeout=10,
+        )
+    except requests.RequestException as exception:
+        result["bozo"] = True
+        result["bozo_exception"] = exception
+        return b""
 
-    def http_error_301(self, req, fp, code, msg, hdrs):
-        result = urllib.request.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, hdrs)
-        if not result:
-            return fp
-        result.status = code
-        result.newurl = result.geturl()
-        return result
-
-    # The default implementations in urllib.request.HTTPRedirectHandler
-    # are identical, so hardcoding a http_error_301 call above
-    # won't affect anything
-    http_error_300 = http_error_301
-    http_error_302 = http_error_301
-    http_error_303 = http_error_301
-    http_error_307 = http_error_301
-
-    def http_error_401(self, req, fp, code, msg, headers):
-        # Check if
-        # - server requires digest auth, AND
-        # - we tried (unsuccessfully) with basic auth, AND
-        # If all conditions hold, parse authentication information
-        # out of the Authorization header we sent the first time
-        # (for the username and password) and the WWW-Authenticate
-        # header the server sent back (for the realm) and retry
-        # the request with the appropriate digest auth headers instead.
-        # This evil genius hack has been brought to you by Aaron Swartz.
-        host = urllib.parse.urlparse(req.get_full_url())[1]
-        if 'Authorization' not in req.headers or 'WWW-Authenticate' not in headers:
-            return self.http_error_default(req, fp, code, msg, headers)
-        auth = base64.decodebytes(req.headers['Authorization'].split(' ')[1].encode()).decode()
-        user, passw = auth.split(':')
-        realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
-        self.add_password(realm, host, user, passw)
-        retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
-        self.reset_retry_count()
-        return retry
-
-
-def _build_urllib2_request(url, agent, accept_header, etag, modified, referrer, auth, request_headers):
-    request = urllib.request.Request(url)
-    request.add_header('User-Agent', agent)
-    if etag:
-        request.add_header('If-None-Match', etag)
-    if isinstance(modified, str):
-        modified = _parse_date(modified)
-    elif isinstance(modified, datetime.datetime):
-        modified = modified.utctimetuple()
-    if modified:
-        # format into an RFC 1123-compliant timestamp. We can't use
-        # time.strftime() since the %a and %b directives can be affected
-        # by the current locale, but RFC 2616 states that dates must be
-        # in English.
-        short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
-        months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
-        request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
-    if referrer:
-        request.add_header('Referer', referrer)
-    request.add_header('Accept-encoding', 'gzip, deflate')
-    if auth:
-        request.add_header('Authorization', 'Basic %s' % auth)
-    if accept_header:
-        request.add_header('Accept', accept_header)
-    # use this for whatever -- cookies, special headers, etc
-    # [('Cookie','Something'),('x-special-header','Another Value')]
-    for header_name, header_value in request_headers.items():
-        request.add_header(header_name, header_value)
-    request.add_header('A-IM', 'feed')  # RFC 3229 support
-    return request
-
-
-def get(url, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, result=None):
-    if handlers is None:
-        handlers = []
-    elif not isinstance(handlers, list):
-        handlers = [handlers]
-    if request_headers is None:
-        request_headers = {}
-
-    # Deal with the feed URI scheme
-    if url.startswith('feed:http'):
-        url = url[5:]
-    elif url.startswith('feed:'):
-        url = 'http:' + url[5:]
-    if not agent:
-        from . import USER_AGENT
-        agent = USER_AGENT
-    # Test for inline user:password credentials for HTTP basic auth
-    auth = None
-    if not url.startswith('ftp:'):
-        url_pieces = urllib.parse.urlparse(url)
-        if url_pieces.username:
-            new_pieces = list(url_pieces)
-            new_pieces[1] = url_pieces.hostname
-            if url_pieces.port:
-                new_pieces[1] = f'{url_pieces.hostname}:{url_pieces.port}'
-            url = urllib.parse.urlunparse(new_pieces)
-            auth = base64.standard_b64encode(f'{url_pieces.username}:{url_pieces.password}'.encode()).decode()
-
-    # iri support
-    if not isinstance(url, bytes):
-        url = convert_to_idn(url)
-
-    # Prevent UnicodeEncodeErrors caused by Unicode characters in the path.
-    bits = []
-    for c in url:
-        try:
-            c.encode('ascii')
-        except UnicodeEncodeError:
-            bits.append(urllib.parse.quote(c))
-        else:
-            bits.append(c)
-    url = ''.join(bits)
-
-    # try to open with urllib2 (to use optional headers)
-    request = _build_urllib2_request(url, agent, ACCEPT_HEADER, etag, modified, referrer, auth, request_headers)
-    opener = urllib.request.build_opener(*tuple(handlers + [URLHandler()]))
-    opener.addheaders = []  # RMK - must clear so we only send our custom User-Agent
-    f = opener.open(request)
-    data = f.read()
-    f.close()
-
-    # lowercase all of the HTTP headers for comparisons per RFC 2616
-    result['headers'] = {k.lower(): v for k, v in f.headers.items()}
-
-    # if feed is gzip-compressed, decompress it
-    if data and 'gzip' in result['headers'].get('content-encoding', ''):
-        try:
-            data = gzip.GzipFile(fileobj=io.BytesIO(data)).read()
-        except (EOFError, IOError, struct.error) as e:
-            # IOError can occur if the gzip header is bad.
-            # struct.error can occur if the data is damaged.
-            result['bozo'] = True
-            result['bozo_exception'] = e
-            if isinstance(e, struct.error):
-                # A gzip header was found but the data is corrupt.
-                # Ideally, we should re-request the feed without the
-                # 'Accept-encoding: gzip' header, but we don't.
-                data = None
-    elif data and 'deflate' in result['headers'].get('content-encoding', ''):
-        try:
-            data = zlib.decompress(data)
-        except zlib.error:
-            try:
-                # The data may have no headers and no checksum.
-                data = zlib.decompress(data, -15)
-            except zlib.error as e:
-                result['bozo'] = True
-                result['bozo_exception'] = e
+    # Lowercase the HTTP header keys for comparisons per RFC 2616.
+    result["headers"] = {k.lower(): v for k, v in response.headers.items()}
 
     # save HTTP headers
-    if 'etag' in result['headers']:
-        etag = result['headers'].get('etag', '')
-        if isinstance(etag, bytes):
-            etag = etag.decode('utf-8', 'ignore')
-        if etag:
-            result['etag'] = etag
-    if 'last-modified' in result['headers']:
-        modified = result['headers'].get('last-modified', '')
+    if "etag" in result["headers"]:
+        result["etag"] = result["headers"]["etag"]
+    if "last-modified" in result["headers"]:
+        modified = result["headers"]["last-modified"]
         if modified:
-            result['modified'] = modified
-            result['modified_parsed'] = _parse_date(modified)
-    if isinstance(f.url, bytes):
-        result['href'] = f.url.decode('utf-8', 'ignore')
-    else:
-        result['href'] = f.url
-    result['status'] = getattr(f, 'status', None) or 200
-
-    # Stop processing if the server sent HTTP 304 Not Modified.
-    if getattr(f, 'code', 0) == 304:
-        result['version'] = ''
-        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
-            'so the server sent no data.  This is a feature, not a bug!'
-
-    return data
+            result["modified"] = modified
+            result["modified_parsed"] = _parse_date(modified)
+    result["href"] = response.url
+    result["status"] = response.status_code
+    return response.content
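A short, hedged sketch of driving the new requests-based get(); the URL is a placeholder, and the keys read back from result are the ones populated in this hunk:

    from feedparser.http import get

    result = {}
    data = get("https://example.com/feed.xml", result)  # placeholder URL

    if result.get("bozo"):
        # requests.RequestException instances are stored in the result, not raised
        print("fetch failed:", result["bozo_exception"])
    else:
        print(result["status"], result["href"], result["headers"].get("content-type"))
        print(len(data), "bytes of feed data")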
diff --git a/lib/feedparser/mixin.py b/lib/feedparser/mixin.py
index 8309e723..4c6d4e9a 100644
--- a/lib/feedparser/mixin.py
+++ b/lib/feedparser/mixin.py
@@ -1,4 +1,4 @@
-# Copyright 2010-2022 Kurt McKee 
+# Copyright 2010-2023 Kurt McKee 
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -30,137 +30,144 @@ import binascii
 import copy
 import html.entities
 import re
-from typing import Dict
 import xml.sax.saxutils
+from typing import Dict
 
 from .html import _cp1252
 from .namespaces import _base, cc, dc, georss, itunes, mediarss, psc
-from .sanitizer import sanitize_html, HTMLSanitizer
-from .util import FeedParserDict
+from .sanitizer import HTMLSanitizer, sanitize_html
 from .urls import _urljoin, make_safe_absolute_uri, resolve_relative_uris
+from .util import FeedParserDict
+
+email_pattern = re.compile(
+    r"(([a-zA-Z0-9_.+-]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)"
+    r"|(([a-zA-Z0-9-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(]?))"
+    r"(\?subject=\S+)?"
+)
 
 
 class XMLParserMixin(
-        _base.Namespace,
-        cc.Namespace,
-        dc.Namespace,
-        georss.Namespace,
-        itunes.Namespace,
-        mediarss.Namespace,
-        psc.Namespace,
+    _base.Namespace,
+    cc.Namespace,
+    dc.Namespace,
+    georss.Namespace,
+    itunes.Namespace,
+    mediarss.Namespace,
+    psc.Namespace,
 ):
     namespaces = {
-        '': '',
-        'http://backend.userland.com/rss': '',
-        'http://blogs.law.harvard.edu/tech/rss': '',
-        'http://purl.org/rss/1.0/': '',
-        'http://my.netscape.com/rdf/simple/0.9/': '',
-        'http://example.com/newformat#': '',
-        'http://example.com/necho': '',
-        'http://purl.org/echo/': '',
-        'uri/of/echo/namespace#': '',
-        'http://purl.org/pie/': '',
-        'http://purl.org/atom/ns#': '',
-        'http://www.w3.org/2005/Atom': '',
-        'http://purl.org/rss/1.0/modules/rss091#': '',
-
-        'http://webns.net/mvcb/':                                'admin',
-        'http://purl.org/rss/1.0/modules/aggregation/':          'ag',
-        'http://purl.org/rss/1.0/modules/annotate/':             'annotate',
-        'http://media.tangent.org/rss/1.0/':                     'audio',
-        'http://backend.userland.com/blogChannelModule':         'blogChannel',
-        'http://creativecommons.org/ns#license':                 'cc',
-        'http://web.resource.org/cc/':                           'cc',
-        'http://cyber.law.harvard.edu/rss/creativeCommonsRssModule.html': 'creativeCommons',
-        'http://backend.userland.com/creativeCommonsRssModule':  'creativeCommons',
-        'http://purl.org/rss/1.0/modules/company':               'co',
-        'http://purl.org/rss/1.0/modules/content/':              'content',
-        'http://my.theinfo.org/changed/1.0/rss/':                'cp',
-        'http://purl.org/dc/elements/1.1/':                      'dc',
-        'http://purl.org/dc/terms/':                             'dcterms',
-        'http://purl.org/rss/1.0/modules/email/':                'email',
-        'http://purl.org/rss/1.0/modules/event/':                'ev',
-        'http://rssnamespace.org/feedburner/ext/1.0':            'feedburner',
-        'http://freshmeat.net/rss/fm/':                          'fm',
-        'http://xmlns.com/foaf/0.1/':                            'foaf',
-        'http://www.w3.org/2003/01/geo/wgs84_pos#':              'geo',
-        'http://www.georss.org/georss':                          'georss',
-        'http://www.opengis.net/gml':                            'gml',
-        'http://postneo.com/icbm/':                              'icbm',
-        'http://purl.org/rss/1.0/modules/image/':                'image',
-        'http://www.itunes.com/DTDs/PodCast-1.0.dtd':            'itunes',
-        'http://example.com/DTDs/PodCast-1.0.dtd':               'itunes',
-        'http://purl.org/rss/1.0/modules/link/':                 'l',
-        'http://search.yahoo.com/mrss':                          'media',
+        "": "",
+        "http://backend.userland.com/rss": "",
+        "http://blogs.law.harvard.edu/tech/rss": "",
+        "http://purl.org/rss/1.0/": "",
+        "http://my.netscape.com/rdf/simple/0.9/": "",
+        "http://example.com/newformat#": "",
+        "http://example.com/necho": "",
+        "http://purl.org/echo/": "",
+        "uri/of/echo/namespace#": "",
+        "http://purl.org/pie/": "",
+        "http://purl.org/atom/ns#": "",
+        "http://www.w3.org/2005/Atom": "",
+        "http://purl.org/rss/1.0/modules/rss091#": "",
+        "http://webns.net/mvcb/": "admin",
+        "http://purl.org/rss/1.0/modules/aggregation/": "ag",
+        "http://purl.org/rss/1.0/modules/annotate/": "annotate",
+        "http://media.tangent.org/rss/1.0/": "audio",
+        "http://backend.userland.com/blogChannelModule": "blogChannel",
+        "http://creativecommons.org/ns#license": "cc",
+        "http://web.resource.org/cc/": "cc",
+        "http://cyber.law.harvard.edu/rss/creativeCommonsRssModule.html": (
+            "creativeCommons"
+        ),
+        "http://backend.userland.com/creativeCommonsRssModule": "creativeCommons",
+        "http://purl.org/rss/1.0/modules/company": "co",
+        "http://purl.org/rss/1.0/modules/content/": "content",
+        "http://my.theinfo.org/changed/1.0/rss/": "cp",
+        "http://purl.org/dc/elements/1.1/": "dc",
+        "http://purl.org/dc/terms/": "dcterms",
+        "http://purl.org/rss/1.0/modules/email/": "email",
+        "http://purl.org/rss/1.0/modules/event/": "ev",
+        "http://rssnamespace.org/feedburner/ext/1.0": "feedburner",
+        "http://freshmeat.net/rss/fm/": "fm",
+        "http://xmlns.com/foaf/0.1/": "foaf",
+        "http://www.w3.org/2003/01/geo/wgs84_pos#": "geo",
+        "http://www.georss.org/georss": "georss",
+        "http://www.opengis.net/gml": "gml",
+        "http://postneo.com/icbm/": "icbm",
+        "http://purl.org/rss/1.0/modules/image/": "image",
+        "http://www.itunes.com/DTDs/PodCast-1.0.dtd": "itunes",
+        "http://example.com/DTDs/PodCast-1.0.dtd": "itunes",
+        "http://purl.org/rss/1.0/modules/link/": "l",
+        "http://search.yahoo.com/mrss": "media",
         # Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
-        'http://search.yahoo.com/mrss/':                         'media',
-        'http://madskills.com/public/xml/rss/module/pingback/':  'pingback',
-        'http://prismstandard.org/namespaces/1.2/basic/':        'prism',
-        'http://www.w3.org/1999/02/22-rdf-syntax-ns#':           'rdf',
-        'http://www.w3.org/2000/01/rdf-schema#':                 'rdfs',
-        'http://purl.org/rss/1.0/modules/reference/':            'ref',
-        'http://purl.org/rss/1.0/modules/richequiv/':            'reqv',
-        'http://purl.org/rss/1.0/modules/search/':               'search',
-        'http://purl.org/rss/1.0/modules/slash/':                'slash',
-        'http://schemas.xmlsoap.org/soap/envelope/':             'soap',
-        'http://purl.org/rss/1.0/modules/servicestatus/':        'ss',
-        'http://hacks.benhammersley.com/rss/streaming/':         'str',
-        'http://purl.org/rss/1.0/modules/subscription/':         'sub',
-        'http://purl.org/rss/1.0/modules/syndication/':          'sy',
-        'http://schemas.pocketsoap.com/rss/myDescModule/':       'szf',
-        'http://purl.org/rss/1.0/modules/taxonomy/':             'taxo',
-        'http://purl.org/rss/1.0/modules/threading/':            'thr',
-        'http://purl.org/rss/1.0/modules/textinput/':            'ti',
-        'http://madskills.com/public/xml/rss/module/trackback/': 'trackback',
-        'http://wellformedweb.org/commentAPI/':                  'wfw',
-        'http://purl.org/rss/1.0/modules/wiki/':                 'wiki',
-        'http://www.w3.org/1999/xhtml':                          'xhtml',
-        'http://www.w3.org/1999/xlink':                          'xlink',
-        'http://www.w3.org/XML/1998/namespace':                  'xml',
-        'http://podlove.org/simple-chapters':                    'psc',
+        "http://search.yahoo.com/mrss/": "media",
+        "http://madskills.com/public/xml/rss/module/pingback/": "pingback",
+        "http://prismstandard.org/namespaces/1.2/basic/": "prism",
+        "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
+        "http://www.w3.org/2000/01/rdf-schema#": "rdfs",
+        "http://purl.org/rss/1.0/modules/reference/": "ref",
+        "http://purl.org/rss/1.0/modules/richequiv/": "reqv",
+        "http://purl.org/rss/1.0/modules/search/": "search",
+        "http://purl.org/rss/1.0/modules/slash/": "slash",
+        "http://schemas.xmlsoap.org/soap/envelope/": "soap",
+        "http://purl.org/rss/1.0/modules/servicestatus/": "ss",
+        "http://hacks.benhammersley.com/rss/streaming/": "str",
+        "http://purl.org/rss/1.0/modules/subscription/": "sub",
+        "http://purl.org/rss/1.0/modules/syndication/": "sy",
+        "http://schemas.pocketsoap.com/rss/myDescModule/": "szf",
+        "http://purl.org/rss/1.0/modules/taxonomy/": "taxo",
+        "http://purl.org/rss/1.0/modules/threading/": "thr",
+        "http://purl.org/rss/1.0/modules/textinput/": "ti",
+        "http://madskills.com/public/xml/rss/module/trackback/": "trackback",
+        "http://wellformedweb.org/commentAPI/": "wfw",
+        "http://purl.org/rss/1.0/modules/wiki/": "wiki",
+        "http://www.w3.org/1999/xhtml": "xhtml",
+        "http://www.w3.org/1999/xlink": "xlink",
+        "http://www.w3.org/XML/1998/namespace": "xml",
+        "http://podlove.org/simple-chapters": "psc",
     }
     _matchnamespaces: Dict[str, str] = {}
 
     can_be_relative_uri = {
-        'comments',
-        'docs',
-        'href',
-        'icon',
-        'id',
-        'link',
-        'logo',
-        'url',
-        'wfw_comment',
-        'wfw_commentrss',
+        "comments",
+        "docs",
+        "href",
+        "icon",
+        "id",
+        "link",
+        "logo",
+        "url",
+        "wfw_comment",
+        "wfw_commentrss",
     }
 
     can_contain_relative_uris = {
-        'content',
-        'copyright',
-        'description',
-        'info',
-        'rights',
-        'subtitle',
-        'summary',
-        'tagline',
-        'title',
+        "content",
+        "copyright",
+        "description",
+        "info",
+        "rights",
+        "subtitle",
+        "summary",
+        "tagline",
+        "title",
     }
 
     can_contain_dangerous_markup = {
-        'content',
-        'copyright',
-        'description',
-        'info',
-        'rights',
-        'subtitle',
-        'summary',
-        'tagline',
-        'title',
+        "content",
+        "copyright",
+        "description",
+        "info",
+        "rights",
+        "subtitle",
+        "summary",
+        "tagline",
+        "title",
     }
 
     html_types = {
-        'application/xhtml+xml',
-        'text/html',
+        "application/xhtml+xml",
+        "text/html",
     }
 
     def __init__(self):
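As a quick, hedged illustration of the module-level email_pattern introduced in this hunk (the author string is invented), group 0 of a match is the bare address:

    import re

    email_pattern = re.compile(
        r"(([a-zA-Z0-9_.+-]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)"
        r"|(([a-zA-Z0-9-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(]?))"
        r"(\?subject=\S+)?"
    )

    match = email_pattern.search("Jane Doe (jane.doe@example.com)")  # invented sample
    print(match.group(0))  # jane.doe@example.com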
@@ -169,7 +176,7 @@ class XMLParserMixin(
                 self._matchnamespaces[k.lower()] = v
         self.feeddata = FeedParserDict()  # feed-level data
         self.entries = []  # list of entry-level data
-        self.version = ''  # feed type/version, see SUPPORTED_VERSIONS
+        self.version = ""  # feed type/version, see SUPPORTED_VERSIONS
         self.namespaces_in_use = {}  # dictionary of namespaces defined by the feed
         self.resolve_relative_uris = False
         self.sanitize_html = False
@@ -198,7 +205,7 @@ class XMLParserMixin(
         self.depth = 0
         self.hasContent = 0
         if self.lang:
-            self.feeddata['language'] = self.lang.replace('_', '-')
+            self.feeddata["language"] = self.lang.replace("_", "-")
 
         # A map of the following form:
         #     {
@@ -208,7 +215,7 @@ class XMLParserMixin(
         #         },
         #     }
         self.property_depth_map = {}
-        super(XMLParserMixin, self).__init__()
+        super().__init__()
 
     def _normalize_attributes(self, kv):
         raise NotImplementedError
@@ -222,72 +229,80 @@ class XMLParserMixin(
 
         # track xml:base and xml:lang
         attrs_d = dict(attrs)
-        baseuri = attrs_d.get('xml:base', attrs_d.get('base')) or self.baseuri
+        baseuri = attrs_d.get("xml:base", attrs_d.get("base")) or self.baseuri
         if isinstance(baseuri, bytes):
-            baseuri = baseuri.decode(self.encoding, 'ignore')
+            baseuri = baseuri.decode(self.encoding, "ignore")
         # ensure that self.baseuri is always an absolute URI that
         # uses a whitelisted URI scheme (e.g. not `javscript:`)
         if self.baseuri:
             self.baseuri = make_safe_absolute_uri(self.baseuri, baseuri) or self.baseuri
         else:
             self.baseuri = _urljoin(self.baseuri, baseuri)
-        lang = attrs_d.get('xml:lang', attrs_d.get('lang'))
-        if lang == '':
+        lang = attrs_d.get("xml:lang", attrs_d.get("lang"))
+        if lang == "":
             # xml:lang could be explicitly set to '', we need to capture that
             lang = None
         elif lang is None:
             # if no xml:lang is specified, use parent lang
             lang = self.lang
         if lang:
-            if tag in ('feed', 'rss', 'rdf:RDF'):
-                self.feeddata['language'] = lang.replace('_', '-')
+            if tag in ("feed", "rss", "rdf:RDF"):
+                self.feeddata["language"] = lang.replace("_", "-")
         self.lang = lang
         self.basestack.append(self.baseuri)
         self.langstack.append(lang)
 
         # track namespaces
         for prefix, uri in attrs:
-            if prefix.startswith('xmlns:'):
+            if prefix.startswith("xmlns:"):
                 self.track_namespace(prefix[6:], uri)
-            elif prefix == 'xmlns':
+            elif prefix == "xmlns":
                 self.track_namespace(None, uri)
 
         # track inline content
-        if self.incontent and not self.contentparams.get('type', 'xml').endswith('xml'):
-            if tag in ('xhtml:div', 'div'):
+        if self.incontent and not self.contentparams.get("type", "xml").endswith("xml"):
+            if tag in ("xhtml:div", "div"):
                 return  # typepad does this 10/2007
             # element declared itself as escaped markup, but it isn't really
-            self.contentparams['type'] = 'application/xhtml+xml'
-        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
-            if tag.find(':') != -1:
-                prefix, tag = tag.split(':', 1)
-                namespace = self.namespaces_in_use.get(prefix, '')
-                if tag == 'math' and namespace == 'http://www.w3.org/1998/Math/MathML':
-                    attrs.append(('xmlns', namespace))
-                if tag == 'svg' and namespace == 'http://www.w3.org/2000/svg':
-                    attrs.append(('xmlns', namespace))
-            if tag == 'svg':
+            self.contentparams["type"] = "application/xhtml+xml"
+        if self.incontent and self.contentparams.get("type") == "application/xhtml+xml":
+            if tag.find(":") != -1:
+                prefix, tag = tag.split(":", 1)
+                namespace = self.namespaces_in_use.get(prefix, "")
+                if tag == "math" and namespace == "http://www.w3.org/1998/Math/MathML":
+                    attrs.append(("xmlns", namespace))
+                if tag == "svg" and namespace == "http://www.w3.org/2000/svg":
+                    attrs.append(("xmlns", namespace))
+            if tag == "svg":
                 self.svgOK += 1
-            return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)
+            return self.handle_data(f"<{tag}{self.strattrs(attrs)}>", escape=0)
 
         # match namespaces
-        if tag.find(':') != -1:
-            prefix, suffix = tag.split(':', 1)
+        if tag.find(":") != -1:
+            prefix, suffix = tag.split(":", 1)
         else:
-            prefix, suffix = '', tag
+            prefix, suffix = "", tag
         prefix = self.namespacemap.get(prefix, prefix)
         if prefix:
-            prefix = prefix + '_'
+            prefix = prefix + "_"
 
         # Special hack for better tracking of empty textinput/image elements in
         # illformed feeds.
-        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
+        if (not prefix) and tag not in ("title", "link", "description", "name"):
             self.intextinput = 0
-        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
+        if (not prefix) and tag not in (
+            "title",
+            "link",
+            "description",
+            "url",
+            "href",
+            "width",
+            "height",
+        ):
             self.inimage = 0
 
         # call special handler (if defined) or default handler
-        methodname = '_start_' + prefix + suffix
+        methodname = "_start_" + prefix + suffix
         try:
             method = getattr(self, methodname)
             return method(attrs_d)
@@ -305,18 +320,18 @@ class XMLParserMixin(
 
     def unknown_endtag(self, tag):
         # match namespaces
-        if tag.find(':') != -1:
-            prefix, suffix = tag.split(':', 1)
+        if tag.find(":") != -1:
+            prefix, suffix = tag.split(":", 1)
         else:
-            prefix, suffix = '', tag
+            prefix, suffix = "", tag
         prefix = self.namespacemap.get(prefix, prefix)
         if prefix:
-            prefix = prefix + '_'
-        if suffix == 'svg' and self.svgOK:
+            prefix = prefix + "_"
+        if suffix == "svg" and self.svgOK:
             self.svgOK -= 1
 
         # call special handler (if defined) or default handler
-        methodname = '_end_' + prefix + suffix
+        methodname = "_end_" + prefix + suffix
         try:
             if self.svgOK:
                 raise AttributeError()
@@ -326,14 +341,14 @@ class XMLParserMixin(
             self.pop(prefix + suffix)
 
         # track inline content
-        if self.incontent and not self.contentparams.get('type', 'xml').endswith('xml'):
+        if self.incontent and not self.contentparams.get("type", "xml").endswith("xml"):
             # element declared itself as escaped markup, but it isn't really
-            if tag in ('xhtml:div', 'div'):
+            if tag in ("xhtml:div", "div"):
                 return  # typepad does this 10/2007
-            self.contentparams['type'] = 'application/xhtml+xml'
-        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
-            tag = tag.split(':')[-1]
-            self.handle_data('</%s>' % tag, escape=0)
+            self.contentparams["type"] = "application/xhtml+xml"
+        if self.incontent and self.contentparams.get("type") == "application/xhtml+xml":
+            tag = tag.split(":")[-1]
+            self.handle_data("</%s>" % tag, escape=0)
 
         # track xml:base and xml:lang going out of scope
         if self.basestack:
@@ -352,33 +367,33 @@ class XMLParserMixin(
         if not self.elementstack:
             return
         ref = ref.lower()
-        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
-            text = '&#%s;' % ref
+        if ref in ("34", "38", "39", "60", "62", "x22", "x26", "x27", "x3c", "x3e"):
+            text = "&#%s;" % ref
         else:
-            if ref[0] == 'x':
+            if ref[0] == "x":
                 c = int(ref[1:], 16)
             else:
                 c = int(ref)
-            text = chr(c).encode('utf-8')
+            text = chr(c).encode("utf-8")
         self.elementstack[-1][2].append(text)
 
     def handle_entityref(self, ref):
         # Called for each entity reference, e.g. for '&copy;', ref is 'copy'
         if not self.elementstack:
             return
-        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
-            text = '&%s;' % ref
+        if ref in ("lt", "gt", "quot", "amp", "apos"):
+            text = "&%s;" % ref
         elif ref in self.entities:
             text = self.entities[ref]
-            if text.startswith('&#') and text.endswith(';'):
+            if text.startswith("&#") and text.endswith(";"):
                 return self.handle_entityref(text)
         else:
             try:
                 html.entities.name2codepoint[ref]
             except KeyError:
-                text = '&%s;' % ref
+                text = "&%s;" % ref
             else:
-                text = chr(html.entities.name2codepoint[ref]).encode('utf-8')
+                text = chr(html.entities.name2codepoint[ref]).encode("utf-8")
         self.elementstack[-1][2].append(text)
 
     def handle_data(self, text, escape=1):
@@ -386,7 +401,7 @@ class XMLParserMixin(
         # not containing any character or entity references
         if not self.elementstack:
             return
-        if escape and self.contentparams.get('type') == 'application/xhtml+xml':
+        if escape and self.contentparams.get("type") == "application/xhtml+xml":
             text = xml.sax.saxutils.escape(text)
         self.elementstack[-1][2].append(text)
 
@@ -403,18 +418,18 @@ class XMLParserMixin(
 
     def parse_declaration(self, i):
         # Override internal declaration handler to handle CDATA blocks.
-        if self.rawdata[i:i+9] == '<![CDATA[':
-            k = self.rawdata.find(']]>', i)
+        if self.rawdata[i : i + 9] == "<![CDATA[":
+            k = self.rawdata.find("]]>", i)
             if k == -1:
                 # CDATA block began but didn't finish
                 k = len(self.rawdata)
                 return k
-            self.handle_data(xml.sax.saxutils.escape(self.rawdata[i+9:k]), 0)
-            return k+3
+            self.handle_data(xml.sax.saxutils.escape(self.rawdata[i + 9 : k]), 0)
+            return k + 3
         else:
-            k = self.rawdata.find('>', i)
+            k = self.rawdata.find(">", i)
             if k >= 0:
-                return k+1
+                return k + 1
             else:
                 # We have an incomplete CDATA block.
                 return k
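For context, the CDATA branch above escapes the raw block contents with xml.sax.saxutils.escape() before passing them to handle_data(); a small stdlib illustration with an invented payload:

    import xml.sax.saxutils

    raw = 'if a < b & b > c: print("ok")'  # invented CDATA payload
    print(xml.sax.saxutils.escape(raw))
    # if a &lt; b &amp; b &gt; c: print("ok")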
@@ -422,35 +437,35 @@ class XMLParserMixin(
     @staticmethod
     def map_content_type(content_type):
         content_type = content_type.lower()
-        if content_type == 'text' or content_type == 'plain':
-            content_type = 'text/plain'
-        elif content_type == 'html':
-            content_type = 'text/html'
-        elif content_type == 'xhtml':
-            content_type = 'application/xhtml+xml'
+        if content_type == "text" or content_type == "plain":
+            content_type = "text/plain"
+        elif content_type == "html":
+            content_type = "text/html"
+        elif content_type == "xhtml":
+            content_type = "application/xhtml+xml"
         return content_type
 
     def track_namespace(self, prefix, uri):
         loweruri = uri.lower()
         if not self.version:
-            if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/'):
-                self.version = 'rss090'
-            elif loweruri == 'http://purl.org/rss/1.0/':
-                self.version = 'rss10'
-            elif loweruri == 'http://www.w3.org/2005/atom':
-                self.version = 'atom10'
-        if loweruri.find('backend.userland.com/rss') != -1:
+            if (prefix, loweruri) == (None, "http://my.netscape.com/rdf/simple/0.9/"):
+                self.version = "rss090"
+            elif loweruri == "http://purl.org/rss/1.0/":
+                self.version = "rss10"
+            elif loweruri == "http://www.w3.org/2005/atom":
+                self.version = "atom10"
+        if loweruri.find("backend.userland.com/rss") != -1:
             # match any backend.userland.com namespace
-            uri = 'http://backend.userland.com/rss'
+            uri = "http://backend.userland.com/rss"
             loweruri = uri
         if loweruri in self._matchnamespaces:
             self.namespacemap[prefix] = self._matchnamespaces[loweruri]
             self.namespaces_in_use[self._matchnamespaces[loweruri]] = uri
         else:
-            self.namespaces_in_use[prefix or ''] = uri
+            self.namespaces_in_use[prefix or ""] = uri
 
     def resolve_uri(self, uri):
-        return _urljoin(self.baseuri or '', uri)
+        return _urljoin(self.baseuri or "", uri)
 
     @staticmethod
     def decode_entities(element, data):
@@ -458,8 +473,8 @@ class XMLParserMixin(
 
     @staticmethod
     def strattrs(attrs):
-        return ''.join(
-            ' %s="%s"' % (t[0], xml.sax.saxutils.escape(t[1], {'"': '&quot;'}))
+        return "".join(
+            ' {}="{}"'.format(t[0], xml.sax.saxutils.escape(t[1], {'"': "&quot;"}))
             for t in attrs
         )
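Since strattrs() is a staticmethod, it can be exercised directly; a hedged example with an invented attribute list, showing that ampersands and double quotes in values are escaped:

    from feedparser.mixin import XMLParserMixin

    attrs = [("href", 'http://example.com/?a=1&b="2"')]  # invented attribute list
    print(XMLParserMixin.strattrs(attrs))
    # ' href="http://example.com/?a=1&amp;b=&quot;2&quot;"'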
 
@@ -475,11 +490,14 @@ class XMLParserMixin(
         element, expecting_text, pieces = self.elementstack.pop()
 
         # Ensure each piece is a str for Python 3
-        for (i, v) in enumerate(pieces):
+        for i, v in enumerate(pieces):
             if isinstance(v, bytes):
-                pieces[i] = v.decode('utf-8')
+                pieces[i] = v.decode("utf-8")
 
-        if self.version == 'atom10' and self.contentparams.get('type', 'text') == 'application/xhtml+xml':
+        if (
+            self.version == "atom10"
+            and self.contentparams.get("type", "text") == "application/xhtml+xml"
+        ):
             # remove enclosing child element, but only if it is a 
and # only if all the remaining content is nested underneath it. # This means that the divs would be retained in the following: @@ -488,76 +506,95 @@ class XMLParserMixin( del pieces[-1] while pieces and len(pieces) > 1 and not pieces[0].strip(): del pieces[0] - if pieces and (pieces[0] == '
' or pieces[0].startswith('
': + if ( + pieces + and (pieces[0] == "
" or pieces[0].startswith("
" + ): depth = 0 for piece in pieces[:-1]: - if piece.startswith(''): + elif piece.startswith("<") and not piece.endswith("/>"): depth += 1 else: pieces = pieces[1:-1] - output = ''.join(pieces) + output = "".join(pieces) if strip_whitespace: output = output.strip() if not expecting_text: return output # decode base64 content - if base64 and self.contentparams.get('base64', 0): + if base64 and self.contentparams.get("base64", 0): try: - output = base64.decodebytes(output.encode('utf8')).decode('utf8') + output = base64.decodebytes(output.encode("utf8")).decode("utf8") except (binascii.Error, binascii.Incomplete, UnicodeDecodeError): pass # resolve relative URIs if (element in self.can_be_relative_uri) and output: # do not resolve guid elements with isPermalink="false" - if not element == 'id' or self.guidislink: + if not element == "id" or self.guidislink: output = self.resolve_uri(output) # decode entities within embedded markup - if not self.contentparams.get('base64', 0): + if not self.contentparams.get("base64", 0): output = self.decode_entities(element, output) # some feed formats require consumers to guess # whether the content is html or plain text - if not self.version.startswith('atom') and self.contentparams.get('type') == 'text/plain': + if ( + not self.version.startswith("atom") + and self.contentparams.get("type") == "text/plain" + ): if self.looks_like_html(output): - self.contentparams['type'] = 'text/html' + self.contentparams["type"] = "text/html" # remove temporary cruft from contentparams try: - del self.contentparams['mode'] + del self.contentparams["mode"] except KeyError: pass try: - del self.contentparams['base64'] + del self.contentparams["base64"] except KeyError: pass - is_htmlish = self.map_content_type(self.contentparams.get('type', 'text/html')) in self.html_types + is_htmlish = ( + self.map_content_type(self.contentparams.get("type", "text/html")) + in self.html_types + ) # resolve relative URIs within embedded markup if is_htmlish and self.resolve_relative_uris: if element in self.can_contain_relative_uris: - output = resolve_relative_uris(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html')) + output = resolve_relative_uris( + output, + self.baseuri, + self.encoding, + self.contentparams.get("type", "text/html"), + ) # sanitize embedded markup if is_htmlish and self.sanitize_html: if element in self.can_contain_dangerous_markup: - output = sanitize_html(output, self.encoding, self.contentparams.get('type', 'text/html')) + output = sanitize_html( + output, self.encoding, self.contentparams.get("type", "text/html") + ) if self.encoding and isinstance(output, bytes): - output = output.decode(self.encoding, 'ignore') + output = output.decode(self.encoding, "ignore") # address common error where people take data that is already # utf-8, presume that it is iso-8859-1, and re-encode it. 
- if self.encoding in ('utf-8', 'utf-8_INVALID_PYTHON_3') and not isinstance(output, bytes): + if self.encoding in ("utf-8", "utf-8_INVALID_PYTHON_3") and not isinstance( + output, bytes + ): try: - output = output.encode('iso-8859-1').decode('utf-8') + output = output.encode("iso-8859-1").decode("utf-8") except (UnicodeEncodeError, UnicodeDecodeError): pass @@ -567,65 +604,74 @@ class XMLParserMixin( # categories/tags/keywords/whatever are handled in _end_category or # _end_tags or _end_itunes_keywords - if element in ('category', 'tags', 'itunes_keywords'): + if element in ("category", "tags", "itunes_keywords"): return output - if element == 'title' and -1 < self.title_depth <= self.depth: + if element == "title" and -1 < self.title_depth <= self.depth: return output # store output in appropriate place(s) if self.inentry and not self.insource: - if element == 'content': + if element == "content": self.entries[-1].setdefault(element, []) contentparams = copy.deepcopy(self.contentparams) - contentparams['value'] = output + contentparams["value"] = output self.entries[-1][element].append(contentparams) - elif element == 'link': + elif element == "link": if not self.inimage: # query variables in urls in link elements are improperly # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're # unhandled character references. fix this special case. - output = output.replace('&', '&') + output = output.replace("&", "&") output = re.sub("&([A-Za-z0-9_]+);", r"&\g<1>", output) self.entries[-1][element] = output if output: - self.entries[-1]['links'][-1]['href'] = output + self.entries[-1]["links"][-1]["href"] = output else: - if element == 'description': - element = 'summary' - old_value_depth = self.property_depth_map.setdefault(self.entries[-1], {}).get(element) + if element == "description": + element = "summary" + old_value_depth = self.property_depth_map.setdefault( + self.entries[-1], {} + ).get(element) if old_value_depth is None or self.depth <= old_value_depth: self.property_depth_map[self.entries[-1]][element] = self.depth self.entries[-1][element] = output if self.incontent: contentparams = copy.deepcopy(self.contentparams) - contentparams['value'] = output - self.entries[-1][element + '_detail'] = contentparams - elif self.infeed or self.insource: # and (not self.intextinput) and (not self.inimage): + contentparams["value"] = output + self.entries[-1][element + "_detail"] = contentparams + elif ( + self.infeed or self.insource + ): # and (not self.intextinput) and (not self.inimage): context = self._get_context() - if element == 'description': - element = 'subtitle' + if element == "description": + element = "subtitle" context[element] = output - if element == 'link': + if element == "link": # fix query variables; see above for the explanation output = re.sub("&([A-Za-z0-9_]+);", r"&\g<1>", output) context[element] = output - context['links'][-1]['href'] = output + context["links"][-1]["href"] = output elif self.incontent: contentparams = copy.deepcopy(self.contentparams) - contentparams['value'] = output - context[element + '_detail'] = contentparams + contentparams["value"] = output + context[element + "_detail"] = contentparams return output def push_content(self, tag, attrs_d, default_content_type, expecting_text): self.incontent += 1 if self.lang: - self.lang = self.lang.replace('_', '-') - self.contentparams = FeedParserDict({ - 'type': self.map_content_type(attrs_d.get('type', default_content_type)), - 'language': self.lang, - 'base': self.baseuri}) - self.contentparams['base64'] 
= self._is_base64(attrs_d, self.contentparams) + self.lang = self.lang.replace("_", "-") + self.contentparams = FeedParserDict( + { + "type": self.map_content_type( + attrs_d.get("type", default_content_type) + ), + "language": self.lang, + "base": self.baseuri, + } + ) + self.contentparams["base64"] = self._is_base64(attrs_d, self.contentparams) self.push(tag, expecting_text) def pop_content(self, tag): @@ -646,55 +692,61 @@ class XMLParserMixin( """ # must have a close tag or an entity reference to qualify - if not (re.search(r'', s) or re.search(r'&#?\w+;', s)): + if not (re.search(r"", s) or re.search(r"&#?\w+;", s)): return False # all tags must be in a restricted subset of valid HTML tags - if any((t for t in re.findall(r'', '') - author = author.replace('<>', '') + author = author.replace(email, "") + author = author.replace("()", "") + author = author.replace("<>", "") + author = author.replace("<>", "") author = author.strip() - if author and (author[0] == '('): + if author and (author[0] == "("): author = author[1:] - if author and (author[-1] == ')'): + if author and (author[-1] == ")"): author = author[:-1] author = author.strip() if author or email: - context.setdefault('%s_detail' % key, detail) + context.setdefault("%s_detail" % key, detail) if author: - detail['name'] = author + detail["name"] = author if email: - detail['email'] = email + detail["email"] = email def _add_tag(self, term, scheme, label): context = self._get_context() - tags = context.setdefault('tags', []) + tags = context.setdefault("tags", []) if (not term) and (not scheme) and (not label): return value = FeedParserDict(term=term, scheme=scheme, label=label) @@ -781,8 +833,8 @@ class XMLParserMixin( # This is a completely-made up element. Its semantics are determined # only by a single feed that precipitated bug report 392 on Google Code. # In short, this is junk code. - self.push('tags', 1) + self.push("tags", 1) def _end_tags(self): - for term in self.pop('tags').split(','): + for term in self.pop("tags").split(","): self._add_tag(term.strip(), None, None) diff --git a/lib/feedparser/namespaces/_base.py b/lib/feedparser/namespaces/_base.py index 6478a76c..1fc3ee30 100644 --- a/lib/feedparser/namespaces/_base.py +++ b/lib/feedparser/namespaces/_base.py @@ -1,5 +1,5 @@ # Support for the Atom, RSS, RDF, and CDF feed formats -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -33,7 +33,7 @@ from ..urls import make_safe_absolute_uri from ..util import FeedParserDict -class Namespace(object): +class Namespace: """Support for the Atom, RSS, RDF, and CDF feed formats. 
The feed formats all share common elements, some of which have conflicting @@ -42,452 +42,490 @@ class Namespace(object): """ supported_namespaces = { - '': '', - 'http://backend.userland.com/rss': '', - 'http://blogs.law.harvard.edu/tech/rss': '', - 'http://purl.org/rss/1.0/': '', - 'http://my.netscape.com/rdf/simple/0.9/': '', - 'http://example.com/newformat#': '', - 'http://example.com/necho': '', - 'http://purl.org/echo/': '', - 'uri/of/echo/namespace#': '', - 'http://purl.org/pie/': '', - 'http://purl.org/atom/ns#': '', - 'http://www.w3.org/2005/Atom': '', - 'http://purl.org/rss/1.0/modules/rss091#': '', + "": "", + "http://backend.userland.com/rss": "", + "http://blogs.law.harvard.edu/tech/rss": "", + "http://purl.org/rss/1.0/": "", + "http://my.netscape.com/rdf/simple/0.9/": "", + "http://example.com/newformat#": "", + "http://example.com/necho": "", + "http://purl.org/echo/": "", + "uri/of/echo/namespace#": "", + "http://purl.org/pie/": "", + "http://purl.org/atom/ns#": "", + "http://www.w3.org/2005/Atom": "", + "http://purl.org/rss/1.0/modules/rss091#": "", } def _start_rss(self, attrs_d): versionmap = { - '0.91': 'rss091u', - '0.92': 'rss092', - '0.93': 'rss093', - '0.94': 'rss094', + "0.91": "rss091u", + "0.92": "rss092", + "0.93": "rss093", + "0.94": "rss094", } # If we're here then this is an RSS feed. # If we don't have a version or have a version that starts with something # other than RSS then there's been a mistake. Correct it. - if not self.version or not self.version.startswith('rss'): - attr_version = attrs_d.get('version', '') + if not self.version or not self.version.startswith("rss"): + attr_version = attrs_d.get("version", "") version = versionmap.get(attr_version) if version: self.version = version - elif attr_version.startswith('2.'): - self.version = 'rss20' + elif attr_version.startswith("2."): + self.version = "rss20" else: - self.version = 'rss' + self.version = "rss" def _start_channel(self, attrs_d): self.infeed = 1 self._cdf_common(attrs_d) def _cdf_common(self, attrs_d): - if 'lastmod' in attrs_d: + if "lastmod" in attrs_d: self._start_modified({}) - self.elementstack[-1][-1] = attrs_d['lastmod'] + self.elementstack[-1][-1] = attrs_d["lastmod"] self._end_modified() - if 'href' in attrs_d: + if "href" in attrs_d: self._start_link({}) - self.elementstack[-1][-1] = attrs_d['href'] + self.elementstack[-1][-1] = attrs_d["href"] self._end_link() def _start_feed(self, attrs_d): self.infeed = 1 - versionmap = {'0.1': 'atom01', - '0.2': 'atom02', - '0.3': 'atom03'} + versionmap = {"0.1": "atom01", "0.2": "atom02", "0.3": "atom03"} if not self.version: - attr_version = attrs_d.get('version') + attr_version = attrs_d.get("version") version = versionmap.get(attr_version) if version: self.version = version else: - self.version = 'atom' + self.version = "atom" def _end_channel(self): self.infeed = 0 + _end_feed = _end_channel def _start_image(self, attrs_d): context = self._get_context() if not self.inentry: - context.setdefault('image', FeedParserDict()) + context.setdefault("image", FeedParserDict()) self.inimage = 1 self.title_depth = -1 - self.push('image', 0) + self.push("image", 0) def _end_image(self): - self.pop('image') + self.pop("image") self.inimage = 0 def _start_textinput(self, attrs_d): context = self._get_context() - context.setdefault('textinput', FeedParserDict()) + context.setdefault("textinput", FeedParserDict()) self.intextinput = 1 self.title_depth = -1 - self.push('textinput', 0) + self.push("textinput", 0) + _start_textInput = _start_textinput 
def _end_textinput(self): - self.pop('textinput') + self.pop("textinput") self.intextinput = 0 + _end_textInput = _end_textinput def _start_author(self, attrs_d): self.inauthor = 1 - self.push('author', 1) + self.push("author", 1) # Append a new FeedParserDict when expecting an author context = self._get_context() - context.setdefault('authors', []) - context['authors'].append(FeedParserDict()) + context.setdefault("authors", []) + context["authors"].append(FeedParserDict()) + _start_managingeditor = _start_author def _end_author(self): - self.pop('author') + self.pop("author") self.inauthor = 0 self._sync_author_detail() + _end_managingeditor = _end_author def _start_contributor(self, attrs_d): self.incontributor = 1 context = self._get_context() - context.setdefault('contributors', []) - context['contributors'].append(FeedParserDict()) - self.push('contributor', 0) + context.setdefault("contributors", []) + context["contributors"].append(FeedParserDict()) + self.push("contributor", 0) def _end_contributor(self): - self.pop('contributor') + self.pop("contributor") self.incontributor = 0 def _start_name(self, attrs_d): - self.push('name', 0) + self.push("name", 0) def _end_name(self): - value = self.pop('name') + value = self.pop("name") if self.inpublisher: - self._save_author('name', value, 'publisher') + self._save_author("name", value, "publisher") elif self.inauthor: - self._save_author('name', value) + self._save_author("name", value) elif self.incontributor: - self._save_contributor('name', value) + self._save_contributor("name", value) elif self.intextinput: context = self._get_context() - context['name'] = value + context["name"] = value def _start_width(self, attrs_d): - self.push('width', 0) + self.push("width", 0) def _end_width(self): - value = self.pop('width') + value = self.pop("width") try: value = int(value) except ValueError: value = 0 if self.inimage: context = self._get_context() - context['width'] = value + context["width"] = value def _start_height(self, attrs_d): - self.push('height', 0) + self.push("height", 0) def _end_height(self): - value = self.pop('height') + value = self.pop("height") try: value = int(value) except ValueError: value = 0 if self.inimage: context = self._get_context() - context['height'] = value + context["height"] = value def _start_url(self, attrs_d): - self.push('href', 1) + self.push("href", 1) + _start_homepage = _start_url _start_uri = _start_url def _end_url(self): - value = self.pop('href') + value = self.pop("href") if self.inauthor: - self._save_author('href', value) + self._save_author("href", value) elif self.incontributor: - self._save_contributor('href', value) + self._save_contributor("href", value) + _end_homepage = _end_url _end_uri = _end_url def _start_email(self, attrs_d): - self.push('email', 0) + self.push("email", 0) def _end_email(self): - value = self.pop('email') + value = self.pop("email") if self.inpublisher: - self._save_author('email', value, 'publisher') + self._save_author("email", value, "publisher") elif self.inauthor: - self._save_author('email', value) + self._save_author("email", value) elif self.incontributor: - self._save_contributor('email', value) + self._save_contributor("email", value) def _start_subtitle(self, attrs_d): - self.push_content('subtitle', attrs_d, 'text/plain', 1) + self.push_content("subtitle", attrs_d, "text/plain", 1) + _start_tagline = _start_subtitle def _end_subtitle(self): - self.pop_content('subtitle') + self.pop_content("subtitle") + _end_tagline = _end_subtitle def 
_start_rights(self, attrs_d): - self.push_content('rights', attrs_d, 'text/plain', 1) + self.push_content("rights", attrs_d, "text/plain", 1) + _start_copyright = _start_rights def _end_rights(self): - self.pop_content('rights') + self.pop_content("rights") + _end_copyright = _end_rights def _start_item(self, attrs_d): self.entries.append(FeedParserDict()) - self.push('item', 0) + self.push("item", 0) self.inentry = 1 self.guidislink = 0 self.title_depth = -1 - id = self._get_attribute(attrs_d, 'rdf:about') + id = self._get_attribute(attrs_d, "rdf:about") if id: context = self._get_context() - context['id'] = id + context["id"] = id self._cdf_common(attrs_d) + _start_entry = _start_item def _end_item(self): - self.pop('item') + self.pop("item") self.inentry = 0 self.hasContent = 0 + _end_entry = _end_item def _start_language(self, attrs_d): - self.push('language', 1) + self.push("language", 1) def _end_language(self): - self.lang = self.pop('language') + self.lang = self.pop("language") def _start_webmaster(self, attrs_d): - self.push('publisher', 1) + self.push("publisher", 1) def _end_webmaster(self): - self.pop('publisher') - self._sync_author_detail('publisher') + self.pop("publisher") + self._sync_author_detail("publisher") def _start_published(self, attrs_d): - self.push('published', 1) + self.push("published", 1) + _start_issued = _start_published _start_pubdate = _start_published def _end_published(self): - value = self.pop('published') - self._save('published_parsed', _parse_date(value), overwrite=True) + value = self.pop("published") + self._save("published_parsed", _parse_date(value), overwrite=True) + _end_issued = _end_published _end_pubdate = _end_published def _start_updated(self, attrs_d): - self.push('updated', 1) + self.push("updated", 1) + _start_modified = _start_updated _start_lastbuilddate = _start_updated def _end_updated(self): - value = self.pop('updated') + value = self.pop("updated") parsed_value = _parse_date(value) - self._save('updated_parsed', parsed_value, overwrite=True) + self._save("updated_parsed", parsed_value, overwrite=True) + _end_modified = _end_updated _end_lastbuilddate = _end_updated def _start_created(self, attrs_d): - self.push('created', 1) + self.push("created", 1) def _end_created(self): - value = self.pop('created') - self._save('created_parsed', _parse_date(value), overwrite=True) + value = self.pop("created") + self._save("created_parsed", _parse_date(value), overwrite=True) def _start_expirationdate(self, attrs_d): - self.push('expired', 1) + self.push("expired", 1) def _end_expirationdate(self): - self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True) + self._save("expired_parsed", _parse_date(self.pop("expired")), overwrite=True) def _start_category(self, attrs_d): - term = attrs_d.get('term') - scheme = attrs_d.get('scheme', attrs_d.get('domain')) - label = attrs_d.get('label') + term = attrs_d.get("term") + scheme = attrs_d.get("scheme", attrs_d.get("domain")) + label = attrs_d.get("label") self._add_tag(term, scheme, label) - self.push('category', 1) + self.push("category", 1) + _start_keywords = _start_category def _end_category(self): - value = self.pop('category') + value = self.pop("category") if not value: return context = self._get_context() - tags = context['tags'] - if value and len(tags) and not tags[-1]['term']: - tags[-1]['term'] = value + tags = context["tags"] + if value and len(tags) and not tags[-1]["term"]: + tags[-1]["term"] = value else: self._add_tag(value, None, None) + _end_keywords = 
_end_category def _start_cloud(self, attrs_d): - self._get_context()['cloud'] = FeedParserDict(attrs_d) + self._get_context()["cloud"] = FeedParserDict(attrs_d) def _start_link(self, attrs_d): - attrs_d.setdefault('rel', 'alternate') - if attrs_d['rel'] == 'self': - attrs_d.setdefault('type', 'application/atom+xml') + attrs_d.setdefault("rel", "alternate") + if attrs_d["rel"] == "self": + attrs_d.setdefault("type", "application/atom+xml") else: - attrs_d.setdefault('type', 'text/html') + attrs_d.setdefault("type", "text/html") context = self._get_context() attrs_d = self._enforce_href(attrs_d) - if 'href' in attrs_d: - attrs_d['href'] = self.resolve_uri(attrs_d['href']) + if "href" in attrs_d: + attrs_d["href"] = self.resolve_uri(attrs_d["href"]) expecting_text = self.infeed or self.inentry or self.insource - context.setdefault('links', []) + context.setdefault("links", []) if not (self.inentry and self.inimage): - context['links'].append(FeedParserDict(attrs_d)) - if 'href' in attrs_d: + context["links"].append(FeedParserDict(attrs_d)) + if "href" in attrs_d: if ( - attrs_d.get('rel') == 'alternate' - and self.map_content_type(attrs_d.get('type')) in self.html_types + attrs_d.get("rel") == "alternate" + and self.map_content_type(attrs_d.get("type")) in self.html_types ): - context['link'] = attrs_d['href'] + context["link"] = attrs_d["href"] else: - self.push('link', expecting_text) + self.push("link", expecting_text) def _end_link(self): - self.pop('link') + self.pop("link") def _start_guid(self, attrs_d): - self.guidislink = (attrs_d.get('ispermalink', 'true') == 'true') - self.push('id', 1) + self.guidislink = attrs_d.get("ispermalink", "true") == "true" + self.push("id", 1) + _start_id = _start_guid def _end_guid(self): - value = self.pop('id') - self._save('guidislink', self.guidislink and 'link' not in self._get_context()) + value = self.pop("id") + self._save("guidislink", self.guidislink and "link" not in self._get_context()) if self.guidislink: # guid acts as link, but only if 'ispermalink' is not present or is 'true', # and only if the item doesn't already have a link element - self._save('link', value) + self._save("link", value) + _end_id = _end_guid def _start_title(self, attrs_d): if self.svgOK: - return self.unknown_starttag('title', list(attrs_d.items())) - self.push_content('title', attrs_d, 'text/plain', self.infeed or self.inentry or self.insource) + return self.unknown_starttag("title", list(attrs_d.items())) + self.push_content( + "title", attrs_d, "text/plain", self.infeed or self.inentry or self.insource + ) def _end_title(self): if self.svgOK: return - value = self.pop_content('title') + value = self.pop_content("title") if not value: return self.title_depth = self.depth def _start_description(self, attrs_d): context = self._get_context() - if 'summary' in context and not self.hasContent: - self._summaryKey = 'content' + if "summary" in context and not self.hasContent: + self._summaryKey = "content" self._start_content(attrs_d) else: - self.push_content('description', attrs_d, 'text/html', self.infeed or self.inentry or self.insource) + self.push_content( + "description", + attrs_d, + "text/html", + self.infeed or self.inentry or self.insource, + ) def _start_abstract(self, attrs_d): - self.push_content('description', attrs_d, 'text/plain', self.infeed or self.inentry or self.insource) + self.push_content( + "description", + attrs_d, + "text/plain", + self.infeed or self.inentry or self.insource, + ) def _end_description(self): - if self._summaryKey == 'content': + 
if self._summaryKey == "content": self._end_content() else: - self.pop_content('description') + self.pop_content("description") self._summaryKey = None + _end_abstract = _end_description def _start_info(self, attrs_d): - self.push_content('info', attrs_d, 'text/plain', 1) + self.push_content("info", attrs_d, "text/plain", 1) + _start_feedburner_browserfriendly = _start_info def _end_info(self): - self.pop_content('info') + self.pop_content("info") + _end_feedburner_browserfriendly = _end_info def _start_generator(self, attrs_d): if attrs_d: attrs_d = self._enforce_href(attrs_d) - if 'href' in attrs_d: - attrs_d['href'] = self.resolve_uri(attrs_d['href']) - self._get_context()['generator_detail'] = FeedParserDict(attrs_d) - self.push('generator', 1) + if "href" in attrs_d: + attrs_d["href"] = self.resolve_uri(attrs_d["href"]) + self._get_context()["generator_detail"] = FeedParserDict(attrs_d) + self.push("generator", 1) def _end_generator(self): - value = self.pop('generator') + value = self.pop("generator") context = self._get_context() - if 'generator_detail' in context: - context['generator_detail']['name'] = value + if "generator_detail" in context: + context["generator_detail"]["name"] = value def _start_summary(self, attrs_d): context = self._get_context() - if 'summary' in context and not self.hasContent: - self._summaryKey = 'content' + if "summary" in context and not self.hasContent: + self._summaryKey = "content" self._start_content(attrs_d) else: - self._summaryKey = 'summary' - self.push_content(self._summaryKey, attrs_d, 'text/plain', 1) + self._summaryKey = "summary" + self.push_content(self._summaryKey, attrs_d, "text/plain", 1) def _end_summary(self): - if self._summaryKey == 'content': + if self._summaryKey == "content": self._end_content() else: - self.pop_content(self._summaryKey or 'summary') + self.pop_content(self._summaryKey or "summary") self._summaryKey = None def _start_enclosure(self, attrs_d): attrs_d = self._enforce_href(attrs_d) context = self._get_context() - attrs_d['rel'] = 'enclosure' - context.setdefault('links', []).append(FeedParserDict(attrs_d)) + attrs_d["rel"] = "enclosure" + context.setdefault("links", []).append(FeedParserDict(attrs_d)) def _start_source(self, attrs_d): - if 'url' in attrs_d: + if "url" in attrs_d: # This means that we're processing a source element from an RSS 2.0 feed - self.sourcedata['href'] = attrs_d['url'] - self.push('source', 1) + self.sourcedata["href"] = attrs_d["url"] + self.push("source", 1) self.insource = 1 self.title_depth = -1 def _end_source(self): self.insource = 0 - value = self.pop('source') + value = self.pop("source") if value: - self.sourcedata['title'] = value - self._get_context()['source'] = copy.deepcopy(self.sourcedata) + self.sourcedata["title"] = value + self._get_context()["source"] = copy.deepcopy(self.sourcedata) self.sourcedata.clear() def _start_content(self, attrs_d): self.hasContent = 1 - self.push_content('content', attrs_d, 'text/plain', 1) - src = attrs_d.get('src') + self.push_content("content", attrs_d, "text/plain", 1) + src = attrs_d.get("src") if src: - self.contentparams['src'] = src - self.push('content', 1) + self.contentparams["src"] = src + self.push("content", 1) def _start_body(self, attrs_d): - self.push_content('content', attrs_d, 'application/xhtml+xml', 1) + self.push_content("content", attrs_d, "application/xhtml+xml", 1) + _start_xhtml_body = _start_body def _start_content_encoded(self, attrs_d): self.hasContent = 1 - self.push_content('content', attrs_d, 'text/html', 1) + 
self.push_content("content", attrs_d, "text/html", 1) + _start_fullitem = _start_content_encoded def _end_content(self): - copyToSummary = self.map_content_type(self.contentparams.get('type')) in ({'text/plain'} | self.html_types) - value = self.pop_content('content') + copyToSummary = self.map_content_type(self.contentparams.get("type")) in ( + {"text/plain"} | self.html_types + ) + value = self.pop_content("content") if copyToSummary: - self._save('summary', value) + self._save("summary", value) _end_body = _end_content _end_xhtml_body = _end_content @@ -495,12 +533,12 @@ class Namespace(object): _end_fullitem = _end_content def _start_newlocation(self, attrs_d): - self.push('newlocation', 1) + self.push("newlocation", 1) def _end_newlocation(self): - url = self.pop('newlocation') + url = self.pop("newlocation") context = self._get_context() # don't set newlocation if the context isn't right if context is not self.feeddata: return - context['newlocation'] = make_safe_absolute_uri(self.baseuri, url.strip()) + context["newlocation"] = make_safe_absolute_uri(self.baseuri, url.strip()) diff --git a/lib/feedparser/namespaces/admin.py b/lib/feedparser/namespaces/admin.py index 74218348..47fa1d5a 100644 --- a/lib/feedparser/namespaces/admin.py +++ b/lib/feedparser/namespaces/admin.py @@ -1,5 +1,5 @@ # Support for the administrative elements extension -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -29,25 +29,25 @@ from ..util import FeedParserDict -class Namespace(object): +class Namespace: # RDF Site Summary 1.0 Modules: Administrative # http://web.resource.org/rss/1.0/modules/admin/ supported_namespaces = { - 'http://webns.net/mvcb/': 'admin', + "http://webns.net/mvcb/": "admin", } def _start_admin_generatoragent(self, attrs_d): - self.push('generator', 1) - value = self._get_attribute(attrs_d, 'rdf:resource') + self.push("generator", 1) + value = self._get_attribute(attrs_d, "rdf:resource") if value: self.elementstack[-1][2].append(value) - self.pop('generator') - self._get_context()['generator_detail'] = FeedParserDict({'href': value}) + self.pop("generator") + self._get_context()["generator_detail"] = FeedParserDict({"href": value}) def _start_admin_errorreportsto(self, attrs_d): - self.push('errorreportsto', 1) - value = self._get_attribute(attrs_d, 'rdf:resource') + self.push("errorreportsto", 1) + value = self._get_attribute(attrs_d, "rdf:resource") if value: self.elementstack[-1][2].append(value) - self.pop('errorreportsto') + self.pop("errorreportsto") diff --git a/lib/feedparser/namespaces/cc.py b/lib/feedparser/namespaces/cc.py index 6735c5fe..bbe3bed2 100644 --- a/lib/feedparser/namespaces/cc.py +++ b/lib/feedparser/namespaces/cc.py @@ -1,5 +1,5 @@ # Support for the Creative Commons licensing extensions -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. 
# @@ -29,41 +29,42 @@ from ..util import FeedParserDict -class Namespace(object): +class Namespace: supported_namespaces = { # RDF-based namespace - 'http://creativecommons.org/ns#license': 'cc', - + "http://creativecommons.org/ns#license": "cc", # Old RDF-based namespace - 'http://web.resource.org/cc/': 'cc', - + "http://web.resource.org/cc/": "cc", # RSS-based namespace - 'http://cyber.law.harvard.edu/rss/creativeCommonsRssModule.html': 'creativecommons', - + "http://cyber.law.harvard.edu/rss/creativeCommonsRssModule.html": ( + "creativecommons" + ), # Old RSS-based namespace - 'http://backend.userland.com/creativeCommonsRssModule': 'creativecommons', + "http://backend.userland.com/creativeCommonsRssModule": "creativecommons", } def _start_cc_license(self, attrs_d): context = self._get_context() - value = self._get_attribute(attrs_d, 'rdf:resource') + value = self._get_attribute(attrs_d, "rdf:resource") attrs_d = FeedParserDict() - attrs_d['rel'] = 'license' + attrs_d["rel"] = "license" if value: - attrs_d['href'] = value - context.setdefault('links', []).append(attrs_d) + attrs_d["href"] = value + context.setdefault("links", []).append(attrs_d) def _start_creativecommons_license(self, attrs_d): - self.push('license', 1) + self.push("license", 1) + _start_creativeCommons_license = _start_creativecommons_license def _end_creativecommons_license(self): - value = self.pop('license') + value = self.pop("license") context = self._get_context() attrs_d = FeedParserDict() - attrs_d['rel'] = 'license' + attrs_d["rel"] = "license" if value: - attrs_d['href'] = value - context.setdefault('links', []).append(attrs_d) - del context['license'] + attrs_d["href"] = value + context.setdefault("links", []).append(attrs_d) + del context["license"] + _end_creativeCommons_license = _end_creativecommons_license diff --git a/lib/feedparser/namespaces/dc.py b/lib/feedparser/namespaces/dc.py index a89221d2..b0275325 100644 --- a/lib/feedparser/namespaces/dc.py +++ b/lib/feedparser/namespaces/dc.py @@ -1,5 +1,5 @@ # Support for the Dublin Core metadata extensions -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. 
# @@ -30,10 +30,10 @@ from ..datetimes import _parse_date from ..util import FeedParserDict -class Namespace(object): +class Namespace: supported_namespaces = { - 'http://purl.org/dc/elements/1.1/': 'dc', - 'http://purl.org/dc/terms/': 'dcterms', + "http://purl.org/dc/elements/1.1/": "dc", + "http://purl.org/dc/terms/": "dcterms", } def _end_dc_author(self): @@ -109,25 +109,29 @@ class Namespace(object): self._start_updated(attrs_d) def _start_dcterms_valid(self, attrs_d): - self.push('validity', 1) + self.push("validity", 1) def _end_dcterms_valid(self): - for validity_detail in self.pop('validity').split(';'): - if '=' in validity_detail: - key, value = validity_detail.split('=', 1) - if key == 'start': - self._save('validity_start', value, overwrite=True) - self._save('validity_start_parsed', _parse_date(value), overwrite=True) - elif key == 'end': - self._save('validity_end', value, overwrite=True) - self._save('validity_end_parsed', _parse_date(value), overwrite=True) + for validity_detail in self.pop("validity").split(";"): + if "=" in validity_detail: + key, value = validity_detail.split("=", 1) + if key == "start": + self._save("validity_start", value, overwrite=True) + self._save( + "validity_start_parsed", _parse_date(value), overwrite=True + ) + elif key == "end": + self._save("validity_end", value, overwrite=True) + self._save( + "validity_end_parsed", _parse_date(value), overwrite=True + ) def _start_dc_contributor(self, attrs_d): self.incontributor = 1 context = self._get_context() - context.setdefault('contributors', []) - context['contributors'].append(FeedParserDict()) - self.push('name', 0) + context.setdefault("contributors", []) + context["contributors"].append(FeedParserDict()) + self.push("name", 0) def _end_dc_contributor(self): self._end_name() diff --git a/lib/feedparser/namespaces/georss.py b/lib/feedparser/namespaces/georss.py index 786a926f..93a508e8 100644 --- a/lib/feedparser/namespaces/georss.py +++ b/lib/feedparser/namespaces/georss.py @@ -1,5 +1,5 @@ # Support for the GeoRSS format -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -26,27 +26,24 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. -# Required for Python 3.6 compatibility. 
-from __future__ import generator_stop - from ..util import FeedParserDict -class Namespace(object): +class Namespace: supported_namespaces = { - 'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo', - 'http://www.georss.org/georss': 'georss', - 'http://www.opengis.net/gml': 'gml', + "http://www.w3.org/2003/01/geo/wgs84_pos#": "geo", + "http://www.georss.org/georss": "georss", + "http://www.opengis.net/gml": "gml", } def __init__(self): self.ingeometry = 0 - super(Namespace, self).__init__() + super().__init__() def _start_georssgeom(self, attrs_d): - self.push('geometry', 0) + self.push("geometry", 0) context = self._get_context() - context['where'] = FeedParserDict() + context["where"] = FeedParserDict() _start_georss_point = _start_georssgeom _start_georss_line = _start_georssgeom @@ -55,76 +52,77 @@ class Namespace(object): def _save_where(self, geometry): context = self._get_context() - context['where'].update(geometry) + context["where"].update(geometry) def _end_georss_point(self): - geometry = _parse_georss_point(self.pop('geometry')) + geometry = _parse_georss_point(self.pop("geometry")) if geometry: self._save_where(geometry) def _end_georss_line(self): - geometry = _parse_georss_line(self.pop('geometry')) + geometry = _parse_georss_line(self.pop("geometry")) if geometry: self._save_where(geometry) def _end_georss_polygon(self): - this = self.pop('geometry') + this = self.pop("geometry") geometry = _parse_georss_polygon(this) if geometry: self._save_where(geometry) def _end_georss_box(self): - geometry = _parse_georss_box(self.pop('geometry')) + geometry = _parse_georss_box(self.pop("geometry")) if geometry: self._save_where(geometry) def _start_where(self, attrs_d): - self.push('where', 0) + self.push("where", 0) context = self._get_context() - context['where'] = FeedParserDict() + context["where"] = FeedParserDict() + _start_georss_where = _start_where def _parse_srs_attrs(self, attrs_d): - srs_name = attrs_d.get('srsname') + srs_name = attrs_d.get("srsname") try: - srs_dimension = int(attrs_d.get('srsdimension', '2')) + srs_dimension = int(attrs_d.get("srsdimension", "2")) except ValueError: srs_dimension = 2 context = self._get_context() - if 'where' not in context: - context['where'] = {} - context['where']['srsName'] = srs_name - context['where']['srsDimension'] = srs_dimension + if "where" not in context: + context["where"] = {} + context["where"]["srsName"] = srs_name + context["where"]["srsDimension"] = srs_dimension def _start_gml_point(self, attrs_d): self._parse_srs_attrs(attrs_d) self.ingeometry = 1 - self.push('geometry', 0) + self.push("geometry", 0) def _start_gml_linestring(self, attrs_d): self._parse_srs_attrs(attrs_d) - self.ingeometry = 'linestring' - self.push('geometry', 0) + self.ingeometry = "linestring" + self.push("geometry", 0) def _start_gml_polygon(self, attrs_d): self._parse_srs_attrs(attrs_d) - self.push('geometry', 0) + self.push("geometry", 0) def _start_gml_exterior(self, attrs_d): - self.push('geometry', 0) + self.push("geometry", 0) def _start_gml_linearring(self, attrs_d): - self.ingeometry = 'polygon' - self.push('geometry', 0) + self.ingeometry = "polygon" + self.push("geometry", 0) def _start_gml_pos(self, attrs_d): - self.push('pos', 0) + self.push("pos", 0) def _end_gml_pos(self): - this = self.pop('pos') + this = self.pop("pos") context = self._get_context() - srs_name = context['where'].get('srsName') - srs_dimension = context['where'].get('srsDimension', 2) + srs_name = context["where"].get("srsName") + srs_dimension = 
context["where"].get("srsDimension", 2) swap = True if srs_name and "EPSG" in srs_name: epsg = int(srs_name.split(":")[-1]) @@ -134,25 +132,25 @@ class Namespace(object): self._save_where(geometry) def _start_gml_poslist(self, attrs_d): - self.push('pos', 0) + self.push("pos", 0) def _end_gml_poslist(self): - this = self.pop('pos') + this = self.pop("pos") context = self._get_context() - srs_name = context['where'].get('srsName') - srs_dimension = context['where'].get('srsDimension', 2) + srs_name = context["where"].get("srsName") + srs_dimension = context["where"].get("srsDimension", 2) swap = True if srs_name and "EPSG" in srs_name: epsg = int(srs_name.split(":")[-1]) swap = bool(epsg in _geogCS) - geometry = _parse_poslist( - this, self.ingeometry, swap=swap, dims=srs_dimension) + geometry = _parse_poslist(this, self.ingeometry, swap=swap, dims=srs_dimension) if geometry: self._save_where(geometry) def _end_geom(self): self.ingeometry = 0 - self.pop('geometry') + self.pop("geometry") + _end_gml_point = _end_geom _end_gml_linestring = _end_geom _end_gml_linearring = _end_geom @@ -160,19 +158,21 @@ class Namespace(object): _end_gml_polygon = _end_geom def _end_where(self): - self.pop('where') + self.pop("where") + _end_georss_where = _end_where # GeoRSS geometry parsers. Each return a dict with 'type' and 'coordinates' # items, or None in the case of a parsing error. + def _parse_poslist(value, geom_type, swap=True, dims=2): - if geom_type == 'linestring': + if geom_type == "linestring": return _parse_georss_line(value, swap, dims) - elif geom_type == 'polygon': + elif geom_type == "polygon": ring = _parse_georss_line(value, swap, dims) - return {'type': 'Polygon', 'coordinates': (ring['coordinates'],)} + return {"type": "Polygon", "coordinates": (ring["coordinates"],)} else: return None @@ -180,10 +180,10 @@ def _parse_poslist(value, geom_type, swap=True, dims=2): def _gen_georss_coords(value, swap=True, dims=2): # A generator of (lon, lat) pairs from a string of encoded GeoRSS # coordinates. Converts to floats and swaps order. - latlons = (float(ll) for ll in value.replace(',', ' ').split()) + latlons = (float(ll) for ll in value.replace(",", " ").split()) while True: try: - t = [next(latlons), next(latlons)][::swap and -1 or 1] + t = [next(latlons), next(latlons)][:: swap and -1 or 1] if dims == 3: t.append(next(latlons)) yield tuple(t) @@ -196,7 +196,7 @@ def _parse_georss_point(value, swap=True, dims=2): # whitespace. We'll also handle comma separators. try: coords = list(_gen_georss_coords(value, swap, dims)) - return {'type': 'Point', 'coordinates': coords[0]} + return {"type": "Point", "coordinates": coords[0]} except (IndexError, ValueError): return None @@ -207,7 +207,7 @@ def _parse_georss_line(value, swap=True, dims=2): # whitespace. There must be at least two pairs. try: coords = list(_gen_georss_coords(value, swap, dims)) - return {'type': 'LineString', 'coordinates': coords} + return {"type": "LineString", "coordinates": coords} except (IndexError, ValueError): return None @@ -223,7 +223,7 @@ def _parse_georss_polygon(value, swap=True, dims=2): return None if len(ring) < 4: return None - return {'type': 'Polygon', 'coordinates': (ring,)} + return {"type": "Polygon", "coordinates": (ring,)} def _parse_georss_box(value, swap=True, dims=2): @@ -233,7 +233,7 @@ def _parse_georss_box(value, swap=True, dims=2): # first pair is the lower corner, the second is the upper corner. 
try: coords = list(_gen_georss_coords(value, swap, dims)) - return {'type': 'Box', 'coordinates': tuple(coords)} + return {"type": "Box", "coordinates": tuple(coords)} except (IndexError, ValueError): return None @@ -241,38 +241,443 @@ def _parse_georss_box(value, swap=True, dims=2): # The list of EPSG codes for geographic (latitude/longitude) coordinate # systems to support decoding of GeoRSS GML profiles. _geogCS = [ - 3819, 3821, 3824, 3889, 3906, 4001, 4002, 4003, 4004, 4005, 4006, 4007, 4008, - 4009, 4010, 4011, 4012, 4013, 4014, 4015, 4016, 4018, 4019, 4020, 4021, 4022, - 4023, 4024, 4025, 4027, 4028, 4029, 4030, 4031, 4032, 4033, 4034, 4035, 4036, - 4041, 4042, 4043, 4044, 4045, 4046, 4047, 4052, 4053, 4054, 4055, 4075, 4081, - 4120, 4121, 4122, 4123, 4124, 4125, 4126, 4127, 4128, 4129, 4130, 4131, 4132, - 4133, 4134, 4135, 4136, 4137, 4138, 4139, 4140, 4141, 4142, 4143, 4144, 4145, - 4146, 4147, 4148, 4149, 4150, 4151, 4152, 4153, 4154, 4155, 4156, 4157, 4158, - 4159, 4160, 4161, 4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 4170, 4171, - 4172, 4173, 4174, 4175, 4176, 4178, 4179, 4180, 4181, 4182, 4183, 4184, 4185, - 4188, 4189, 4190, 4191, 4192, 4193, 4194, 4195, 4196, 4197, 4198, 4199, 4200, - 4201, 4202, 4203, 4204, 4205, 4206, 4207, 4208, 4209, 4210, 4211, 4212, 4213, - 4214, 4215, 4216, 4218, 4219, 4220, 4221, 4222, 4223, 4224, 4225, 4226, 4227, - 4228, 4229, 4230, 4231, 4232, 4233, 4234, 4235, 4236, 4237, 4238, 4239, 4240, - 4241, 4242, 4243, 4244, 4245, 4246, 4247, 4248, 4249, 4250, 4251, 4252, 4253, - 4254, 4255, 4256, 4257, 4258, 4259, 4260, 4261, 4262, 4263, 4264, 4265, 4266, - 4267, 4268, 4269, 4270, 4271, 4272, 4273, 4274, 4275, 4276, 4277, 4278, 4279, - 4280, 4281, 4282, 4283, 4284, 4285, 4286, 4287, 4288, 4289, 4291, 4292, 4293, - 4294, 4295, 4296, 4297, 4298, 4299, 4300, 4301, 4302, 4303, 4304, 4306, 4307, - 4308, 4309, 4310, 4311, 4312, 4313, 4314, 4315, 4316, 4317, 4318, 4319, 4322, - 4324, 4326, 4463, 4470, 4475, 4483, 4490, 4555, 4558, 4600, 4601, 4602, 4603, - 4604, 4605, 4606, 4607, 4608, 4609, 4610, 4611, 4612, 4613, 4614, 4615, 4616, - 4617, 4618, 4619, 4620, 4621, 4622, 4623, 4624, 4625, 4626, 4627, 4628, 4629, - 4630, 4631, 4632, 4633, 4634, 4635, 4636, 4637, 4638, 4639, 4640, 4641, 4642, - 4643, 4644, 4645, 4646, 4657, 4658, 4659, 4660, 4661, 4662, 4663, 4664, 4665, - 4666, 4667, 4668, 4669, 4670, 4671, 4672, 4673, 4674, 4675, 4676, 4677, 4678, - 4679, 4680, 4681, 4682, 4683, 4684, 4685, 4686, 4687, 4688, 4689, 4690, 4691, - 4692, 4693, 4694, 4695, 4696, 4697, 4698, 4699, 4700, 4701, 4702, 4703, 4704, - 4705, 4706, 4707, 4708, 4709, 4710, 4711, 4712, 4713, 4714, 4715, 4716, 4717, - 4718, 4719, 4720, 4721, 4722, 4723, 4724, 4725, 4726, 4727, 4728, 4729, 4730, - 4731, 4732, 4733, 4734, 4735, 4736, 4737, 4738, 4739, 4740, 4741, 4742, 4743, - 4744, 4745, 4746, 4747, 4748, 4749, 4750, 4751, 4752, 4753, 4754, 4755, 4756, - 4757, 4758, 4759, 4760, 4761, 4762, 4763, 4764, 4765, 4801, 4802, 4803, 4804, - 4805, 4806, 4807, 4808, 4809, 4810, 4811, 4813, 4814, 4815, 4816, 4817, 4818, - 4819, 4820, 4821, 4823, 4824, 4901, 4902, 4903, 4904, 4979, + 3819, + 3821, + 3824, + 3889, + 3906, + 4001, + 4002, + 4003, + 4004, + 4005, + 4006, + 4007, + 4008, + 4009, + 4010, + 4011, + 4012, + 4013, + 4014, + 4015, + 4016, + 4018, + 4019, + 4020, + 4021, + 4022, + 4023, + 4024, + 4025, + 4027, + 4028, + 4029, + 4030, + 4031, + 4032, + 4033, + 4034, + 4035, + 4036, + 4041, + 4042, + 4043, + 4044, + 4045, + 4046, + 4047, + 4052, + 4053, + 4054, + 4055, + 4075, + 4081, + 4120, + 
4121, + 4122, + 4123, + 4124, + 4125, + 4126, + 4127, + 4128, + 4129, + 4130, + 4131, + 4132, + 4133, + 4134, + 4135, + 4136, + 4137, + 4138, + 4139, + 4140, + 4141, + 4142, + 4143, + 4144, + 4145, + 4146, + 4147, + 4148, + 4149, + 4150, + 4151, + 4152, + 4153, + 4154, + 4155, + 4156, + 4157, + 4158, + 4159, + 4160, + 4161, + 4162, + 4163, + 4164, + 4165, + 4166, + 4167, + 4168, + 4169, + 4170, + 4171, + 4172, + 4173, + 4174, + 4175, + 4176, + 4178, + 4179, + 4180, + 4181, + 4182, + 4183, + 4184, + 4185, + 4188, + 4189, + 4190, + 4191, + 4192, + 4193, + 4194, + 4195, + 4196, + 4197, + 4198, + 4199, + 4200, + 4201, + 4202, + 4203, + 4204, + 4205, + 4206, + 4207, + 4208, + 4209, + 4210, + 4211, + 4212, + 4213, + 4214, + 4215, + 4216, + 4218, + 4219, + 4220, + 4221, + 4222, + 4223, + 4224, + 4225, + 4226, + 4227, + 4228, + 4229, + 4230, + 4231, + 4232, + 4233, + 4234, + 4235, + 4236, + 4237, + 4238, + 4239, + 4240, + 4241, + 4242, + 4243, + 4244, + 4245, + 4246, + 4247, + 4248, + 4249, + 4250, + 4251, + 4252, + 4253, + 4254, + 4255, + 4256, + 4257, + 4258, + 4259, + 4260, + 4261, + 4262, + 4263, + 4264, + 4265, + 4266, + 4267, + 4268, + 4269, + 4270, + 4271, + 4272, + 4273, + 4274, + 4275, + 4276, + 4277, + 4278, + 4279, + 4280, + 4281, + 4282, + 4283, + 4284, + 4285, + 4286, + 4287, + 4288, + 4289, + 4291, + 4292, + 4293, + 4294, + 4295, + 4296, + 4297, + 4298, + 4299, + 4300, + 4301, + 4302, + 4303, + 4304, + 4306, + 4307, + 4308, + 4309, + 4310, + 4311, + 4312, + 4313, + 4314, + 4315, + 4316, + 4317, + 4318, + 4319, + 4322, + 4324, + 4326, + 4463, + 4470, + 4475, + 4483, + 4490, + 4555, + 4558, + 4600, + 4601, + 4602, + 4603, + 4604, + 4605, + 4606, + 4607, + 4608, + 4609, + 4610, + 4611, + 4612, + 4613, + 4614, + 4615, + 4616, + 4617, + 4618, + 4619, + 4620, + 4621, + 4622, + 4623, + 4624, + 4625, + 4626, + 4627, + 4628, + 4629, + 4630, + 4631, + 4632, + 4633, + 4634, + 4635, + 4636, + 4637, + 4638, + 4639, + 4640, + 4641, + 4642, + 4643, + 4644, + 4645, + 4646, + 4657, + 4658, + 4659, + 4660, + 4661, + 4662, + 4663, + 4664, + 4665, + 4666, + 4667, + 4668, + 4669, + 4670, + 4671, + 4672, + 4673, + 4674, + 4675, + 4676, + 4677, + 4678, + 4679, + 4680, + 4681, + 4682, + 4683, + 4684, + 4685, + 4686, + 4687, + 4688, + 4689, + 4690, + 4691, + 4692, + 4693, + 4694, + 4695, + 4696, + 4697, + 4698, + 4699, + 4700, + 4701, + 4702, + 4703, + 4704, + 4705, + 4706, + 4707, + 4708, + 4709, + 4710, + 4711, + 4712, + 4713, + 4714, + 4715, + 4716, + 4717, + 4718, + 4719, + 4720, + 4721, + 4722, + 4723, + 4724, + 4725, + 4726, + 4727, + 4728, + 4729, + 4730, + 4731, + 4732, + 4733, + 4734, + 4735, + 4736, + 4737, + 4738, + 4739, + 4740, + 4741, + 4742, + 4743, + 4744, + 4745, + 4746, + 4747, + 4748, + 4749, + 4750, + 4751, + 4752, + 4753, + 4754, + 4755, + 4756, + 4757, + 4758, + 4759, + 4760, + 4761, + 4762, + 4763, + 4764, + 4765, + 4801, + 4802, + 4803, + 4804, + 4805, + 4806, + 4807, + 4808, + 4809, + 4810, + 4811, + 4813, + 4814, + 4815, + 4816, + 4817, + 4818, + 4819, + 4820, + 4821, + 4823, + 4824, + 4901, + 4902, + 4903, + 4904, + 4979, ] diff --git a/lib/feedparser/namespaces/itunes.py b/lib/feedparser/namespaces/itunes.py index a50a0ea8..952f37a9 100644 --- a/lib/feedparser/namespaces/itunes.py +++ b/lib/feedparser/namespaces/itunes.py @@ -1,5 +1,5 @@ # Support for the iTunes format -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. 
# @@ -29,13 +29,12 @@ from ..util import FeedParserDict -class Namespace(object): +class Namespace: supported_namespaces = { # Canonical namespace - 'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes', - + "http://www.itunes.com/DTDs/PodCast-1.0.dtd": "itunes", # Extra namespace - 'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes', + "http://example.com/DTDs/PodCast-1.0.dtd": "itunes", } def _start_itunes_author(self, attrs_d): @@ -73,37 +72,42 @@ class Namespace(object): def _start_itunes_owner(self, attrs_d): self.inpublisher = 1 - self.push('publisher', 0) + self.push("publisher", 0) def _end_itunes_owner(self): - self.pop('publisher') + self.pop("publisher") self.inpublisher = 0 - self._sync_author_detail('publisher') + self._sync_author_detail("publisher") def _end_itunes_keywords(self): - for term in self.pop('itunes_keywords').split(','): + for term in self.pop("itunes_keywords").split(","): if term.strip(): - self._add_tag(term.strip(), 'http://www.itunes.com/', None) + self._add_tag(term.strip(), "http://www.itunes.com/", None) def _start_itunes_category(self, attrs_d): - self._add_tag(attrs_d.get('text'), 'http://www.itunes.com/', None) - self.push('category', 1) + self._add_tag(attrs_d.get("text"), "http://www.itunes.com/", None) + self.push("category", 1) def _start_itunes_image(self, attrs_d): - self.push('itunes_image', 0) - if attrs_d.get('href'): - self._get_context()['image'] = FeedParserDict({'href': attrs_d.get('href')}) - elif attrs_d.get('url'): - self._get_context()['image'] = FeedParserDict({'href': attrs_d.get('url')}) + self.push("itunes_image", 0) + if attrs_d.get("href"): + self._get_context()["image"] = FeedParserDict({"href": attrs_d.get("href")}) + elif attrs_d.get("url"): + self._get_context()["image"] = FeedParserDict({"href": attrs_d.get("url")}) + _start_itunes_link = _start_itunes_image def _end_itunes_block(self): - value = self.pop('itunes_block', 0) - self._get_context()['itunes_block'] = (value == 'yes' or value == 'Yes') and 1 or 0 + value = self.pop("itunes_block", 0) + self._get_context()["itunes_block"] = ( + (value == "yes" or value == "Yes") and 1 or 0 + ) def _end_itunes_explicit(self): - value = self.pop('itunes_explicit', 0) + value = self.pop("itunes_explicit", 0) # Convert 'yes' -> True, 'clean' to False, and any other value to None # False and None both evaluate as False, so the difference can be ignored # by applications that only need to know if the content is explicit. - self._get_context()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0] + self._get_context()["itunes_explicit"] = (None, False, True)[ + (value == "yes" and 2) or value == "clean" or 0 + ] diff --git a/lib/feedparser/namespaces/mediarss.py b/lib/feedparser/namespaces/mediarss.py index 2298ad2f..5ec4b67b 100644 --- a/lib/feedparser/namespaces/mediarss.py +++ b/lib/feedparser/namespaces/mediarss.py @@ -1,5 +1,5 @@ # Support for the Media RSS format -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. 
# @@ -29,24 +29,23 @@ from ..util import FeedParserDict -class Namespace(object): +class Namespace: supported_namespaces = { # Canonical namespace - 'http://search.yahoo.com/mrss/': 'media', - + "http://search.yahoo.com/mrss/": "media", # Old namespace (no trailing slash) - 'http://search.yahoo.com/mrss': 'media', + "http://search.yahoo.com/mrss": "media", } def _start_media_category(self, attrs_d): - attrs_d.setdefault('scheme', 'http://search.yahoo.com/mrss/category_schema') + attrs_d.setdefault("scheme", "http://search.yahoo.com/mrss/category_schema") self._start_category(attrs_d) def _end_media_category(self): self._end_category() def _end_media_keywords(self): - for term in self.pop('media_keywords').split(','): + for term in self.pop("media_keywords").split(","): if term.strip(): self._add_tag(term.strip(), None, None) @@ -64,26 +63,26 @@ class Namespace(object): def _start_media_rating(self, attrs_d): context = self._get_context() - context.setdefault('media_rating', attrs_d) - self.push('rating', 1) + context.setdefault("media_rating", attrs_d) + self.push("rating", 1) def _end_media_rating(self): - rating = self.pop('rating') + rating = self.pop("rating") if rating is not None and rating.strip(): context = self._get_context() - context['media_rating']['content'] = rating + context["media_rating"]["content"] = rating def _start_media_credit(self, attrs_d): context = self._get_context() - context.setdefault('media_credit', []) - context['media_credit'].append(attrs_d) - self.push('credit', 1) + context.setdefault("media_credit", []) + context["media_credit"].append(attrs_d) + self.push("credit", 1) def _end_media_credit(self): - credit = self.pop('credit') + credit = self.pop("credit") if credit is not None and credit.strip(): context = self._get_context() - context['media_credit'][-1]['content'] = credit + context["media_credit"][-1]["content"] = credit def _start_media_description(self, attrs_d): self._start_description(attrs_d) @@ -93,49 +92,51 @@ class Namespace(object): def _start_media_restriction(self, attrs_d): context = self._get_context() - context.setdefault('media_restriction', attrs_d) - self.push('restriction', 1) + context.setdefault("media_restriction", attrs_d) + self.push("restriction", 1) def _end_media_restriction(self): - restriction = self.pop('restriction') + restriction = self.pop("restriction") if restriction is not None and restriction.strip(): context = self._get_context() - context['media_restriction']['content'] = [cc.strip().lower() for cc in restriction.split(' ')] + context["media_restriction"]["content"] = [ + cc.strip().lower() for cc in restriction.split(" ") + ] def _start_media_license(self, attrs_d): context = self._get_context() - context.setdefault('media_license', attrs_d) - self.push('license', 1) + context.setdefault("media_license", attrs_d) + self.push("license", 1) def _end_media_license(self): - license_ = self.pop('license') + license_ = self.pop("license") if license_ is not None and license_.strip(): context = self._get_context() - context['media_license']['content'] = license_ + context["media_license"]["content"] = license_ def _start_media_content(self, attrs_d): context = self._get_context() - context.setdefault('media_content', []) - context['media_content'].append(attrs_d) + context.setdefault("media_content", []) + context["media_content"].append(attrs_d) def _start_media_thumbnail(self, attrs_d): context = self._get_context() - context.setdefault('media_thumbnail', []) - self.push('url', 1) # new - 
context['media_thumbnail'].append(attrs_d) + context.setdefault("media_thumbnail", []) + self.push("url", 1) # new + context["media_thumbnail"].append(attrs_d) def _end_media_thumbnail(self): - url = self.pop('url') + url = self.pop("url") context = self._get_context() if url is not None and url.strip(): - if 'url' not in context['media_thumbnail'][-1]: - context['media_thumbnail'][-1]['url'] = url + if "url" not in context["media_thumbnail"][-1]: + context["media_thumbnail"][-1]["url"] = url def _start_media_player(self, attrs_d): - self.push('media_player', 0) - self._get_context()['media_player'] = FeedParserDict(attrs_d) + self.push("media_player", 0) + self._get_context()["media_player"] = FeedParserDict(attrs_d) def _end_media_player(self): - value = self.pop('media_player') + value = self.pop("media_player") context = self._get_context() - context['media_player']['content'] = value + context["media_player"]["content"] = value diff --git a/lib/feedparser/namespaces/psc.py b/lib/feedparser/namespaces/psc.py index a440bd68..4a43d6df 100644 --- a/lib/feedparser/namespaces/psc.py +++ b/lib/feedparser/namespaces/psc.py @@ -1,5 +1,5 @@ # Support for the Podlove Simple Chapters format -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -32,36 +32,36 @@ import re from .. import util -class Namespace(object): +class Namespace: supported_namespaces = { - 'http://podlove.org/simple-chapters': 'psc', + "http://podlove.org/simple-chapters": "psc", } def __init__(self): # chapters will only be captured while psc_chapters_flag is True. self.psc_chapters_flag = False - super(Namespace, self).__init__() + super().__init__() def _start_psc_chapters(self, attrs_d): context = self._get_context() - if 'psc_chapters' not in context: + if "psc_chapters" not in context: self.psc_chapters_flag = True - attrs_d['chapters'] = [] - context['psc_chapters'] = util.FeedParserDict(attrs_d) + attrs_d["chapters"] = [] + context["psc_chapters"] = util.FeedParserDict(attrs_d) def _end_psc_chapters(self): self.psc_chapters_flag = False def _start_psc_chapter(self, attrs_d): if self.psc_chapters_flag: - start = self._get_attribute(attrs_d, 'start') - attrs_d['start_parsed'] = _parse_psc_chapter_start(start) + start = self._get_attribute(attrs_d, "start") + attrs_d["start_parsed"] = _parse_psc_chapter_start(start) - context = self._get_context()['psc_chapters'] - context['chapters'].append(util.FeedParserDict(attrs_d)) + context = self._get_context()["psc_chapters"] + context["chapters"].append(util.FeedParserDict(attrs_d)) -format_ = re.compile(r'^((\d{2}):)?(\d{2}):(\d{2})(\.(\d{3}))?$') +format_ = re.compile(r"^((\d{2}):)?(\d{2}):(\d{2})(\.(\d{3}))?$") def _parse_psc_chapter_start(start): @@ -71,4 +71,4 @@ def _parse_psc_chapter_start(start): _, h, m, s, _, ms = m.groups() h, m, s, ms = (int(h or 0), int(m), int(s), int(ms or 0)) - return datetime.timedelta(0, h*60*60 + m*60 + s, ms*1000) + return datetime.timedelta(0, h * 60 * 60 + m * 60 + s, ms * 1000) diff --git a/lib/feedparser/parsers/json.py b/lib/feedparser/parsers/json.py index ae43163c..36f714a5 100644 --- a/lib/feedparser/parsers/json.py +++ b/lib/feedparser/parsers/json.py @@ -34,37 +34,37 @@ from ..util import FeedParserDict class JSONParser: VERSIONS = { - 'https://jsonfeed.org/version/1': 'json1', - 'https://jsonfeed.org/version/1.1': 'json11', + "https://jsonfeed.org/version/1": "json1", + "https://jsonfeed.org/version/1.1": "json11", } FEED_FIELDS = ( - ('title', 'title'), - 
('icon', 'image'), - ('home_page_url', 'link'), - ('description', 'description'), + ("title", "title"), + ("icon", "image"), + ("home_page_url", "link"), + ("description", "description"), ) ITEM_FIELDS = ( - ('title', 'title'), - ('id', 'guid'), - ('url', 'link'), - ('summary', 'summary'), - ('external_url', 'source'), + ("title", "title"), + ("id", "guid"), + ("url", "link"), + ("summary", "summary"), + ("external_url", "source"), ) def __init__(self, baseuri=None, baselang=None, encoding=None): - self.baseuri = baseuri or '' + self.baseuri = baseuri or "" self.lang = baselang or None - self.encoding = encoding or 'utf-8' # character encoding + self.encoding = encoding or "utf-8" # character encoding self.version = None self.feeddata = FeedParserDict() self.namespacesInUse = [] self.entries = [] - def feed(self, data): - data = json.loads(data) + def feed(self, file): + data = json.load(file) - v = data.get('version', '') + v = data.get("version", "") try: self.version = self.VERSIONS[v] except KeyError: @@ -73,11 +73,11 @@ class JSONParser: for src, dst in self.FEED_FIELDS: if src in data: self.feeddata[dst] = data[src] - if 'author' in data: - self.parse_author(data['author'], self.feeddata) + if "author" in data: + self.parse_author(data["author"], self.feeddata) # TODO: hubs; expired has no RSS equivalent - self.entries = [self.parse_entry(e) for e in data['items']] + self.entries = [self.parse_entry(e) for e in data["items"]] def parse_entry(self, e): entry = FeedParserDict() @@ -85,49 +85,51 @@ class JSONParser: if src in e: entry[dst] = e[src] - if 'content_text' in e: - entry['content'] = c = FeedParserDict() - c['value'] = e['content_text'] - c['type'] = 'text' - elif 'content_html' in e: - entry['content'] = c = FeedParserDict() - c['value'] = sanitize_html(e['content_html'], self.encoding, 'application/json') - c['type'] = 'html' + if "content_text" in e: + entry["content"] = c = FeedParserDict() + c["value"] = e["content_text"] + c["type"] = "text" + elif "content_html" in e: + entry["content"] = c = FeedParserDict() + c["value"] = sanitize_html( + e["content_html"], self.encoding, "application/json" + ) + c["type"] = "html" - if 'date_published' in e: - entry['published'] = e['date_published'] - entry['published_parsed'] = _parse_date(e['date_published']) - if 'date_updated' in e: - entry['updated'] = e['date_modified'] - entry['updated_parsed'] = _parse_date(e['date_modified']) + if "date_published" in e: + entry["published"] = e["date_published"] + entry["published_parsed"] = _parse_date(e["date_published"]) + if "date_updated" in e: + entry["updated"] = e["date_modified"] + entry["updated_parsed"] = _parse_date(e["date_modified"]) - if 'tags' in e: - entry['category'] = e['tags'] + if "tags" in e: + entry["category"] = e["tags"] - if 'author' in e: - self.parse_author(e['author'], entry) + if "author" in e: + self.parse_author(e["author"], entry) - if 'attachments' in e: - entry['enclosures'] = [self.parse_attachment(a) for a in e['attachments']] + if "attachments" in e: + entry["enclosures"] = [self.parse_attachment(a) for a in e["attachments"]] return entry @staticmethod def parse_author(parent, dest): - dest['author_detail'] = detail = FeedParserDict() - if 'name' in parent: - dest['author'] = detail['name'] = parent['name'] - if 'url' in parent: - if parent['url'].startswith('mailto:'): - detail['email'] = parent['url'][7:] + dest["author_detail"] = detail = FeedParserDict() + if "name" in parent: + dest["author"] = detail["name"] = parent["name"] + if "url" in parent: 
+ if parent["url"].startswith("mailto:"): + detail["email"] = parent["url"][7:] else: - detail['href'] = parent['url'] + detail["href"] = parent["url"] @staticmethod def parse_attachment(attachment): enc = FeedParserDict() - enc['href'] = attachment['url'] - enc['type'] = attachment['mime_type'] - if 'size_in_bytes' in attachment: - enc['length'] = attachment['size_in_bytes'] + enc["href"] = attachment["url"] + enc["type"] = attachment["mime_type"] + if "size_in_bytes" in attachment: + enc["length"] = attachment["size_in_bytes"] return enc diff --git a/lib/feedparser/parsers/loose.py b/lib/feedparser/parsers/loose.py index 3f22bfb4..57285077 100644 --- a/lib/feedparser/parsers/loose.py +++ b/lib/feedparser/parsers/loose.py @@ -1,5 +1,5 @@ # The loose feed parser that interfaces with an SGML parsing library -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -26,52 +26,50 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. + class LooseXMLParser: contentparams = None def __init__(self, baseuri=None, baselang=None, encoding=None, entities=None): - self.baseuri = baseuri or '' + self.baseuri = baseuri or "" self.lang = baselang or None - self.encoding = encoding or 'utf-8' # character encoding + self.encoding = encoding or "utf-8" # character encoding self.entities = entities or {} super().__init__() @staticmethod def _normalize_attributes(kv): k = kv[0].lower() - v = k in ('rel', 'type') and kv[1].lower() or kv[1] + v = k in ("rel", "type") and kv[1].lower() or kv[1] # the sgml parser doesn't handle entities in attributes, nor # does it pass the attribute values through as unicode, while # strict xml parsers do -- account for this difference - v = v.replace('&', '&') + v = v.replace("&", "&") return k, v def decode_entities(self, element, data): - data = data.replace('<', '<') - data = data.replace('<', '<') - data = data.replace('<', '<') - data = data.replace('>', '>') - data = data.replace('>', '>') - data = data.replace('>', '>') - data = data.replace('&', '&') - data = data.replace('&', '&') - data = data.replace('"', '"') - data = data.replace('"', '"') - data = data.replace(''', ''') - data = data.replace(''', ''') - if not self.contentparams.get('type', 'xml').endswith('xml'): - data = data.replace('<', '<') - data = data.replace('>', '>') - data = data.replace('&', '&') - data = data.replace('"', '"') - data = data.replace(''', "'") - data = data.replace('/', '/') - data = data.replace('/', '/') + data = data.replace("<", "<") + data = data.replace("<", "<") + data = data.replace("<", "<") + data = data.replace(">", ">") + data = data.replace(">", ">") + data = data.replace(">", ">") + data = data.replace("&", "&") + data = data.replace("&", "&") + data = data.replace(""", """) + data = data.replace(""", """) + data = data.replace("'", "'") + data = data.replace("'", "'") + if not self.contentparams.get("type", "xml").endswith("xml"): + data = data.replace("<", "<") + data = data.replace(">", ">") + data = data.replace("&", "&") + data = data.replace(""", '"') + data = data.replace("'", "'") + data = data.replace("/", "/") + data = data.replace("/", "/") return data @staticmethod def strattrs(attrs): - return ''.join( - ' %s="%s"' % (n, v.replace('"', '"')) - for n, v in attrs - ) + return "".join(' {}="{}"'.format(n, v.replace('"', """)) for n, v in attrs) diff --git a/lib/feedparser/parsers/strict.py b/lib/feedparser/parsers/strict.py index 
7b0386e5..4f701985 100644 --- a/lib/feedparser/parsers/strict.py +++ b/lib/feedparser/parsers/strict.py @@ -1,5 +1,5 @@ # The strict feed parser that interfaces with an XML parsing library -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -34,15 +34,15 @@ class StrictXMLParser: self.bozo = 0 self.exc = None self.decls = {} - self.baseuri = baseuri or '' + self.baseuri = baseuri or "" self.lang = baselang self.encoding = encoding - super(StrictXMLParser, self).__init__() + super().__init__() @staticmethod def _normalize_attributes(kv): k = kv[0].lower() - v = k in ('rel', 'type') and kv[1].lower() or kv[1] + v = k in ("rel", "type") and kv[1].lower() or kv[1] return k, v def startPrefixMapping(self, prefix, uri): @@ -51,23 +51,29 @@ class StrictXMLParser: # Jython uses '' instead of None; standardize on None prefix = prefix or None self.track_namespace(prefix, uri) - if prefix and uri == 'http://www.w3.org/1999/xlink': - self.decls['xmlns:' + prefix] = uri + if prefix and uri == "http://www.w3.org/1999/xlink": + self.decls["xmlns:" + prefix] = uri def startElementNS(self, name, qname, attrs): namespace, localname = name - lowernamespace = str(namespace or '').lower() - if lowernamespace.find('backend.userland.com/rss') != -1: + lowernamespace = str(namespace or "").lower() + if lowernamespace.find("backend.userland.com/rss") != -1: # match any backend.userland.com namespace - namespace = 'http://backend.userland.com/rss' + namespace = "http://backend.userland.com/rss" lowernamespace = namespace - if qname and qname.find(':') > 0: - givenprefix = qname.split(':')[0] + if qname and qname.find(":") > 0: + givenprefix = qname.split(":")[0] else: givenprefix = None prefix = self._matchnamespaces.get(lowernamespace, givenprefix) - if givenprefix and (prefix is None or (prefix == '' and lowernamespace == '')) and givenprefix not in self.namespaces_in_use: - raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix) + if ( + givenprefix + and (prefix is None or (prefix == "" and lowernamespace == "")) + and givenprefix not in self.namespaces_in_use + ): + raise UndeclaredNamespace( + "'%s' is not associated with a namespace" % givenprefix + ) localname = str(localname).lower() # qname implementation is horribly broken in Python 2.1 (it @@ -78,24 +84,24 @@ class StrictXMLParser: # at all). Thanks to MatejC for helping me test this and # tirelessly telling me that it didn't work yet. 
attrsD, self.decls = self.decls, {} - if localname == 'math' and namespace == 'http://www.w3.org/1998/Math/MathML': - attrsD['xmlns'] = namespace - if localname == 'svg' and namespace == 'http://www.w3.org/2000/svg': - attrsD['xmlns'] = namespace + if localname == "math" and namespace == "http://www.w3.org/1998/Math/MathML": + attrsD["xmlns"] = namespace + if localname == "svg" and namespace == "http://www.w3.org/2000/svg": + attrsD["xmlns"] = namespace if prefix: - localname = prefix.lower() + ':' + localname + localname = prefix.lower() + ":" + localname elif namespace and not qname: # Expat for name, value in self.namespaces_in_use.items(): if name and value == namespace: - localname = name + ':' + localname + localname = name + ":" + localname break for (namespace, attrlocalname), attrvalue in attrs.items(): - lowernamespace = (namespace or '').lower() - prefix = self._matchnamespaces.get(lowernamespace, '') + lowernamespace = (namespace or "").lower() + prefix = self._matchnamespaces.get(lowernamespace, "") if prefix: - attrlocalname = prefix + ':' + attrlocalname + attrlocalname = prefix + ":" + attrlocalname attrsD[str(attrlocalname).lower()] = attrvalue for qname in attrs.getQNames(): attrsD[str(qname).lower()] = attrs.getValueByQName(qname) @@ -107,18 +113,18 @@ class StrictXMLParser: def endElementNS(self, name, qname): namespace, localname = name - lowernamespace = str(namespace or '').lower() - if qname and qname.find(':') > 0: - givenprefix = qname.split(':')[0] + lowernamespace = str(namespace or "").lower() + if qname and qname.find(":") > 0: + givenprefix = qname.split(":")[0] else: - givenprefix = '' + givenprefix = "" prefix = self._matchnamespaces.get(lowernamespace, givenprefix) if prefix: - localname = prefix + ':' + localname + localname = prefix + ":" + localname elif namespace and not qname: # Expat for name, value in self.namespaces_in_use.items(): if name and value == namespace: - localname = name + ':' + localname + localname = name + ":" + localname break localname = str(localname).lower() self.unknown_endtag(localname) diff --git a/lib/feedparser/py.typed b/lib/feedparser/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/lib/feedparser/sanitizer.py b/lib/feedparser/sanitizer.py index 5b729830..76be5700 100644 --- a/lib/feedparser/sanitizer.py +++ b/lib/feedparser/sanitizer.py @@ -1,4 +1,4 @@ -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -25,6 +25,8 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
+from __future__ import annotations + import re from .html import BaseHTMLProcessor @@ -33,705 +35,705 @@ from .urls import make_safe_absolute_uri class HTMLSanitizer(BaseHTMLProcessor): acceptable_elements = { - 'a', - 'abbr', - 'acronym', - 'address', - 'area', - 'article', - 'aside', - 'audio', - 'b', - 'big', - 'blockquote', - 'br', - 'button', - 'canvas', - 'caption', - 'center', - 'cite', - 'code', - 'col', - 'colgroup', - 'command', - 'datagrid', - 'datalist', - 'dd', - 'del', - 'details', - 'dfn', - 'dialog', - 'dir', - 'div', - 'dl', - 'dt', - 'em', - 'event-source', - 'fieldset', - 'figcaption', - 'figure', - 'font', - 'footer', - 'form', - 'h1', - 'h2', - 'h3', - 'h4', - 'h5', - 'h6', - 'header', - 'hr', - 'i', - 'img', - 'input', - 'ins', - 'kbd', - 'keygen', - 'label', - 'legend', - 'li', - 'm', - 'map', - 'menu', - 'meter', - 'multicol', - 'nav', - 'nextid', - 'noscript', - 'ol', - 'optgroup', - 'option', - 'output', - 'p', - 'pre', - 'progress', - 'q', - 's', - 'samp', - 'section', - 'select', - 'small', - 'sound', - 'source', - 'spacer', - 'span', - 'strike', - 'strong', - 'sub', - 'sup', - 'table', - 'tbody', - 'td', - 'textarea', - 'tfoot', - 'th', - 'thead', - 'time', - 'tr', - 'tt', - 'u', - 'ul', - 'var', - 'video', + "a", + "abbr", + "acronym", + "address", + "area", + "article", + "aside", + "audio", + "b", + "big", + "blockquote", + "br", + "button", + "canvas", + "caption", + "center", + "cite", + "code", + "col", + "colgroup", + "command", + "datagrid", + "datalist", + "dd", + "del", + "details", + "dfn", + "dialog", + "dir", + "div", + "dl", + "dt", + "em", + "event-source", + "fieldset", + "figcaption", + "figure", + "font", + "footer", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "header", + "hr", + "i", + "img", + "input", + "ins", + "kbd", + "keygen", + "label", + "legend", + "li", + "m", + "map", + "menu", + "meter", + "multicol", + "nav", + "nextid", + "noscript", + "ol", + "optgroup", + "option", + "output", + "p", + "pre", + "progress", + "q", + "s", + "samp", + "section", + "select", + "small", + "sound", + "source", + "spacer", + "span", + "strike", + "strong", + "sub", + "sup", + "table", + "tbody", + "td", + "textarea", + "tfoot", + "th", + "thead", + "time", + "tr", + "tt", + "u", + "ul", + "var", + "video", } acceptable_attributes = { - 'abbr', - 'accept', - 'accept-charset', - 'accesskey', - 'action', - 'align', - 'alt', - 'autocomplete', - 'autofocus', - 'axis', - 'background', - 'balance', - 'bgcolor', - 'bgproperties', - 'border', - 'bordercolor', - 'bordercolordark', - 'bordercolorlight', - 'bottompadding', - 'cellpadding', - 'cellspacing', - 'ch', - 'challenge', - 'char', - 'charoff', - 'charset', - 'checked', - 'choff', - 'cite', - 'class', - 'clear', - 'color', - 'cols', - 'colspan', - 'compact', - 'contenteditable', - 'controls', - 'coords', - 'data', - 'datafld', - 'datapagesize', - 'datasrc', - 'datetime', - 'default', - 'delay', - 'dir', - 'disabled', - 'draggable', - 'dynsrc', - 'enctype', - 'end', - 'face', - 'for', - 'form', - 'frame', - 'galleryimg', - 'gutter', - 'headers', - 'height', - 'hidden', - 'hidefocus', - 'high', - 'href', - 'hreflang', - 'hspace', - 'icon', - 'id', - 'inputmode', - 'ismap', - 'keytype', - 'label', - 'lang', - 'leftspacing', - 'list', - 'longdesc', - 'loop', - 'loopcount', - 'loopend', - 'loopstart', - 'low', - 'lowsrc', - 'max', - 'maxlength', - 'media', - 'method', - 'min', - 'multiple', - 'name', - 'nohref', - 'noshade', - 'nowrap', - 'open', - 'optimum', - 'pattern', - 'ping', - 
'point-size', - 'poster', - 'pqg', - 'preload', - 'prompt', - 'radiogroup', - 'readonly', - 'rel', - 'repeat-max', - 'repeat-min', - 'replace', - 'required', - 'rev', - 'rightspacing', - 'rows', - 'rowspan', - 'rules', - 'scope', - 'selected', - 'shape', - 'size', - 'span', - 'src', - 'start', - 'step', - 'style', - 'summary', - 'suppress', - 'tabindex', - 'target', - 'template', - 'title', - 'toppadding', - 'type', - 'unselectable', - 'urn', - 'usemap', - 'valign', - 'value', - 'variable', - 'volume', - 'vrml', - 'vspace', - 'width', - 'wrap', - 'xml:lang', + "abbr", + "accept", + "accept-charset", + "accesskey", + "action", + "align", + "alt", + "autocomplete", + "autofocus", + "axis", + "background", + "balance", + "bgcolor", + "bgproperties", + "border", + "bordercolor", + "bordercolordark", + "bordercolorlight", + "bottompadding", + "cellpadding", + "cellspacing", + "ch", + "challenge", + "char", + "charoff", + "charset", + "checked", + "choff", + "cite", + "class", + "clear", + "color", + "cols", + "colspan", + "compact", + "contenteditable", + "controls", + "coords", + "data", + "datafld", + "datapagesize", + "datasrc", + "datetime", + "default", + "delay", + "dir", + "disabled", + "draggable", + "dynsrc", + "enctype", + "end", + "face", + "for", + "form", + "frame", + "galleryimg", + "gutter", + "headers", + "height", + "hidden", + "hidefocus", + "high", + "href", + "hreflang", + "hspace", + "icon", + "id", + "inputmode", + "ismap", + "keytype", + "label", + "lang", + "leftspacing", + "list", + "longdesc", + "loop", + "loopcount", + "loopend", + "loopstart", + "low", + "lowsrc", + "max", + "maxlength", + "media", + "method", + "min", + "multiple", + "name", + "nohref", + "noshade", + "nowrap", + "open", + "optimum", + "pattern", + "ping", + "point-size", + "poster", + "pqg", + "preload", + "prompt", + "radiogroup", + "readonly", + "rel", + "repeat-max", + "repeat-min", + "replace", + "required", + "rev", + "rightspacing", + "rows", + "rowspan", + "rules", + "scope", + "selected", + "shape", + "size", + "span", + "src", + "start", + "step", + "style", + "summary", + "suppress", + "tabindex", + "target", + "template", + "title", + "toppadding", + "type", + "unselectable", + "urn", + "usemap", + "valign", + "value", + "variable", + "volume", + "vrml", + "vspace", + "width", + "wrap", + "xml:lang", } unacceptable_elements_with_end_tag = { - 'applet', - 'script', - 'style', + "applet", + "script", + "style", } acceptable_css_properties = { - 'azimuth', - 'background-color', - 'border-bottom-color', - 'border-collapse', - 'border-color', - 'border-left-color', - 'border-right-color', - 'border-top-color', - 'clear', - 'color', - 'cursor', - 'direction', - 'display', - 'elevation', - 'float', - 'font', - 'font-family', - 'font-size', - 'font-style', - 'font-variant', - 'font-weight', - 'height', - 'letter-spacing', - 'line-height', - 'overflow', - 'pause', - 'pause-after', - 'pause-before', - 'pitch', - 'pitch-range', - 'richness', - 'speak', - 'speak-header', - 'speak-numeral', - 'speak-punctuation', - 'speech-rate', - 'stress', - 'text-align', - 'text-decoration', - 'text-indent', - 'unicode-bidi', - 'vertical-align', - 'voice-family', - 'volume', - 'white-space', - 'width', + "azimuth", + "background-color", + "border-bottom-color", + "border-collapse", + "border-color", + "border-left-color", + "border-right-color", + "border-top-color", + "clear", + "color", + "cursor", + "direction", + "display", + "elevation", + "float", + "font", + "font-family", + "font-size", + "font-style", + 
"font-variant", + "font-weight", + "height", + "letter-spacing", + "line-height", + "overflow", + "pause", + "pause-after", + "pause-before", + "pitch", + "pitch-range", + "richness", + "speak", + "speak-header", + "speak-numeral", + "speak-punctuation", + "speech-rate", + "stress", + "text-align", + "text-decoration", + "text-indent", + "unicode-bidi", + "vertical-align", + "voice-family", + "volume", + "white-space", + "width", } # survey of common keywords found in feeds acceptable_css_keywords = { - '!important', - 'aqua', - 'auto', - 'black', - 'block', - 'blue', - 'bold', - 'both', - 'bottom', - 'brown', - 'center', - 'collapse', - 'dashed', - 'dotted', - 'fuchsia', - 'gray', - 'green', - 'italic', - 'left', - 'lime', - 'maroon', - 'medium', - 'navy', - 'none', - 'normal', - 'nowrap', - 'olive', - 'pointer', - 'purple', - 'red', - 'right', - 'silver', - 'solid', - 'teal', - 'top', - 'transparent', - 'underline', - 'white', - 'yellow', + "!important", + "aqua", + "auto", + "black", + "block", + "blue", + "bold", + "both", + "bottom", + "brown", + "center", + "collapse", + "dashed", + "dotted", + "fuchsia", + "gray", + "green", + "italic", + "left", + "lime", + "maroon", + "medium", + "navy", + "none", + "normal", + "nowrap", + "olive", + "pointer", + "purple", + "red", + "right", + "silver", + "solid", + "teal", + "top", + "transparent", + "underline", + "white", + "yellow", } valid_css_values = re.compile( - r'^(' - r'#[0-9a-f]+' # Hex values - r'|rgb\(\d+%?,\d*%?,?\d*%?\)?' # RGB values - r'|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?' # Sizes/widths - r')$' + r"^(" + r"#[0-9a-f]+" # Hex values + r"|rgb\(\d+%?,\d*%?,?\d*%?\)?" # RGB values + r"|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?" # Sizes/widths + r")$" ) mathml_elements = { - 'annotation', - 'annotation-xml', - 'maction', - 'maligngroup', - 'malignmark', - 'math', - 'menclose', - 'merror', - 'mfenced', - 'mfrac', - 'mglyph', - 'mi', - 'mlabeledtr', - 'mlongdiv', - 'mmultiscripts', - 'mn', - 'mo', - 'mover', - 'mpadded', - 'mphantom', - 'mprescripts', - 'mroot', - 'mrow', - 'ms', - 'mscarries', - 'mscarry', - 'msgroup', - 'msline', - 'mspace', - 'msqrt', - 'msrow', - 'mstack', - 'mstyle', - 'msub', - 'msubsup', - 'msup', - 'mtable', - 'mtd', - 'mtext', - 'mtr', - 'munder', - 'munderover', - 'none', - 'semantics', + "annotation", + "annotation-xml", + "maction", + "maligngroup", + "malignmark", + "math", + "menclose", + "merror", + "mfenced", + "mfrac", + "mglyph", + "mi", + "mlabeledtr", + "mlongdiv", + "mmultiscripts", + "mn", + "mo", + "mover", + "mpadded", + "mphantom", + "mprescripts", + "mroot", + "mrow", + "ms", + "mscarries", + "mscarry", + "msgroup", + "msline", + "mspace", + "msqrt", + "msrow", + "mstack", + "mstyle", + "msub", + "msubsup", + "msup", + "mtable", + "mtd", + "mtext", + "mtr", + "munder", + "munderover", + "none", + "semantics", } mathml_attributes = { - 'accent', - 'accentunder', - 'actiontype', - 'align', - 'alignmentscope', - 'altimg', - 'altimg-height', - 'altimg-valign', - 'altimg-width', - 'alttext', - 'bevelled', - 'charalign', - 'close', - 'columnalign', - 'columnlines', - 'columnspacing', - 'columnspan', - 'columnwidth', - 'crossout', - 'decimalpoint', - 'denomalign', - 'depth', - 'dir', - 'display', - 'displaystyle', - 'edge', - 'encoding', - 'equalcolumns', - 'equalrows', - 'fence', - 'fontstyle', - 'fontweight', - 'form', - 'frame', - 'framespacing', - 'groupalign', - 'height', - 'href', - 'id', - 'indentalign', - 'indentalignfirst', - 'indentalignlast', - 'indentshift', - 
'indentshiftfirst', - 'indentshiftlast', - 'indenttarget', - 'infixlinebreakstyle', - 'largeop', - 'length', - 'linebreak', - 'linebreakmultchar', - 'linebreakstyle', - 'lineleading', - 'linethickness', - 'location', - 'longdivstyle', - 'lquote', - 'lspace', - 'mathbackground', - 'mathcolor', - 'mathsize', - 'mathvariant', - 'maxsize', - 'minlabelspacing', - 'minsize', - 'movablelimits', - 'notation', - 'numalign', - 'open', - 'other', - 'overflow', - 'position', - 'rowalign', - 'rowlines', - 'rowspacing', - 'rowspan', - 'rquote', - 'rspace', - 'scriptlevel', - 'scriptminsize', - 'scriptsizemultiplier', - 'selection', - 'separator', - 'separators', - 'shift', - 'side', - 'src', - 'stackalign', - 'stretchy', - 'subscriptshift', - 'superscriptshift', - 'symmetric', - 'voffset', - 'width', - 'xlink:href', - 'xlink:show', - 'xlink:type', - 'xmlns', - 'xmlns:xlink', + "accent", + "accentunder", + "actiontype", + "align", + "alignmentscope", + "altimg", + "altimg-height", + "altimg-valign", + "altimg-width", + "alttext", + "bevelled", + "charalign", + "close", + "columnalign", + "columnlines", + "columnspacing", + "columnspan", + "columnwidth", + "crossout", + "decimalpoint", + "denomalign", + "depth", + "dir", + "display", + "displaystyle", + "edge", + "encoding", + "equalcolumns", + "equalrows", + "fence", + "fontstyle", + "fontweight", + "form", + "frame", + "framespacing", + "groupalign", + "height", + "href", + "id", + "indentalign", + "indentalignfirst", + "indentalignlast", + "indentshift", + "indentshiftfirst", + "indentshiftlast", + "indenttarget", + "infixlinebreakstyle", + "largeop", + "length", + "linebreak", + "linebreakmultchar", + "linebreakstyle", + "lineleading", + "linethickness", + "location", + "longdivstyle", + "lquote", + "lspace", + "mathbackground", + "mathcolor", + "mathsize", + "mathvariant", + "maxsize", + "minlabelspacing", + "minsize", + "movablelimits", + "notation", + "numalign", + "open", + "other", + "overflow", + "position", + "rowalign", + "rowlines", + "rowspacing", + "rowspan", + "rquote", + "rspace", + "scriptlevel", + "scriptminsize", + "scriptsizemultiplier", + "selection", + "separator", + "separators", + "shift", + "side", + "src", + "stackalign", + "stretchy", + "subscriptshift", + "superscriptshift", + "symmetric", + "voffset", + "width", + "xlink:href", + "xlink:show", + "xlink:type", + "xmlns", + "xmlns:xlink", } # svgtiny - foreignObject + linearGradient + radialGradient + stop svg_elements = { - 'a', - 'animate', - 'animateColor', - 'animateMotion', - 'animateTransform', - 'circle', - 'defs', - 'desc', - 'ellipse', - 'font-face', - 'font-face-name', - 'font-face-src', - 'foreignObject', - 'g', - 'glyph', - 'hkern', - 'line', - 'linearGradient', - 'marker', - 'metadata', - 'missing-glyph', - 'mpath', - 'path', - 'polygon', - 'polyline', - 'radialGradient', - 'rect', - 'set', - 'stop', - 'svg', - 'switch', - 'text', - 'title', - 'tspan', - 'use', + "a", + "animate", + "animateColor", + "animateMotion", + "animateTransform", + "circle", + "defs", + "desc", + "ellipse", + "font-face", + "font-face-name", + "font-face-src", + "foreignObject", + "g", + "glyph", + "hkern", + "line", + "linearGradient", + "marker", + "metadata", + "missing-glyph", + "mpath", + "path", + "polygon", + "polyline", + "radialGradient", + "rect", + "set", + "stop", + "svg", + "switch", + "text", + "title", + "tspan", + "use", } # svgtiny + class + opacity + offset + xmlns + xmlns:xlink svg_attributes = { - 'accent-height', - 'accumulate', - 'additive', - 'alphabetic', - 
'arabic-form', - 'ascent', - 'attributeName', - 'attributeType', - 'baseProfile', - 'bbox', - 'begin', - 'by', - 'calcMode', - 'cap-height', - 'class', - 'color', - 'color-rendering', - 'content', - 'cx', - 'cy', - 'd', - 'descent', - 'display', - 'dur', - 'dx', - 'dy', - 'end', - 'fill', - 'fill-opacity', - 'fill-rule', - 'font-family', - 'font-size', - 'font-stretch', - 'font-style', - 'font-variant', - 'font-weight', - 'from', - 'fx', - 'fy', - 'g1', - 'g2', - 'glyph-name', - 'gradientUnits', - 'hanging', - 'height', - 'horiz-adv-x', - 'horiz-origin-x', - 'id', - 'ideographic', - 'k', - 'keyPoints', - 'keySplines', - 'keyTimes', - 'lang', - 'marker-end', - 'marker-mid', - 'marker-start', - 'markerHeight', - 'markerUnits', - 'markerWidth', - 'mathematical', - 'max', - 'min', - 'name', - 'offset', - 'opacity', - 'orient', - 'origin', - 'overline-position', - 'overline-thickness', - 'panose-1', - 'path', - 'pathLength', - 'points', - 'preserveAspectRatio', - 'r', - 'refX', - 'refY', - 'repeatCount', - 'repeatDur', - 'requiredExtensions', - 'requiredFeatures', - 'restart', - 'rotate', - 'rx', - 'ry', - 'slope', - 'stemh', - 'stemv', - 'stop-color', - 'stop-opacity', - 'strikethrough-position', - 'strikethrough-thickness', - 'stroke', - 'stroke-dasharray', - 'stroke-dashoffset', - 'stroke-linecap', - 'stroke-linejoin', - 'stroke-miterlimit', - 'stroke-opacity', - 'stroke-width', - 'systemLanguage', - 'target', - 'text-anchor', - 'to', - 'transform', - 'type', - 'u1', - 'u2', - 'underline-position', - 'underline-thickness', - 'unicode', - 'unicode-range', - 'units-per-em', - 'values', - 'version', - 'viewBox', - 'visibility', - 'width', - 'widths', - 'x', - 'x-height', - 'x1', - 'x2', - 'xlink:actuate', - 'xlink:arcrole', - 'xlink:href', - 'xlink:role', - 'xlink:show', - 'xlink:title', - 'xlink:type', - 'xml:base', - 'xml:lang', - 'xml:space', - 'xmlns', - 'xmlns:xlink', - 'y', - 'y1', - 'y2', - 'zoomAndPan', + "accent-height", + "accumulate", + "additive", + "alphabetic", + "arabic-form", + "ascent", + "attributeName", + "attributeType", + "baseProfile", + "bbox", + "begin", + "by", + "calcMode", + "cap-height", + "class", + "color", + "color-rendering", + "content", + "cx", + "cy", + "d", + "descent", + "display", + "dur", + "dx", + "dy", + "end", + "fill", + "fill-opacity", + "fill-rule", + "font-family", + "font-size", + "font-stretch", + "font-style", + "font-variant", + "font-weight", + "from", + "fx", + "fy", + "g1", + "g2", + "glyph-name", + "gradientUnits", + "hanging", + "height", + "horiz-adv-x", + "horiz-origin-x", + "id", + "ideographic", + "k", + "keyPoints", + "keySplines", + "keyTimes", + "lang", + "marker-end", + "marker-mid", + "marker-start", + "markerHeight", + "markerUnits", + "markerWidth", + "mathematical", + "max", + "min", + "name", + "offset", + "opacity", + "orient", + "origin", + "overline-position", + "overline-thickness", + "panose-1", + "path", + "pathLength", + "points", + "preserveAspectRatio", + "r", + "refX", + "refY", + "repeatCount", + "repeatDur", + "requiredExtensions", + "requiredFeatures", + "restart", + "rotate", + "rx", + "ry", + "slope", + "stemh", + "stemv", + "stop-color", + "stop-opacity", + "strikethrough-position", + "strikethrough-thickness", + "stroke", + "stroke-dasharray", + "stroke-dashoffset", + "stroke-linecap", + "stroke-linejoin", + "stroke-miterlimit", + "stroke-opacity", + "stroke-width", + "systemLanguage", + "target", + "text-anchor", + "to", + "transform", + "type", + "u1", + "u2", + "underline-position", + "underline-thickness", 
+ "unicode", + "unicode-range", + "units-per-em", + "values", + "version", + "viewBox", + "visibility", + "width", + "widths", + "x", + "x-height", + "x1", + "x2", + "xlink:actuate", + "xlink:arcrole", + "xlink:href", + "xlink:role", + "xlink:show", + "xlink:title", + "xlink:type", + "xml:base", + "xml:lang", + "xml:space", + "xmlns", + "xmlns:xlink", + "y", + "y1", + "y2", + "zoomAndPan", } svg_attr_map = None svg_elem_map = None acceptable_svg_properties = { - 'fill', - 'fill-opacity', - 'fill-rule', - 'stroke', - 'stroke-linecap', - 'stroke-linejoin', - 'stroke-opacity', - 'stroke-width', + "fill", + "fill-opacity", + "fill-rule", + "stroke", + "stroke-linecap", + "stroke-linejoin", + "stroke-opacity", + "stroke-width", } - def __init__(self, encoding=None, _type='application/xhtml+xml'): + def __init__(self, encoding=None, _type="application/xhtml+xml"): super().__init__(encoding, _type) self.unacceptablestack = 0 @@ -752,17 +754,20 @@ class HTMLSanitizer(BaseHTMLProcessor): self.unacceptablestack += 1 # add implicit namespaces to html5 inline svg/mathml - if self._type.endswith('html'): - if not dict(attrs).get('xmlns'): - if tag == 'svg': - attrs.append(('xmlns', 'http://www.w3.org/2000/svg')) - if tag == 'math': - attrs.append(('xmlns', 'http://www.w3.org/1998/Math/MathML')) + if self._type.endswith("html"): + if not dict(attrs).get("xmlns"): + if tag == "svg": + attrs.append(("xmlns", "http://www.w3.org/2000/svg")) + if tag == "math": + attrs.append(("xmlns", "http://www.w3.org/1998/Math/MathML")) # not otherwise acceptable, perhaps it is MathML or SVG? - if tag == 'math' and ('xmlns', 'http://www.w3.org/1998/Math/MathML') in attrs: + if ( + tag == "math" + and ("xmlns", "http://www.w3.org/1998/Math/MathML") in attrs + ): self.mathmlOK += 1 - if tag == 'svg' and ('xmlns', 'http://www.w3.org/2000/svg') in attrs: + if tag == "svg" and ("xmlns", "http://www.w3.org/2000/svg") in attrs: self.svgOK += 1 # chose acceptable attributes based on tag class, else bail @@ -789,20 +794,20 @@ class HTMLSanitizer(BaseHTMLProcessor): # declare xlink namespace, if needed if self.mathmlOK or self.svgOK: - if any((a for a in attrs if a[0].startswith('xlink:'))): - if not ('xmlns:xlink', 'http://www.w3.org/1999/xlink') in attrs: - attrs.append(('xmlns:xlink', 'http://www.w3.org/1999/xlink')) + if any(a for a in attrs if a[0].startswith("xlink:")): + if not ("xmlns:xlink", "http://www.w3.org/1999/xlink") in attrs: + attrs.append(("xmlns:xlink", "http://www.w3.org/1999/xlink")) clean_attrs = [] for key, value in self.normalize_attrs(attrs): - if key == 'style' and 'style' in acceptable_attributes: + if key == "style" and "style" in acceptable_attributes: clean_value = self.sanitize_style(value) if clean_value: clean_attrs.append((key, clean_value)) elif key in acceptable_attributes: key = keymap.get(key, key) # make sure the uri uses an acceptable uri scheme - if key == 'href': + if key == "href": value = make_safe_absolute_uri(value) clean_attrs.append((key, value)) super().unknown_starttag(tag, clean_attrs) @@ -812,11 +817,11 @@ class HTMLSanitizer(BaseHTMLProcessor): if tag in self.unacceptable_elements_with_end_tag: self.unacceptablestack -= 1 if self.mathmlOK and tag in self.mathml_elements: - if tag == 'math' and self.mathmlOK: + if tag == "math" and self.mathmlOK: self.mathmlOK -= 1 elif self.svgOK and tag in self.svg_elements: tag = self.svg_elem_map.get(tag, tag) - if tag == 'svg' and self.svgOK: + if tag == "svg" and self.svgOK: self.svgOK -= 1 else: return @@ -834,35 +839,43 @@ class 
HTMLSanitizer(BaseHTMLProcessor): def sanitize_style(self, style): # disallow urls - style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) + style = re.compile(r"url\s*\(\s*[^\s)]+?\s*\)\s*").sub(" ", style) # gauntlet - if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): - return '' + if not re.match( + r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", + style, + ): + return "" # This replaced a regexp that used re.match and was prone to # pathological back-tracking. - if re.sub(r"\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip(): - return '' + if re.sub(r"\s*[-\w]+\s*:\s*[^:;]*;?", "", style).strip(): + return "" clean = [] for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style): if not value: continue if prop.lower() in self.acceptable_css_properties: - clean.append(prop + ': ' + value + ';') - elif prop.split('-')[0].lower() in ['background', 'border', 'margin', 'padding']: + clean.append(prop + ": " + value + ";") + elif prop.split("-")[0].lower() in [ + "background", + "border", + "margin", + "padding", + ]: for keyword in value.split(): if ( - keyword not in self.acceptable_css_keywords - and not self.valid_css_values.match(keyword) + keyword not in self.acceptable_css_keywords + and not self.valid_css_values.match(keyword) ): break else: - clean.append(prop + ': ' + value + ';') + clean.append(prop + ": " + value + ";") elif self.svgOK and prop.lower() in self.acceptable_svg_properties: - clean.append(prop + ': ' + value + ';') + clean.append(prop + ": " + value + ";") - return ' '.join(clean) + return " ".join(clean) def parse_comment(self, i, report=1): ret = super().parse_comment(i, report) @@ -870,7 +883,7 @@ class HTMLSanitizer(BaseHTMLProcessor): return ret # if ret == -1, this may be a malicious attempt to circumvent # sanitization, or a page-destroying unclosed comment - match = re.compile(r'--[^>]*>').search(self.rawdata, i+4) + match = re.compile(r"--[^>]*>").search(self.rawdata, i + 4) if match: return match.end() # unclosed comment; deliberately fail to handle_data() @@ -879,20 +892,20 @@ class HTMLSanitizer(BaseHTMLProcessor): def sanitize_html(html_source, encoding, _type): p = HTMLSanitizer(encoding, _type) - html_source = html_source.replace(' -RE_ENTITY_PATTERN = re.compile(br'^\s*]*?)>', re.MULTILINE) +RE_ENTITY_PATTERN = re.compile(rb"^\s*]*?)>", re.MULTILINE) # Match XML DOCTYPE declarations. # Example: -RE_DOCTYPE_PATTERN = re.compile(br'^\s*]*?)>', re.MULTILINE) +RE_DOCTYPE_PATTERN = re.compile(rb"^\s*]*?)>", re.MULTILINE) # Match safe entity declarations. # This will allow hexadecimal character references through, @@ -900,51 +913,62 @@ RE_DOCTYPE_PATTERN = re.compile(br'^\s*]*?)>', re.MULTILINE) # Example: cubed "³" # Example: copyright "(C)" # Forbidden: explode1 "&explode2;&explode2;" -RE_SAFE_ENTITY_PATTERN = re.compile(br'\s+(\w+)\s+"(&#\w+;|[^&"]*)"') +RE_SAFE_ENTITY_PATTERN = re.compile(rb'\s+(\w+)\s+"(&#\w+;|[^&"]*)"') -def replace_doctype(data): - """Strips and replaces the DOCTYPE, returns (rss_version, stripped_data) +def replace_doctype(data: bytes) -> tuple[str | None, bytes, dict[str, str]]: + """Strip and replaces the DOCTYPE. - rss_version may be 'rss091n' or None - stripped_data is the same XML document with a replaced DOCTYPE + One RSS format -- Netscape's RSS 0.91 -- is identified within the XML declaration. + Therefore, this function must identify that version while replacing the DOCTYPE. 
+ + As a convenience to the loose XML parser, entities are pre-computed and returned. + + The tuple that is returned has the following values, in order: + + 1. The version extracted from the XML DOCTYPE. + The value will either be "rss091n" or None. + 2. Binary XML content with a replaced DOCTYPE. + 3. A dictionary of entities and replacements. """ + # Verify this looks like an XML feed. + if not re.match(rb"^\s*<", data): + return None, data, {} + # Divide the document into two groups by finding the location # of the first element that doesn't begin with '\n\n]>' + replacement = ( + b"\n\n]>" + ) data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data # Precompute the safe entities for the loose parser. - safe_entities = { - k.decode('utf-8'): v.decode('utf-8') + entities = { + k.decode("utf-8"): v.decode("utf-8") for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement) } - return version, data, safe_entities + return version, data, entities diff --git a/lib/feedparser/sgml.py b/lib/feedparser/sgml.py index b5dbdbc3..21735431 100644 --- a/lib/feedparser/sgml.py +++ b/lib/feedparser/sgml.py @@ -1,4 +1,4 @@ -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -27,20 +27,20 @@ import re -import sgmllib # type: ignore[import] +import sgmllib3k as sgmllib __all__ = [ - 'sgmllib', - 'charref', - 'tagfind', - 'attrfind', - 'entityref', - 'incomplete', - 'interesting', - 'shorttag', - 'shorttagopen', - 'starttagopen', - 'endbracket', + "sgmllib", + "charref", + "tagfind", + "attrfind", + "entityref", + "incomplete", + "interesting", + "shorttag", + "shorttagopen", + "starttagopen", + "endbracket", ] # sgmllib defines a number of module-level regular expressions that are @@ -49,20 +49,20 @@ __all__ = [ # names, and the compiled code objects of several sgmllib.SGMLParser # methods are copied into _BaseHTMLProcessor so that they execute in # feedparser's scope instead of sgmllib's scope. -charref = re.compile(r'&#(\d+|[xX][0-9a-fA-F]+);') -tagfind = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*') +charref = re.compile(r"&#(\d+|[xX][0-9a-fA-F]+);") +tagfind = re.compile(r"[a-zA-Z][-_.:a-zA-Z0-9]*") attrfind = re.compile( r"""\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*""" r"""('[^']*'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$()_#=~'"@]*))?""" ) # Unfortunately, these must be copied over to prevent NameError exceptions -entityref = sgmllib.entityref -incomplete = sgmllib.incomplete -interesting = sgmllib.interesting -shorttag = sgmllib.shorttag -shorttagopen = sgmllib.shorttagopen -starttagopen = sgmllib.starttagopen +entityref = sgmllib.SGMLParser.entityref +incomplete = sgmllib.SGMLParser.incomplete +interesting = sgmllib.SGMLParser.interesting +shorttag = sgmllib.SGMLParser.shorttag +shorttagopen = sgmllib.SGMLParser.shorttagopen +starttagopen = sgmllib.SGMLParser.starttagopen class _EndBracketRegEx: @@ -70,12 +70,12 @@ class _EndBracketRegEx: # Overriding the built-in sgmllib.endbracket regex allows the # parser to find angle brackets embedded in element attributes. 
self.endbracket = re.compile( - r'(' + r"(" r"""[^'"<>]""" r"""|"[^"]*"(?=>|/|\s|\w+=)""" r"""|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])""" r"""|.*?(?=[<>]""" - r')' + r")" ) def search(self, target, index=0): diff --git a/lib/feedparser/urls.py b/lib/feedparser/urls.py index 623f030a..2579443b 100644 --- a/lib/feedparser/urls.py +++ b/lib/feedparser/urls.py @@ -1,4 +1,4 @@ -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -37,103 +37,116 @@ from .html import BaseHTMLProcessor # https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme # Many more will likely need to be added! ACCEPTABLE_URI_SCHEMES = ( - 'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet', - 'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', - 'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', - 'wais', + "file", + "ftp", + "gopher", + "h323", + "hdl", + "http", + "https", + "imap", + "magnet", + "mailto", + "mms", + "news", + "nntp", + "prospero", + "rsync", + "rtsp", + "rtspu", + "sftp", + "shttp", + "sip", + "sips", + "snews", + "svn", + "svn+ssh", + "telnet", + "wais", # Additional common-but-unofficial schemes - 'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs', - 'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg', + "aim", + "callto", + "cvs", + "facetime", + "feed", + "git", + "gtalk", + "irc", + "ircs", + "irc6", + "itms", + "mms", + "msnim", + "skype", + "ssh", + "smb", + "svn", + "ymsg", ) -_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)') +_urifixer = re.compile("^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)") def _urljoin(base, uri): - uri = _urifixer.sub(r'\1\3', uri) + uri = _urifixer.sub(r"\1\3", uri) try: uri = urllib.parse.urljoin(base, uri) except ValueError: - uri = '' + uri = "" return uri -def convert_to_idn(url): - """Convert a URL to IDN notation""" - # this function should only be called with a unicode string - # strategy: if the host cannot be encoded in ascii, then - # it'll be necessary to encode it in idn form - parts = list(urllib.parse.urlsplit(url)) - try: - parts[1].encode('ascii') - except UnicodeEncodeError: - # the url needs to be converted to idn notation - host = parts[1].rsplit(':', 1) - newhost = [] - port = '' - if len(host) == 2: - port = host.pop() - for h in host[0].split('.'): - newhost.append(h.encode('idna').decode('utf-8')) - parts[1] = '.'.join(newhost) - if port: - parts[1] += ':' + port - return urllib.parse.urlunsplit(parts) - else: - return url - - def make_safe_absolute_uri(base, rel=None): # bail if ACCEPTABLE_URI_SCHEMES is empty if not ACCEPTABLE_URI_SCHEMES: - return _urljoin(base, rel or '') + return _urljoin(base, rel or "") if not base: - return rel or '' + return rel or "" if not rel: try: scheme = urllib.parse.urlparse(base)[0] except ValueError: - return '' + return "" if not scheme or scheme in ACCEPTABLE_URI_SCHEMES: return base - return '' + return "" uri = _urljoin(base, rel) - if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES: - return '' + if uri.strip().split(":", 1)[0] not in ACCEPTABLE_URI_SCHEMES: + return "" return uri class RelativeURIResolver(BaseHTMLProcessor): relative_uris = { - ('a', 'href'), - ('applet', 'codebase'), - ('area', 'href'), - ('audio', 'src'), - ('blockquote', 'cite'), - ('body', 'background'), - ('del', 'cite'), - ('form', 'action'), - ('frame', 'longdesc'), - ('frame', 'src'), - ('iframe', 'longdesc'), - ('iframe', 'src'), - ('head', 
'profile'), - ('img', 'longdesc'), - ('img', 'src'), - ('img', 'usemap'), - ('input', 'src'), - ('input', 'usemap'), - ('ins', 'cite'), - ('link', 'href'), - ('object', 'classid'), - ('object', 'codebase'), - ('object', 'data'), - ('object', 'usemap'), - ('q', 'cite'), - ('script', 'src'), - ('source', 'src'), - ('video', 'poster'), - ('video', 'src'), + ("a", "href"), + ("applet", "codebase"), + ("area", "href"), + ("audio", "src"), + ("blockquote", "cite"), + ("body", "background"), + ("del", "cite"), + ("form", "action"), + ("frame", "longdesc"), + ("frame", "src"), + ("iframe", "longdesc"), + ("iframe", "src"), + ("head", "profile"), + ("img", "longdesc"), + ("img", "src"), + ("img", "usemap"), + ("input", "src"), + ("input", "usemap"), + ("ins", "cite"), + ("link", "href"), + ("object", "classid"), + ("object", "codebase"), + ("object", "data"), + ("object", "usemap"), + ("q", "cite"), + ("script", "src"), + ("source", "src"), + ("video", "poster"), + ("video", "src"), } def __init__(self, baseuri, encoding, _type): @@ -145,8 +158,14 @@ class RelativeURIResolver(BaseHTMLProcessor): def unknown_starttag(self, tag, attrs): attrs = self.normalize_attrs(attrs) - attrs = [(key, ((tag, key) in self.relative_uris) and self.resolve_uri(value) or value) for key, value in attrs] - super(RelativeURIResolver, self).unknown_starttag(tag, attrs) + attrs = [ + ( + key, + ((tag, key) in self.relative_uris) and self.resolve_uri(value) or value, + ) + for key, value in attrs + ] + super().unknown_starttag(tag, attrs) def resolve_relative_uris(html_source, base_uri, encoding, type_): diff --git a/lib/feedparser/util.py b/lib/feedparser/util.py index 9e1516cf..bcf0f61e 100644 --- a/lib/feedparser/util.py +++ b/lib/feedparser/util.py @@ -1,4 +1,4 @@ -# Copyright 2010-2022 Kurt McKee +# Copyright 2010-2023 Kurt McKee # Copyright 2002-2008 Mark Pilgrim # All rights reserved. # @@ -30,22 +30,22 @@ import warnings class FeedParserDict(dict): keymap = { - 'channel': 'feed', - 'items': 'entries', - 'guid': 'id', - 'date': 'updated', - 'date_parsed': 'updated_parsed', - 'description': ['summary', 'subtitle'], - 'description_detail': ['summary_detail', 'subtitle_detail'], - 'url': ['href'], - 'modified': 'updated', - 'modified_parsed': 'updated_parsed', - 'issued': 'published', - 'issued_parsed': 'published_parsed', - 'copyright': 'rights', - 'copyright_detail': 'rights_detail', - 'tagline': 'subtitle', - 'tagline_detail': 'subtitle_detail', + "channel": "feed", + "items": "entries", + "guid": "id", + "date": "updated", + "date_parsed": "updated_parsed", + "description": ["summary", "subtitle"], + "description_detail": ["summary_detail", "subtitle_detail"], + "url": ["href"], + "modified": "updated", + "modified_parsed": "updated_parsed", + "issued": "published", + "issued_parsed": "published_parsed", + "copyright": "rights", + "copyright_detail": "rights_detail", + "tagline": "subtitle", + "tagline_detail": "subtitle_detail", } def __getitem__(self, key, _stacklevel=2): @@ -53,28 +53,29 @@ class FeedParserDict(dict): :return: A :class:`FeedParserDict`. 
""" - if key == 'category': + if key == "category": try: - return dict.__getitem__(self, 'tags')[0]['term'] + return dict.__getitem__(self, "tags")[0]["term"] except IndexError: raise KeyError("object doesn't have key 'category'") - elif key == 'enclosures': + elif key == "enclosures": return [ - FeedParserDict([(name, value) for (name, value) in link.items() if name != 'rel']) - for link in dict.__getitem__(self, 'links') - if link['rel'] == 'enclosure' + FeedParserDict( + [(name, value) for (name, value) in link.items() if name != "rel"] + ) + for link in dict.__getitem__(self, "links") + if link["rel"] == "enclosure" ] - elif key == 'license': - for link in dict.__getitem__(self, 'links'): - if link['rel'] == 'license' and 'href' in link: - return link['href'] - elif key == 'updated': + elif key == "license": + for link in dict.__getitem__(self, "links"): + if link["rel"] == "license" and "href" in link: + return link["href"] + elif key == "updated": # Temporarily help developers out by keeping the old # broken behavior that was reported in issue 310. # This fix was proposed in issue 328. - if ( - not dict.__contains__(self, 'updated') - and dict.__contains__(self, 'published') + if not dict.__contains__(self, "updated") and dict.__contains__( + self, "published" ): warnings.warn( "To avoid breaking existing software while " @@ -85,12 +86,11 @@ class FeedParserDict(dict): DeprecationWarning, stacklevel=_stacklevel, ) - return dict.__getitem__(self, 'published') - return dict.__getitem__(self, 'updated') - elif key == 'updated_parsed': - if ( - not dict.__contains__(self, 'updated_parsed') - and dict.__contains__(self, 'published_parsed') + return dict.__getitem__(self, "published") + return dict.__getitem__(self, "updated") + elif key == "updated_parsed": + if not dict.__contains__(self, "updated_parsed") and dict.__contains__( + self, "published_parsed" ): warnings.warn( "To avoid breaking existing software while " @@ -101,8 +101,8 @@ class FeedParserDict(dict): DeprecationWarning, stacklevel=_stacklevel, ) - return dict.__getitem__(self, 'published_parsed') - return dict.__getitem__(self, 'updated_parsed') + return dict.__getitem__(self, "published_parsed") + return dict.__getitem__(self, "updated_parsed") else: realkey = self.keymap.get(key, key) if isinstance(realkey, list): @@ -114,7 +114,7 @@ class FeedParserDict(dict): return dict.__getitem__(self, key) def __contains__(self, key): - if key in ('updated', 'updated_parsed'): + if key in ("updated", "updated_parsed"): # Temporarily help developers out by keeping the old # broken behavior that was reported in issue 310. # This fix was proposed in issue 328.