Merge branch 'feature/UpdateFeedparser' into dev

This commit is contained in:
JackDandy 2023-04-27 12:37:26 +01:00
commit 0794ca330f
33 changed files with 3280 additions and 2349 deletions

View file

@ -2,6 +2,7 @@
* Update attr 22.2.0 (a9960de) to 22.2.0 (683d056) * Update attr 22.2.0 (a9960de) to 22.2.0 (683d056)
* Update diskcache 5.4.0 (1cb1425) to 5.6.1 (4d30686) * Update diskcache 5.4.0 (1cb1425) to 5.6.1 (4d30686)
* Update feedparser 6.0.10 (5fcb3ae) to 6.0.10 (6d032b8)
* Update filelock 3.9.0 (ce3e891) to 3.11.0 (d3241b9) * Update filelock 3.9.0 (ce3e891) to 3.11.0 (d3241b9)
* Update Msgpack 1.0.4 (b5acfd5) to 1.0.5 (0516c2c) * Update Msgpack 1.0.4 (b5acfd5) to 1.0.5 (0516c2c)
* Update Requests library 2.28.1 (ec553c2) to 2.29.0 (87d63de) * Update Requests library 2.28.1 (ec553c2) to 2.29.0 (87d63de)

View file

@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -27,12 +27,18 @@
from .api import parse from .api import parse
from .datetimes import registerDateHandler from .datetimes import registerDateHandler
from .exceptions import * from .exceptions import (
CharacterEncodingOverride,
CharacterEncodingUnknown,
FeedparserError,
NonXMLContentType,
UndeclaredNamespace,
)
from .util import FeedParserDict from .util import FeedParserDict
__author__ = 'Kurt McKee <contactme@kurtmckee.org>' __author__ = "Kurt McKee <contactme@kurtmckee.org>"
__license__ = 'BSD 2-clause' __license__ = "BSD 2-clause"
__version__ = '6.0.10' __version__ = "6.0.10"
# HTTP "User-Agent" header to send to servers when downloading feeds. # HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should # If you are embedding feedparser in a larger application, you should
@ -46,3 +52,20 @@ RESOLVE_RELATIVE_URIS = 1
# If you want feedparser to automatically sanitize all potentially unsafe # If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1. # HTML content, set this to 1.
SANITIZE_HTML = 1 SANITIZE_HTML = 1
# If you want feedparser to use only a prefix of the feed to detect encodings
# (uses less memory), set this to 1.
OPTIMISTIC_ENCODING_DETECTION = 1
__all__ = (
"parse",
"registerDateHandler",
"FeedParserDict",
"FeedparserError",
"CharacterEncodingOverride",
"CharacterEncodingUnknown",
"NonXMLContentType",
"UndeclaredNamespace",
)

View file

@ -1,5 +1,5 @@
# The public API for feedparser # The public API for feedparser
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -26,29 +26,23 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE. # POSSIBILITY OF SUCH DAMAGE.
import datetime
import io import io
import time
from typing import Dict, List, Union
import urllib.error import urllib.error
import urllib.parse import urllib.parse
import xml.sax import xml.sax
from typing import IO, Dict, Optional, Union
import sgmllib3k as sgmllib
from .datetimes import registerDateHandler, _parse_date
from .encodings import convert_to_utf8
from .html import BaseHTMLProcessor
from . import http from . import http
from .encodings import MissingEncoding, convert_file_to_utf8
from .html import BaseHTMLProcessor
from .mixin import XMLParserMixin from .mixin import XMLParserMixin
from .parsers.json import JSONParser
from .parsers.loose import LooseXMLParser from .parsers.loose import LooseXMLParser
from .parsers.strict import StrictXMLParser from .parsers.strict import StrictXMLParser
from .parsers.json import JSONParser
from .sanitizer import replace_doctype from .sanitizer import replace_doctype
from .urls import convert_to_idn, make_safe_absolute_uri from .urls import make_safe_absolute_uri
from .util import FeedParserDict from .util import FeedParserDict
# List of preferred XML parsers, by SAX driver name. These will be tried first, # List of preferred XML parsers, by SAX driver name. These will be tried first,
# but if they're not installed, Python will keep searching through its own list # but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need. # of pre-installed parsers until it finds one that supports everything we need.
@ -57,27 +51,30 @@ PREFERRED_XML_PARSERS = ["drv_libxml2"]
_XML_AVAILABLE = True _XML_AVAILABLE = True
SUPPORTED_VERSIONS = { SUPPORTED_VERSIONS = {
'': 'unknown', "": "unknown",
'rss090': 'RSS 0.90', "rss090": "RSS 0.90",
'rss091n': 'RSS 0.91 (Netscape)', "rss091n": "RSS 0.91 (Netscape)",
'rss091u': 'RSS 0.91 (Userland)', "rss091u": "RSS 0.91 (Userland)",
'rss092': 'RSS 0.92', "rss092": "RSS 0.92",
'rss093': 'RSS 0.93', "rss093": "RSS 0.93",
'rss094': 'RSS 0.94', "rss094": "RSS 0.94",
'rss20': 'RSS 2.0', "rss20": "RSS 2.0",
'rss10': 'RSS 1.0', "rss10": "RSS 1.0",
'rss': 'RSS (unknown version)', "rss": "RSS (unknown version)",
'atom01': 'Atom 0.1', "atom01": "Atom 0.1",
'atom02': 'Atom 0.2', "atom02": "Atom 0.2",
'atom03': 'Atom 0.3', "atom03": "Atom 0.3",
'atom10': 'Atom 1.0', "atom10": "Atom 1.0",
'atom': 'Atom (unknown version)', "atom": "Atom (unknown version)",
'cdf': 'CDF', "cdf": "CDF",
'json1': 'JSON feed 1', "json1": "JSON feed 1",
} }
def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result): def _open_resource(
url_file_stream_or_string,
result,
):
"""URL, filename, or string --> stream """URL, filename, or string --> stream
This function lets you define parsers that take any input source This function lets you define parsers that take any input source
@ -86,43 +83,44 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
to have all the basic stdio read methods (read, readline, readlines). to have all the basic stdio read methods (read, readline, readlines).
Just .close() the object when you're done with it. Just .close() the object when you're done with it.
If the etag argument is supplied, it will be used as the value of an :return: A seekable, readable file object.
If-None-Match request header.
If the modified argument is supplied, it can be a tuple of 9 integers
(as returned by gmtime() in the standard Python time module) or a date
string in any format supported by feedparser. Regardless, it MUST
be in GMT (Greenwich Mean Time). It will be reformatted into an
RFC 1123-compliant date and used as the value of an If-Modified-Since
request header.
If the agent argument is supplied, it will be used as the value of a
User-Agent request header.
If the referrer argument is supplied, it will be used as the value of a
Referer[sic] request header.
If handlers is supplied, it is a list of handlers used to build a
urllib2 opener.
if request_headers is supplied it is a dictionary of HTTP request headers
that will override the values generated by FeedParser.
:return: A bytes object.
""" """
if hasattr(url_file_stream_or_string, 'read'): # Some notes on the history of the implementation of _open_resource().
return url_file_stream_or_string.read() #
# parse() might need to go over the feed content twice:
# if the strict parser fails, it tries again with the loose parser.
#
# In 5.2.0, this returned an open file, to be read() by parse().
# By 6.0.8, this returned bytes directly.
#
# Since #296 (>6.0.8), this once again returns an open file
# (to reduce memory usage, see convert_file_to_utf8() for details).
# However, to accommodate parse() needing the content twice,
# the returned file is guaranteed to be seekable.
# (If the underlying resource is not seekable,
# the content is read and wrapped in a io.BytesIO/StringIO.)
if isinstance(url_file_stream_or_string, str) \ if callable(getattr(url_file_stream_or_string, "read", None)):
and urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'): if callable(getattr(url_file_stream_or_string, "seekable", None)):
return http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result) if url_file_stream_or_string.seekable():
return url_file_stream_or_string
return _to_in_memory_file(url_file_stream_or_string.read())
looks_like_url = isinstance(
url_file_stream_or_string, str
) and urllib.parse.urlparse(url_file_stream_or_string)[0] in (
"http",
"https",
)
if looks_like_url:
data = http.get(url_file_stream_or_string, result)
return io.BytesIO(data)
# try to open with native open function (if url_file_stream_or_string is a filename) # try to open with native open function (if url_file_stream_or_string is a filename)
try: try:
with open(url_file_stream_or_string, 'rb') as f: return open(url_file_stream_or_string, "rb")
data = f.read() except (OSError, TypeError, ValueError):
except (IOError, UnicodeEncodeError, TypeError, ValueError):
# if url_file_stream_or_string is a str object that # if url_file_stream_or_string is a str object that
# cannot be converted to the encoding returned by # cannot be converted to the encoding returned by
# sys.getfilesystemencoding(), a UnicodeEncodeError # sys.getfilesystemencoding(), a UnicodeEncodeError
@ -131,33 +129,32 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
# (such as an XML document encoded in UTF-32), TypeError will # (such as an XML document encoded in UTF-32), TypeError will
# be thrown. # be thrown.
pass pass
else:
return data
# treat url_file_stream_or_string as string # treat url_file_stream_or_string as bytes/string
if not isinstance(url_file_stream_or_string, bytes): return _to_in_memory_file(url_file_stream_or_string)
return url_file_stream_or_string.encode('utf-8')
return url_file_stream_or_string
def _to_in_memory_file(data):
if isinstance(data, str):
return io.StringIO(data)
else:
return io.BytesIO(data)
class LooseFeedParser(LooseXMLParser, XMLParserMixin, BaseHTMLProcessor): class LooseFeedParser(LooseXMLParser, XMLParserMixin, BaseHTMLProcessor):
pass pass
class StrictFeedParser(StrictXMLParser, XMLParserMixin, xml.sax.handler.ContentHandler): class StrictFeedParser(StrictXMLParser, XMLParserMixin, xml.sax.handler.ContentHandler):
pass pass
def parse( def parse(
url_file_stream_or_string, url_file_stream_or_string,
etag: str = None, response_headers: Optional[Dict[str, str]] = None,
modified: Union[str, datetime.datetime, time.struct_time] = None, resolve_relative_uris: Optional[bool] = None,
agent: str = None, sanitize_html: Optional[bool] = None,
referrer: str = None, optimistic_encoding_detection: Optional[bool] = None,
handlers: List = None,
request_headers: Dict[str, str] = None,
response_headers: Dict[str, str] = None,
resolve_relative_uris: bool = None,
sanitize_html: bool = None,
) -> FeedParserDict: ) -> FeedParserDict:
"""Parse a feed from a URL, file, stream, or string. """Parse a feed from a URL, file, stream, or string.
@ -174,20 +171,6 @@ def parse(
When a URL is not passed the feed location to use in relative URL When a URL is not passed the feed location to use in relative URL
resolution should be passed in the ``Content-Location`` response header resolution should be passed in the ``Content-Location`` response header
(see ``response_headers`` below). (see ``response_headers`` below).
:param etag:
HTTP ``ETag`` request header.
:param modified:
HTTP ``Last-Modified`` request header.
:param agent:
HTTP ``User-Agent`` request header, which defaults to
the value of :data:`feedparser.USER_AGENT`.
:param referrer:
HTTP ``Referer`` [sic] request header.
:param handlers:
A list of handlers that will be passed to urllib2.
:param request_headers:
A mapping of HTTP header name to HTTP header value to add to the
request, overriding internally generated values.
:param response_headers: :param response_headers:
A mapping of HTTP header name to HTTP header value. Multiple values may A mapping of HTTP header name to HTTP header value. Multiple values may
be joined with a comma. If a HTTP request was made, these headers be joined with a comma. If a HTTP request was made, these headers
@ -201,20 +184,14 @@ def parse(
Should feedparser skip HTML sanitization? Only disable this if you know Should feedparser skip HTML sanitization? Only disable this if you know
what you are doing! Defaults to the value of what you are doing! Defaults to the value of
:data:`feedparser.SANITIZE_HTML`, which is ``True``. :data:`feedparser.SANITIZE_HTML`, which is ``True``.
:param optimistic_encoding_detection:
Should feedparser use only a prefix of the feed to detect encodings
(uses less memory, but the wrong encoding may be detected in rare cases).
Defaults to the value of
:data:`feedparser.OPTIMISTIC_ENCODING_DETECTION`, which is ``True``.
""" """
# Avoid a cyclic import.
if not agent:
import feedparser
agent = feedparser.USER_AGENT
if sanitize_html is None:
import feedparser
sanitize_html = bool(feedparser.SANITIZE_HTML)
if resolve_relative_uris is None:
import feedparser
resolve_relative_uris = bool(feedparser.RESOLVE_RELATIVE_URIS)
result = FeedParserDict( result = FeedParserDict(
bozo=False, bozo=False,
entries=[], entries=[],
@ -223,50 +200,110 @@ def parse(
) )
try: try:
data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result) file = _open_resource(
url_file_stream_or_string,
result,
)
except urllib.error.URLError as error: except urllib.error.URLError as error:
result.update({ result.update(
'bozo': True, {
'bozo_exception': error, "bozo": True,
}) "bozo_exception": error,
}
)
return result return result
if not data: # at this point, the file is guaranteed to be seekable;
# we read 1 byte/character to see if it's empty and return early
# (this preserves the behavior in 6.0.8)
initial_file_offset = file.tell()
if not file.read(1):
return result return result
file.seek(initial_file_offset)
# overwrite existing headers using response_headers # overwrite existing headers using response_headers
result['headers'].update(response_headers or {}) result["headers"].update(response_headers or {})
data = convert_to_utf8(result['headers'], data, result) try:
use_json_parser = result['content-type'] == 'application/json' _parse_file_inplace(
use_strict_parser = result['encoding'] and True or False file,
result,
resolve_relative_uris=resolve_relative_uris,
sanitize_html=sanitize_html,
optimistic_encoding_detection=optimistic_encoding_detection,
)
finally:
if not hasattr(url_file_stream_or_string, "read"):
# the file does not come from the user, close it
file.close()
if not use_json_parser: return result
result['version'], data, entities = replace_doctype(data)
def _parse_file_inplace(
file: Union[IO[bytes], IO[str]],
result: dict,
*,
resolve_relative_uris: Optional[bool] = None,
sanitize_html: Optional[bool] = None,
optimistic_encoding_detection: Optional[bool] = None,
) -> None:
# Avoid a cyclic import.
import feedparser
if sanitize_html is None:
sanitize_html = bool(feedparser.SANITIZE_HTML)
if resolve_relative_uris is None:
resolve_relative_uris = bool(feedparser.RESOLVE_RELATIVE_URIS)
if optimistic_encoding_detection is None:
optimistic_encoding_detection = bool(feedparser.OPTIMISTIC_ENCODING_DETECTION)
stream_factory = convert_file_to_utf8(
result["headers"], file, result, optimistic_encoding_detection
)
# We're done with file, all access must happen through stream_factory.
del file
# Some notes about the stream_factory.get_{text,binary}_file() methods:
#
# Calling them a second time will raise io.UnsupportedOperation
# if the underlying file was not seekable.
#
# Calling close() on the returned file is ignored
# (that is, the underlying file is *not* closed),
# because the SAX parser closes the file when done;
# we don't want that, since we might try again with the loose parser.
use_json_parser = False
if result["content-type"] in {"application/json", "application/feed+json"}:
use_json_parser = True
use_strict_parser = bool(result["encoding"])
result["version"], stream_factory.prefix, entities = replace_doctype(
stream_factory.prefix
)
# Ensure that baseuri is an absolute URI using an acceptable URI scheme. # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
contentloc = result['headers'].get('content-location', '') contentloc = result["headers"].get("content-location", "")
href = result.get('href', '') href = result.get("href", "")
baseuri = make_safe_absolute_uri(href, contentloc) or make_safe_absolute_uri(contentloc) or href baseuri = (
make_safe_absolute_uri(href, contentloc)
or make_safe_absolute_uri(contentloc)
or href
)
baselang = result['headers'].get('content-language', None) baselang = result["headers"].get("content-language", None)
if isinstance(baselang, bytes) and baselang is not None: if isinstance(baselang, bytes) and baselang is not None:
baselang = baselang.decode('utf-8', 'ignore') baselang = baselang.decode("utf-8", "ignore")
if not _XML_AVAILABLE: if not _XML_AVAILABLE:
use_strict_parser = False use_strict_parser = False
feed_parser: Union[JSONParser, StrictFeedParser, LooseFeedParser] feed_parser: Union[JSONParser, StrictFeedParser, LooseFeedParser]
if use_json_parser:
result['version'] = None if use_strict_parser and not use_json_parser:
feed_parser = JSONParser(baseuri, baselang, 'utf-8')
try:
feed_parser.feed(data)
except Exception as e:
result['bozo'] = 1
result['bozo_exception'] = e
elif use_strict_parser:
# Initialize the SAX parser. # Initialize the SAX parser.
feed_parser = StrictFeedParser(baseuri, baselang, 'utf-8') feed_parser = StrictFeedParser(baseuri, baselang, "utf-8")
feed_parser.resolve_relative_uris = resolve_relative_uris feed_parser.resolve_relative_uris = resolve_relative_uris
feed_parser.sanitize_html = sanitize_html feed_parser.sanitize_html = sanitize_html
saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS) saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
@ -279,27 +316,62 @@ def parse(
saxparser.setContentHandler(feed_parser) saxparser.setContentHandler(feed_parser)
saxparser.setErrorHandler(feed_parser) saxparser.setErrorHandler(feed_parser)
source = xml.sax.xmlreader.InputSource() source = xml.sax.xmlreader.InputSource()
source.setByteStream(io.BytesIO(data))
# If an encoding was detected, decode the file on the fly;
# otherwise, pass it as-is and let the SAX parser deal with it.
try:
source.setCharacterStream(stream_factory.get_text_file())
except MissingEncoding:
source.setByteStream(stream_factory.get_binary_file())
try: try:
saxparser.parse(source) saxparser.parse(source)
except xml.sax.SAXException as e: except xml.sax.SAXException as e:
result['bozo'] = 1 result["bozo"] = 1
result['bozo_exception'] = feed_parser.exc or e result["bozo_exception"] = feed_parser.exc or e
use_strict_parser = False use_strict_parser = False
# The loose XML parser will be tried if the JSON parser was not used, # The loose XML parser will be tried if the strict XML parser was not used
# and if the strict XML parser was not used (or if it failed). # (or if it failed to parse the feed).
if not use_json_parser and not use_strict_parser: if not use_strict_parser and not use_json_parser:
feed_parser = LooseFeedParser(baseuri, baselang, 'utf-8', entities) feed_parser = LooseFeedParser(baseuri, baselang, "utf-8", entities)
feed_parser.resolve_relative_uris = resolve_relative_uris feed_parser.resolve_relative_uris = resolve_relative_uris
feed_parser.sanitize_html = sanitize_html feed_parser.sanitize_html = sanitize_html
feed_parser.feed(data.decode('utf-8', 'replace'))
result['feed'] = feed_parser.feeddata # If an encoding was detected, use it; otherwise, assume utf-8 and do your best.
result['entries'] = feed_parser.entries # Will raise io.UnsupportedOperation if the underlying file is not seekable.
result['version'] = result['version'] or feed_parser.version data = stream_factory.get_text_file("utf-8", "replace").read()
# As of 6.0.8, LooseFeedParser.feed() can be called exactly once
# with the entire data (it does some re.sub() and str.replace() on it).
#
# SGMLParser (of which LooseFeedParser is a subclass)
# *can* be fed in a streaming fashion,
# by calling feed() repeatedly with chunks of text.
#
# When/if LooseFeedParser will support being fed chunks,
# replace the read() call above with read(size)/feed() calls in a loop.
feed_parser.feed(data)
# If parsing with the loose XML parser resulted in no information,
# flag that the JSON parser should be tried.
if not (feed_parser.entries or feed_parser.feeddata or feed_parser.version):
use_json_parser = True
if use_json_parser:
result["version"] = None
feed_parser = JSONParser(baseuri, baselang, "utf-8")
try:
feed_parser.feed(stream_factory.get_file())
except Exception as e:
result["bozo"] = 1
result["bozo_exception"] = e
result["feed"] = feed_parser.feeddata
result["entries"] = feed_parser.entries
result["version"] = result["version"] or feed_parser.version
if isinstance(feed_parser, JSONParser): if isinstance(feed_parser, JSONParser):
result['namespaces'] = {} result["namespaces"] = {}
else: else:
result['namespaces'] = feed_parser.namespaces_in_use result["namespaces"] = feed_parser.namespaces_in_use
return result

View file

@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -27,11 +27,12 @@
from time import struct_time from time import struct_time
from typing import Callable, List, Optional from typing import Callable, List, Optional
from .asctime import _parse_date_asctime from .asctime import _parse_date_asctime
from .greek import _parse_date_greek from .greek import _parse_date_greek
from .hungarian import _parse_date_hungarian from .hungarian import _parse_date_hungarian
from .iso8601 import _parse_date_iso8601 from .iso8601 import _parse_date_iso8601
from .korean import _parse_date_onblog, _parse_date_nate from .korean import _parse_date_nate, _parse_date_onblog
from .perforce import _parse_date_perforce from .perforce import _parse_date_perforce
from .rfc822 import _parse_date_rfc822 from .rfc822 import _parse_date_rfc822
from .w3dtf import _parse_date_w3dtf from .w3dtf import _parse_date_w3dtf

View file

@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -28,18 +28,18 @@
from .rfc822 import _parse_date_rfc822 from .rfc822 import _parse_date_rfc822
_months = [ _months = [
'jan', "jan",
'feb', "feb",
'mar', "mar",
'apr', "apr",
'may', "may",
'jun', "jun",
'jul', "jul",
'aug', "aug",
'sep', "sep",
'oct', "oct",
'nov', "nov",
'dec', "dec",
] ]
@ -59,13 +59,22 @@ def _parse_date_asctime(dt):
# Insert a GMT timezone, if needed. # Insert a GMT timezone, if needed.
if len(parts) == 5: if len(parts) == 5:
parts.insert(4, '+0000') parts.insert(4, "+0000")
# Exit if there are not six parts. # Exit if there are not six parts.
if len(parts) != 6: if len(parts) != 6:
return None return None
# Reassemble the parts in an RFC822-compatible order and parse them. # Reassemble the parts in an RFC822-compatible order and parse them.
return _parse_date_rfc822(' '.join([ return _parse_date_rfc822(
parts[0], parts[2], parts[1], parts[5], parts[3], parts[4], " ".join(
])) [
parts[0],
parts[2],
parts[1],
parts[5],
parts[3],
parts[4],
]
)
)

View file

@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -31,38 +31,40 @@ from .rfc822 import _parse_date_rfc822
# Unicode strings for Greek date strings # Unicode strings for Greek date strings
_greek_months = { _greek_months = {
'\u0399\u03b1\u03bd': 'Jan', # c9e1ed in iso-8859-7 "\u0399\u03b1\u03bd": "Jan", # c9e1ed in iso-8859-7
'\u03a6\u03b5\u03b2': 'Feb', # d6e5e2 in iso-8859-7 "\u03a6\u03b5\u03b2": "Feb", # d6e5e2 in iso-8859-7
'\u039c\u03ac\u03ce': 'Mar', # ccdcfe in iso-8859-7 "\u039c\u03ac\u03ce": "Mar", # ccdcfe in iso-8859-7
'\u039c\u03b1\u03ce': 'Mar', # cce1fe in iso-8859-7 "\u039c\u03b1\u03ce": "Mar", # cce1fe in iso-8859-7
'\u0391\u03c0\u03c1': 'Apr', # c1f0f1 in iso-8859-7 "\u0391\u03c0\u03c1": "Apr", # c1f0f1 in iso-8859-7
'\u039c\u03ac\u03b9': 'May', # ccdce9 in iso-8859-7 "\u039c\u03ac\u03b9": "May", # ccdce9 in iso-8859-7
'\u039c\u03b1\u03ca': 'May', # cce1fa in iso-8859-7 "\u039c\u03b1\u03ca": "May", # cce1fa in iso-8859-7
'\u039c\u03b1\u03b9': 'May', # cce1e9 in iso-8859-7 "\u039c\u03b1\u03b9": "May", # cce1e9 in iso-8859-7
'\u0399\u03bf\u03cd\u03bd': 'Jun', # c9effded in iso-8859-7 "\u0399\u03bf\u03cd\u03bd": "Jun", # c9effded in iso-8859-7
'\u0399\u03bf\u03bd': 'Jun', # c9efed in iso-8859-7 "\u0399\u03bf\u03bd": "Jun", # c9efed in iso-8859-7
'\u0399\u03bf\u03cd\u03bb': 'Jul', # c9effdeb in iso-8859-7 "\u0399\u03bf\u03cd\u03bb": "Jul", # c9effdeb in iso-8859-7
'\u0399\u03bf\u03bb': 'Jul', # c9f9eb in iso-8859-7 "\u0399\u03bf\u03bb": "Jul", # c9f9eb in iso-8859-7
'\u0391\u03cd\u03b3': 'Aug', # c1fde3 in iso-8859-7 "\u0391\u03cd\u03b3": "Aug", # c1fde3 in iso-8859-7
'\u0391\u03c5\u03b3': 'Aug', # c1f5e3 in iso-8859-7 "\u0391\u03c5\u03b3": "Aug", # c1f5e3 in iso-8859-7
'\u03a3\u03b5\u03c0': 'Sep', # d3e5f0 in iso-8859-7 "\u03a3\u03b5\u03c0": "Sep", # d3e5f0 in iso-8859-7
'\u039f\u03ba\u03c4': 'Oct', # cfeaf4 in iso-8859-7 "\u039f\u03ba\u03c4": "Oct", # cfeaf4 in iso-8859-7
'\u039d\u03bf\u03ad': 'Nov', # cdefdd in iso-8859-7 "\u039d\u03bf\u03ad": "Nov", # cdefdd in iso-8859-7
'\u039d\u03bf\u03b5': 'Nov', # cdefe5 in iso-8859-7 "\u039d\u03bf\u03b5": "Nov", # cdefe5 in iso-8859-7
'\u0394\u03b5\u03ba': 'Dec', # c4e5ea in iso-8859-7 "\u0394\u03b5\u03ba": "Dec", # c4e5ea in iso-8859-7
} }
_greek_wdays = { _greek_wdays = {
'\u039a\u03c5\u03c1': 'Sun', # caf5f1 in iso-8859-7 "\u039a\u03c5\u03c1": "Sun", # caf5f1 in iso-8859-7
'\u0394\u03b5\u03c5': 'Mon', # c4e5f5 in iso-8859-7 "\u0394\u03b5\u03c5": "Mon", # c4e5f5 in iso-8859-7
'\u03a4\u03c1\u03b9': 'Tue', # d4f1e9 in iso-8859-7 "\u03a4\u03c1\u03b9": "Tue", # d4f1e9 in iso-8859-7
'\u03a4\u03b5\u03c4': 'Wed', # d4e5f4 in iso-8859-7 "\u03a4\u03b5\u03c4": "Wed", # d4e5f4 in iso-8859-7
'\u03a0\u03b5\u03bc': 'Thu', # d0e5ec in iso-8859-7 "\u03a0\u03b5\u03bc": "Thu", # d0e5ec in iso-8859-7
'\u03a0\u03b1\u03c1': 'Fri', # d0e1f1 in iso-8859-7 "\u03a0\u03b1\u03c1": "Fri", # d0e1f1 in iso-8859-7
'\u03a3\u03b1\u03b2': 'Sat', # d3e1e2 in iso-8859-7 "\u03a3\u03b1\u03b2": "Sat", # d3e1e2 in iso-8859-7
} }
_greek_date_format_re = re.compile(r'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)') _greek_date_format_re = re.compile(
r"([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)"
)
def _parse_date_greek(date_string): def _parse_date_greek(date_string):
@ -72,15 +74,17 @@ def _parse_date_greek(date_string):
return return
wday = _greek_wdays[m.group(1)] wday = _greek_wdays[m.group(1)]
month = _greek_months[m.group(3)] month = _greek_months[m.group(3)]
rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \ rfc822date = (
{ "%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(offset)s"
'wday': wday, % {
'day': m.group(2), "wday": wday,
'month': month, "day": m.group(2),
'year': m.group(4), "month": month,
'hour': m.group(5), "year": m.group(4),
'minute': m.group(6), "hour": m.group(5),
'second': m.group(7), "minute": m.group(6),
'zonediff': m.group(8), "second": m.group(7),
"offset": m.group(8),
} }
)
return _parse_date_rfc822(rfc822date) return _parse_date_rfc822(rfc822date)

View file

@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -31,21 +31,23 @@ from .w3dtf import _parse_date_w3dtf
# Unicode strings for Hungarian date strings # Unicode strings for Hungarian date strings
_hungarian_months = { _hungarian_months = {
'janu\u00e1r': '01', # e1 in iso-8859-2 "janu\u00e1r": "01", # e1 in iso-8859-2
'febru\u00e1ri': '02', # e1 in iso-8859-2 "febru\u00e1ri": "02", # e1 in iso-8859-2
'm\u00e1rcius': '03', # e1 in iso-8859-2 "m\u00e1rcius": "03", # e1 in iso-8859-2
'\u00e1prilis': '04', # e1 in iso-8859-2 "\u00e1prilis": "04", # e1 in iso-8859-2
'm\u00e1ujus': '05', # e1 in iso-8859-2 "m\u00e1ujus": "05", # e1 in iso-8859-2
'j\u00fanius': '06', # fa in iso-8859-2 "j\u00fanius": "06", # fa in iso-8859-2
'j\u00falius': '07', # fa in iso-8859-2 "j\u00falius": "07", # fa in iso-8859-2
'augusztus': '08', "augusztus": "08",
'szeptember': '09', "szeptember": "09",
'okt\u00f3ber': '10', # f3 in iso-8859-2 "okt\u00f3ber": "10", # f3 in iso-8859-2
'november': '11', "november": "11",
'december': '12', "december": "12",
} }
_hungarian_date_format_re = re.compile(r'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})([+-](\d{,2}:\d{2}))') _hungarian_date_format_re = re.compile(
r"(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})([+-](\d{,2}:\d{2}))"
)
def _parse_date_hungarian(date_string): def _parse_date_hungarian(date_string):
@ -56,17 +58,9 @@ def _parse_date_hungarian(date_string):
month = _hungarian_months[m.group(2)] month = _hungarian_months[m.group(2)]
day = m.group(3) day = m.group(3)
if len(day) == 1: if len(day) == 1:
day = '0' + day day = "0" + day
hour = m.group(4) hour = m.group(4)
if len(hour) == 1: if len(hour) == 1:
hour = '0' + hour hour = "0" + hour
w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \ w3dtfdate = f"{m.group(1)}-{month}-{day}T{hour}:{m.group(5)}{m.group(6)}"
{
'year': m.group(1),
'month': month,
'day': day,
'hour': hour,
'minute': m.group(5),
'zonediff': m.group(6),
}
return _parse_date_w3dtf(w3dtfdate) return _parse_date_w3dtf(w3dtfdate)

View file

@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -38,36 +38,36 @@ import time
# Please note the order in templates is significant because we need a # Please note the order in templates is significant because we need a
# greedy match. # greedy match.
_iso8601_tmpl = [ _iso8601_tmpl = [
'YYYY-?MM-?DD', "YYYY-?MM-?DD",
'YYYY-0MM?-?DD', "YYYY-0MM?-?DD",
'YYYY-MM', "YYYY-MM",
'YYYY-?OOO', "YYYY-?OOO",
'YY-?MM-?DD', "YY-?MM-?DD",
'YY-?OOO', "YY-?OOO",
'YYYY', "YYYY",
'-YY-?MM', "-YY-?MM",
'-OOO', "-OOO",
'-YY', "-YY",
'--MM-?DD', "--MM-?DD",
'--MM', "--MM",
'---DD', "---DD",
'CC', "CC",
'', "",
] ]
_iso8601_re = [ _iso8601_re = [
tmpl.replace( tmpl.replace("YYYY", r"(?P<year>\d{4})")
'YYYY', r'(?P<year>\d{4})').replace( .replace("YY", r"(?P<year>\d\d)")
'YY', r'(?P<year>\d\d)').replace( .replace("MM", r"(?P<month>[01]\d)")
'MM', r'(?P<month>[01]\d)').replace( .replace("DD", r"(?P<day>[0123]\d)")
'DD', r'(?P<day>[0123]\d)').replace( .replace("OOO", r"(?P<ordinal>[0123]\d\d)")
'OOO', r'(?P<ordinal>[0123]\d\d)').replace( .replace("CC", r"(?P<century>\d\d$)")
'CC', r'(?P<century>\d\d$)') + r"(T?(?P<hour>\d{2}):(?P<minute>\d{2})"
+ r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})' + r"(:(?P<second>\d{2}))?"
+ r'(:(?P<second>\d{2}))?' + r"(\.(?P<fracsecond>\d+))?"
+ r'(\.(?P<fracsecond>\d+))?' + r"(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?"
+ r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?' for tmpl in _iso8601_tmpl
for tmpl in _iso8601_tmpl] ]
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re] _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
@ -83,21 +83,21 @@ def _parse_date_iso8601(date_string):
if m.span() == (0, 0): if m.span() == (0, 0):
return return
params = m.groupdict() params = m.groupdict()
ordinal = params.get('ordinal', 0) ordinal = params.get("ordinal", 0)
if ordinal: if ordinal:
ordinal = int(ordinal) ordinal = int(ordinal)
else: else:
ordinal = 0 ordinal = 0
year = params.get('year', '--') year = params.get("year", "--")
if not year or year == '--': if not year or year == "--":
year = time.gmtime()[0] year = time.gmtime()[0]
elif len(year) == 2: elif len(year) == 2:
# ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993 # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
year = 100 * int(time.gmtime()[0] / 100) + int(year) year = 100 * int(time.gmtime()[0] / 100) + int(year)
else: else:
year = int(year) year = int(year)
month = params.get('month', '-') month = params.get("month", "-")
if not month or month == '-': if not month or month == "-":
# ordinals are NOT normalized by mktime, we simulate them # ordinals are NOT normalized by mktime, we simulate them
# by setting month=1, day=ordinal # by setting month=1, day=ordinal
if ordinal: if ordinal:
@ -105,13 +105,14 @@ def _parse_date_iso8601(date_string):
else: else:
month = time.gmtime()[1] month = time.gmtime()[1]
month = int(month) month = int(month)
day = params.get('day', 0) day = params.get("day", 0)
if not day: if not day:
# see above # see above
if ordinal: if ordinal:
day = ordinal day = ordinal
elif params.get('century', 0) or \ elif (
params.get('year', 0) or params.get('month', 0): params.get("century", 0) or params.get("year", 0) or params.get("month", 0)
):
day = 1 day = 1
else: else:
day = time.gmtime()[2] day = time.gmtime()[2]
@ -119,29 +120,38 @@ def _parse_date_iso8601(date_string):
day = int(day) day = int(day)
# special case of the century - is the first year of the 21st century # special case of the century - is the first year of the 21st century
# 2000 or 2001 ? The debate goes on... # 2000 or 2001 ? The debate goes on...
if 'century' in params: if "century" in params:
year = (int(params['century']) - 1) * 100 + 1 year = (int(params["century"]) - 1) * 100 + 1
# in ISO 8601 most fields are optional # in ISO 8601 most fields are optional
for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']: for field in ["hour", "minute", "second", "tzhour", "tzmin"]:
if not params.get(field, None): if not params.get(field, None):
params[field] = 0 params[field] = 0
hour = int(params.get('hour', 0)) hour = int(params.get("hour", 0))
minute = int(params.get('minute', 0)) minute = int(params.get("minute", 0))
second = int(float(params.get('second', 0))) second = int(float(params.get("second", 0)))
# weekday is normalized by mktime(), we can ignore it # weekday is normalized by mktime(), we can ignore it
weekday = 0 weekday = 0
daylight_savings_flag = -1 daylight_savings_flag = -1
tm = [year, month, day, hour, minute, second, weekday, tm = [
ordinal, daylight_savings_flag] year,
month,
day,
hour,
minute,
second,
weekday,
ordinal,
daylight_savings_flag,
]
# ISO 8601 time zone adjustments # ISO 8601 time zone adjustments
tz = params.get('tz') tz = params.get("tz")
if tz and tz != 'Z': if tz and tz != "Z":
if tz[0] == '-': if tz[0] == "-":
tm[3] += int(params.get('tzhour', 0)) tm[3] += int(params.get("tzhour", 0))
tm[4] += int(params.get('tzmin', 0)) tm[4] += int(params.get("tzmin", 0))
elif tz[0] == '+': elif tz[0] == "+":
tm[3] -= int(params.get('tzhour', 0)) tm[3] -= int(params.get("tzhour", 0))
tm[4] -= int(params.get('tzmin', 0)) tm[4] -= int(params.get("tzmin", 0))
else: else:
return None return None
# Python's time.mktime() is a wrapper around the ANSI C mktime(3c) # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)

View file

@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -30,20 +30,21 @@ import re
from .w3dtf import _parse_date_w3dtf from .w3dtf import _parse_date_w3dtf
# 8-bit date handling routines written by ytrewq1. # 8-bit date handling routines written by ytrewq1.
_korean_year = '\ub144' # b3e2 in euc-kr _korean_year = "\ub144" # b3e2 in euc-kr
_korean_month = '\uc6d4' # bff9 in euc-kr _korean_month = "\uc6d4" # bff9 in euc-kr
_korean_day = '\uc77c' # c0cf in euc-kr _korean_day = "\uc77c" # c0cf in euc-kr
_korean_am = '\uc624\uc804' # bfc0 c0fc in euc-kr _korean_am = "\uc624\uc804" # bfc0 c0fc in euc-kr
_korean_pm = '\uc624\ud6c4' # bfc0 c8c4 in euc-kr _korean_pm = "\uc624\ud6c4" # bfc0 c8c4 in euc-kr
_korean_onblog_date_re = re.compile( _korean_onblog_date_re = re.compile(
r'(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' r"(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})"
% (_korean_year, _korean_month, _korean_day) % (_korean_year, _korean_month, _korean_day)
) )
_korean_nate_date_re = re.compile( _korean_nate_date_re = re.compile(
r'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' r"(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})"
% (_korean_am, _korean_pm)) % (_korean_am, _korean_pm)
)
def _parse_date_onblog(dateString): def _parse_date_onblog(dateString):
@ -51,10 +52,18 @@ def _parse_date_onblog(dateString):
m = _korean_onblog_date_re.match(dateString) m = _korean_onblog_date_re.match(dateString)
if not m: if not m:
return return
w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ w3dtfdate = (
{'year': m.group(1), 'month': m.group(2), 'day': m.group(3), "%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s"
'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6), % {
'zonediff': '+09:00'} "year": m.group(1),
"month": m.group(2),
"day": m.group(3),
"hour": m.group(4),
"minute": m.group(5),
"second": m.group(6),
"zonediff": "+09:00",
}
)
return _parse_date_w3dtf(w3dtfdate) return _parse_date_w3dtf(w3dtfdate)
@ -69,15 +78,17 @@ def _parse_date_nate(dateString):
hour += 12 hour += 12
hour = str(hour) hour = str(hour)
if len(hour) == 1: if len(hour) == 1:
hour = '0' + hour hour = "0" + hour
w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ w3dtfdate = (
{ "%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s"
'year': m.group(1), % {
'month': m.group(2), "year": m.group(1),
'day': m.group(3), "month": m.group(2),
'hour': hour, "day": m.group(3),
'minute': m.group(6), "hour": hour,
'second': m.group(7), "minute": m.group(6),
'zonediff': '+09:00', "second": m.group(7),
"zonediff": "+09:00",
} }
)
return _parse_date_w3dtf(w3dtfdate) return _parse_date_w3dtf(w3dtfdate)

View file

@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -33,14 +33,31 @@ import time
def _parse_date_perforce(date_string): def _parse_date_perforce(date_string):
"""parse a date in yyyy/mm/dd hh:mm:ss TTT format""" """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
# Fri, 2006/09/15 08:19:53 EDT # Fri, 2006/09/15 08:19:53 EDT
_my_date_pattern = re.compile(r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})') _my_date_pattern = re.compile(
r"(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})"
)
m = _my_date_pattern.search(date_string) m = _my_date_pattern.search(date_string)
if m is None: if m is None:
return None return None
dow, year, month, day, hour, minute, second, tz = m.groups() dow, year, month, day, hour, minute, second, tz = m.groups()
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] months = [
new_date_string = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz) "Jan",
"Feb",
"Mar",
"Apr",
"May",
"Jun",
"Jul",
"Aug",
"Sep",
"Oct",
"Nov",
"Dec",
]
new_date_string = (
f"{dow}, {day} {months[int(month) - 1]} {year} {hour}:{minute}:{second} {tz}"
)
tm = email.utils.parsedate_tz(new_date_string) tm = email.utils.parsedate_tz(new_date_string)
if tm: if tm:
return time.gmtime(email.utils.mktime_tz(tm)) return time.gmtime(email.utils.mktime_tz(tm))

View file

@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -28,20 +28,45 @@
import datetime import datetime
timezone_names = { timezone_names = {
'ut': 0, 'gmt': 0, 'z': 0, "ut": 0,
'adt': -3, 'ast': -4, 'at': -4, "gmt": 0,
'edt': -4, 'est': -5, 'et': -5, "z": 0,
'cdt': -5, 'cst': -6, 'ct': -6, "adt": -3,
'mdt': -6, 'mst': -7, 'mt': -7, "ast": -4,
'pdt': -7, 'pst': -8, 'pt': -8, "at": -4,
'a': -1, 'n': 1, "edt": -4,
'm': -12, 'y': 12, "est": -5,
'met': 1, 'mest': 2, "et": -5,
"cdt": -5,
"cst": -6,
"ct": -6,
"mdt": -6,
"mst": -7,
"mt": -7,
"pdt": -7,
"pst": -8,
"pt": -8,
"a": -1,
"n": 1,
"m": -12,
"y": 12,
"met": 1,
"mest": 2,
} }
day_names = {'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'} day_names = {"mon", "tue", "wed", "thu", "fri", "sat", "sun"}
months = { months = {
'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, "jan": 1,
'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12, "feb": 2,
"mar": 3,
"apr": 4,
"may": 5,
"jun": 6,
"jul": 7,
"aug": 8,
"sep": 9,
"oct": 10,
"nov": 11,
"dec": 12,
} }
@ -63,7 +88,7 @@ def _parse_date_rfc822(date):
parts = date.lower().split() parts = date.lower().split()
if len(parts) < 5: if len(parts) < 5:
# Assume that the time and timezone are missing # Assume that the time and timezone are missing
parts.extend(('00:00:00', '0000')) parts.extend(("00:00:00", "0000"))
# Remove the day name # Remove the day name
if parts[0][:3] in day_names: if parts[0][:3] in day_names:
parts = parts[1:] parts = parts[1:]
@ -101,26 +126,26 @@ def _parse_date_rfc822(date):
year += (1900, 2000)[year < 90] year += (1900, 2000)[year < 90]
# Handle the time (default to 00:00:00). # Handle the time (default to 00:00:00).
time_parts = parts[3].split(':') time_parts = parts[3].split(":")
time_parts.extend(('0',) * (3 - len(time_parts))) time_parts.extend(("0",) * (3 - len(time_parts)))
try: try:
(hour, minute, second) = [int(i) for i in time_parts] (hour, minute, second) = (int(i) for i in time_parts)
except ValueError: except ValueError:
return None return None
# Handle the timezone information, if any (default to +0000). # Handle the timezone information, if any (default to +0000).
# Strip 'Etc/' from the timezone. # Strip 'Etc/' from the timezone.
if parts[4].startswith('etc/'): if parts[4].startswith("etc/"):
parts[4] = parts[4][4:] parts[4] = parts[4][4:]
# Normalize timezones that start with 'gmt': # Normalize timezones that start with 'gmt':
# GMT-05:00 => -0500 # GMT-05:00 => -0500
# GMT => GMT # GMT => GMT
if parts[4].startswith('gmt'): if parts[4].startswith("gmt"):
parts[4] = ''.join(parts[4][3:].split(':')) or 'gmt' parts[4] = "".join(parts[4][3:].split(":")) or "gmt"
# Handle timezones like '-0500', '+0500', and 'EST' # Handle timezones like '-0500', '+0500', and 'EST'
if parts[4] and parts[4][0] in ('-', '+'): if parts[4] and parts[4][0] in ("-", "+"):
try: try:
if ':' in parts[4]: if ":" in parts[4]:
timezone_hours = int(parts[4][1:3]) timezone_hours = int(parts[4][1:3])
timezone_minutes = int(parts[4][4:]) timezone_minutes = int(parts[4][4:])
else: else:
@ -128,7 +153,7 @@ def _parse_date_rfc822(date):
timezone_minutes = int(parts[4][3:]) timezone_minutes = int(parts[4][3:])
except ValueError: except ValueError:
return None return None
if parts[4].startswith('-'): if parts[4].startswith("-"):
timezone_hours *= -1 timezone_hours *= -1
timezone_minutes *= -1 timezone_minutes *= -1
else: else:

View file

@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -28,14 +28,28 @@
import datetime import datetime
timezonenames = { timezonenames = {
'ut': 0, 'gmt': 0, 'z': 0, "ut": 0,
'adt': -3, 'ast': -4, 'at': -4, "gmt": 0,
'edt': -4, 'est': -5, 'et': -5, "z": 0,
'cdt': -5, 'cst': -6, 'ct': -6, "adt": -3,
'mdt': -6, 'mst': -7, 'mt': -7, "ast": -4,
'pdt': -7, 'pst': -8, 'pt': -8, "at": -4,
'a': -1, 'n': 1, "edt": -4,
'm': -12, 'y': 12, "est": -5,
"et": -5,
"cdt": -5,
"cst": -6,
"ct": -6,
"mdt": -6,
"mst": -7,
"mt": -7,
"pdt": -7,
"pst": -8,
"pt": -8,
"a": -1,
"n": 1,
"m": -12,
"y": 12,
} }
# W3 date and time format parser # W3 date and time format parser
# http://www.w3.org/TR/NOTE-datetime # http://www.w3.org/TR/NOTE-datetime
@ -47,57 +61,57 @@ timezonenames = {
def _parse_date_w3dtf(datestr): def _parse_date_w3dtf(datestr):
if not datestr.strip(): if not datestr.strip():
return None return None
parts = datestr.lower().split('t') parts = datestr.lower().split("t")
if len(parts) == 1: if len(parts) == 1:
# This may be a date only, or may be an MSSQL-style date # This may be a date only, or may be an MSSQL-style date
parts = parts[0].split() parts = parts[0].split()
if len(parts) == 1: if len(parts) == 1:
# Treat this as a date only # Treat this as a date only
parts.append('00:00:00z') parts.append("00:00:00z")
elif len(parts) > 2: elif len(parts) > 2:
return None return None
date = parts[0].split('-', 2) date = parts[0].split("-", 2)
if not date or len(date[0]) != 4: if not date or len(date[0]) != 4:
return None return None
# Ensure that `date` has 3 elements. Using '1' sets the default # Ensure that `date` has 3 elements. Using '1' sets the default
# month to January and the default day to the 1st of the month. # month to January and the default day to the 1st of the month.
date.extend(['1'] * (3 - len(date))) date.extend(["1"] * (3 - len(date)))
try: try:
year, month, day = [int(i) for i in date] year, month, day = (int(i) for i in date)
except ValueError: except ValueError:
# `date` may have more than 3 elements or may contain # `date` may have more than 3 elements or may contain
# non-integer strings. # non-integer strings.
return None return None
if parts[1].endswith('z'): if parts[1].endswith("z"):
parts[1] = parts[1][:-1] parts[1] = parts[1][:-1]
parts.append('z') parts.append("z")
# Append the numeric timezone offset, if any, to parts. # Append the numeric timezone offset, if any, to parts.
# If this is an MSSQL-style date then parts[2] already contains # If this is an MSSQL-style date then parts[2] already contains
# the timezone information, so `append()` will not affect it. # the timezone information, so `append()` will not affect it.
# Add 1 to each value so that if `find()` returns -1 it will be # Add 1 to each value so that if `find()` returns -1 it will be
# treated as False. # treated as False.
loc = parts[1].find('-') + 1 or parts[1].find('+') + 1 or len(parts[1]) + 1 loc = parts[1].find("-") + 1 or parts[1].find("+") + 1 or len(parts[1]) + 1
loc = loc - 1 loc = loc - 1
parts.append(parts[1][loc:]) parts.append(parts[1][loc:])
parts[1] = parts[1][:loc] parts[1] = parts[1][:loc]
time = parts[1].split(':', 2) time = parts[1].split(":", 2)
# Ensure that time has 3 elements. Using '0' means that the # Ensure that time has 3 elements. Using '0' means that the
# minutes and seconds, if missing, will default to 0. # minutes and seconds, if missing, will default to 0.
time.extend(['0'] * (3 - len(time))) time.extend(["0"] * (3 - len(time)))
if parts[2][:1] in ('-', '+'): if parts[2][:1] in ("-", "+"):
try: try:
tzhour = int(parts[2][1:3]) tzhour = int(parts[2][1:3])
tzmin = int(parts[2][4:]) tzmin = int(parts[2][4:])
except ValueError: except ValueError:
return None return None
if parts[2].startswith('-'): if parts[2].startswith("-"):
tzhour = tzhour * -1 tzhour = tzhour * -1
tzmin = tzmin * -1 tzmin = tzmin * -1
else: else:
tzhour = timezonenames.get(parts[2], 0) tzhour = timezonenames.get(parts[2], 0)
tzmin = 0 tzmin = 0
try: try:
hour, minute, second = [int(float(i)) for i in time] hour, minute, second = (int(float(i)) for i in time)
except ValueError: except ValueError:
return None return None
# Create the datetime object and timezone delta objects # Create the datetime object and timezone delta objects

View file

@ -1,5 +1,5 @@
# Character encoding routines # Character encoding routines
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -26,9 +26,12 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE. # POSSIBILITY OF SUCH DAMAGE.
from __future__ import annotations
import codecs import codecs
import io
import re import re
import typing as t import typing
try: try:
try: try:
@ -38,36 +41,38 @@ try:
except ImportError: except ImportError:
lazy_chardet_encoding = None lazy_chardet_encoding = None
else: else:
def lazy_chardet_encoding(data): def lazy_chardet_encoding(data):
return chardet.detect(data)['encoding'] or '' return chardet.detect(data)["encoding"] or ""
from .exceptions import ( from .exceptions import (
CharacterEncodingOverride, CharacterEncodingOverride,
CharacterEncodingUnknown, CharacterEncodingUnknown,
FeedparserError,
NonXMLContentType, NonXMLContentType,
) )
# Each marker represents some of the characters of the opening XML # Each marker represents some of the characters of the opening XML
# processing instruction ('<?xm') in the specified encoding. # processing instruction ('<?xm') in the specified encoding.
EBCDIC_MARKER = b'\x4C\x6F\xA7\x94' EBCDIC_MARKER = b"\x4C\x6F\xA7\x94"
UTF16BE_MARKER = b'\x00\x3C\x00\x3F' UTF16BE_MARKER = b"\x00\x3C\x00\x3F"
UTF16LE_MARKER = b'\x3C\x00\x3F\x00' UTF16LE_MARKER = b"\x3C\x00\x3F\x00"
UTF32BE_MARKER = b'\x00\x00\x00\x3C' UTF32BE_MARKER = b"\x00\x00\x00\x3C"
UTF32LE_MARKER = b'\x3C\x00\x00\x00' UTF32LE_MARKER = b"\x3C\x00\x00\x00"
ZERO_BYTES = '\x00\x00' ZERO_BYTES = b"\x00\x00"
# Match the opening XML declaration. # Match the opening XML declaration.
# Example: <?xml version="1.0" encoding="utf-8"?> # Example: <?xml version="1.0" encoding="utf-8"?>
RE_XML_DECLARATION = re.compile(r'^<\?xml[^>]*?>') RE_XML_DECLARATION = re.compile(r"^<\?xml[^>]*?>")
# Capture the value of the XML processing instruction's encoding attribute. # Capture the value of the XML processing instruction's encoding attribute.
# Example: <?xml version="1.0" encoding="utf-8"?> # Example: <?xml version="1.0" encoding="utf-8"?>
RE_XML_PI_ENCODING = re.compile(br'^<\?.*encoding=[\'"](.*?)[\'"].*\?>') RE_XML_PI_ENCODING = re.compile(rb'^<\?.*encoding=[\'"](.*?)[\'"].*\?>')
def parse_content_type(line: str) -> t.Tuple[str, str]: def parse_content_type(line: str) -> tuple[str, str]:
"""Parse an HTTP Content-Type header. """Parse an HTTP Content-Type header.
The return value will be a tuple of strings: The return value will be a tuple of strings:
@ -91,11 +96,10 @@ def parse_content_type(line: str) -> t.Tuple[str, str]:
return mime_type, charset_value return mime_type, charset_value
def convert_to_utf8(http_headers, data, result): def convert_to_utf8(
"""Detect and convert the character encoding to UTF-8. http_headers: dict[str, str], data: bytes, result: dict[str, typing.Any]
) -> bytes:
http_headers is a dictionary """Detect and convert the character encoding to UTF-8."""
data is a raw string (not Unicode)"""
# This is so much trickier than it sounds, it's not even funny. # This is so much trickier than it sounds, it's not even funny.
# According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
@ -134,12 +138,10 @@ def convert_to_utf8(http_headers, data, result):
# Of course, none of this guarantees that we will be able to parse the # Of course, none of this guarantees that we will be able to parse the
# feed in the declared character encoding (assuming it was declared # feed in the declared character encoding (assuming it was declared
# correctly, which many are not). iconv_codec can help a lot; # correctly, which many are not).
# you should definitely install it if you can.
# http://cjkpython.i18n.org/
bom_encoding = '' bom_encoding = ""
xml_encoding = '' xml_encoding = ""
# Look at the first few bytes of the document to guess what # Look at the first few bytes of the document to guess what
# its encoding may be. We only need to decode enough of the # its encoding may be. We only need to decode enough of the
@ -149,50 +151,63 @@ def convert_to_utf8(http_headers, data, result):
# http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
# Check for BOMs first. # Check for BOMs first.
if data[:4] == codecs.BOM_UTF32_BE: if data[:4] == codecs.BOM_UTF32_BE:
bom_encoding = 'utf-32be' bom_encoding = "utf-32be"
data = data[4:] data = data[4:]
elif data[:4] == codecs.BOM_UTF32_LE: elif data[:4] == codecs.BOM_UTF32_LE:
bom_encoding = 'utf-32le' bom_encoding = "utf-32le"
data = data[4:] data = data[4:]
elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES: elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
bom_encoding = 'utf-16be' bom_encoding = "utf-16be"
data = data[2:] data = data[2:]
elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES: elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
bom_encoding = 'utf-16le' bom_encoding = "utf-16le"
data = data[2:] data = data[2:]
elif data[:3] == codecs.BOM_UTF8: elif data[:3] == codecs.BOM_UTF8:
bom_encoding = 'utf-8' bom_encoding = "utf-8"
data = data[3:] data = data[3:]
# Check for the characters '<?xm' in several encodings. # Check for the characters '<?xm' in several encodings.
elif data[:4] == EBCDIC_MARKER: elif data[:4] == EBCDIC_MARKER:
bom_encoding = 'cp037' bom_encoding = "cp037"
elif data[:4] == UTF16BE_MARKER: elif data[:4] == UTF16BE_MARKER:
bom_encoding = 'utf-16be' bom_encoding = "utf-16be"
elif data[:4] == UTF16LE_MARKER: elif data[:4] == UTF16LE_MARKER:
bom_encoding = 'utf-16le' bom_encoding = "utf-16le"
elif data[:4] == UTF32BE_MARKER: elif data[:4] == UTF32BE_MARKER:
bom_encoding = 'utf-32be' bom_encoding = "utf-32be"
elif data[:4] == UTF32LE_MARKER: elif data[:4] == UTF32LE_MARKER:
bom_encoding = 'utf-32le' bom_encoding = "utf-32le"
tempdata = data tempdata = data
try: try:
if bom_encoding: if bom_encoding:
tempdata = data.decode(bom_encoding).encode('utf-8') tempdata = data.decode(bom_encoding).encode("utf-8")
except UnicodeDecodeError: except UnicodeDecodeError:
xml_encoding_match = None xml_encoding_match = None
else: else:
xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata) xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
if xml_encoding_match: if xml_encoding_match:
xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower() xml_encoding = xml_encoding_match.groups()[0].decode("utf-8").lower()
# Normalize the xml_encoding if necessary. # Normalize the xml_encoding if necessary.
if bom_encoding and (xml_encoding in ( if bom_encoding and (
'u16', 'utf-16', 'utf16', 'utf_16', xml_encoding
'u32', 'utf-32', 'utf32', 'utf_32', in (
'iso-10646-ucs-2', 'iso-10646-ucs-4', "u16",
'csucs4', 'csunicode', 'ucs-2', 'ucs-4' "utf-16",
)): "utf16",
"utf_16",
"u32",
"utf-32",
"utf32",
"utf_32",
"iso-10646-ucs-2",
"iso-10646-ucs-4",
"csucs4",
"csunicode",
"ucs-2",
"ucs-4",
)
):
xml_encoding = bom_encoding xml_encoding = bom_encoding
# Find the HTTP Content-Type and, hopefully, a character # Find the HTTP Content-Type and, hopefully, a character
@ -200,115 +215,436 @@ def convert_to_utf8(http_headers, data, result):
# to choose the "correct" encoding among the BOM encoding, # to choose the "correct" encoding among the BOM encoding,
# XML declaration encoding, and HTTP encoding, following the # XML declaration encoding, and HTTP encoding, following the
# heuristic defined in RFC 3023. # heuristic defined in RFC 3023.
http_content_type = http_headers.get('content-type') or '' http_content_type = http_headers.get("content-type") or ""
http_content_type, http_encoding = parse_content_type(http_content_type) http_content_type, http_encoding = parse_content_type(http_content_type)
acceptable_content_type = 0 acceptable_content_type = 0
application_content_types = ('application/xml', 'application/xml-dtd', application_content_types = (
'application/xml-external-parsed-entity') "application/xml",
text_content_types = ('text/xml', 'text/xml-external-parsed-entity') "application/xml-dtd",
json_content_types = ('application/feed+json', 'application/json') "application/xml-external-parsed-entity",
)
text_content_types = ("text/xml", "text/xml-external-parsed-entity")
json_content_types = ("application/feed+json", "application/json")
json = False json = False
if ( if http_content_type in application_content_types or (
http_content_type in application_content_types http_content_type.startswith("application/")
or ( and http_content_type.endswith("+xml")
http_content_type.startswith('application/')
and http_content_type.endswith('+xml')
)
): ):
acceptable_content_type = 1 acceptable_content_type = 1
rfc3023_encoding = http_encoding or xml_encoding or 'utf-8' rfc3023_encoding = http_encoding or xml_encoding or "utf-8"
elif ( elif http_content_type in text_content_types or (
http_content_type in text_content_types http_content_type.startswith("text/") and http_content_type.endswith("+xml")
or (
http_content_type.startswith('text/')
and http_content_type.endswith('+xml')
)
): ):
acceptable_content_type = 1 acceptable_content_type = 1
rfc3023_encoding = http_encoding or 'us-ascii' rfc3023_encoding = http_encoding or "us-ascii"
elif ( elif http_content_type in json_content_types or (
http_content_type in json_content_types not http_content_type and data and data.lstrip().startswith(b"{")
or (
not http_content_type
and data and data.lstrip()[0] == '{'
)
): ):
http_content_type = json_content_types[0] http_content_type = json_content_types[0]
acceptable_content_type = 1 acceptable_content_type = 1
json = True json = True
rfc3023_encoding = http_encoding or 'utf-8' # RFC 7159, 8.1. rfc3023_encoding = http_encoding or "utf-8" # RFC 7159, 8.1.
elif http_content_type.startswith('text/'): elif http_content_type.startswith("text/"):
rfc3023_encoding = http_encoding or 'us-ascii' rfc3023_encoding = http_encoding or "us-ascii"
elif http_headers and 'content-type' not in http_headers: elif http_headers and "content-type" not in http_headers:
rfc3023_encoding = xml_encoding or 'iso-8859-1' rfc3023_encoding = xml_encoding or "iso-8859-1"
else: else:
rfc3023_encoding = xml_encoding or 'utf-8' rfc3023_encoding = xml_encoding or "utf-8"
# gb18030 is a superset of gb2312, so always replace gb2312 # gb18030 is a superset of gb2312, so always replace gb2312
# with gb18030 for greater compatibility. # with gb18030 for greater compatibility.
if rfc3023_encoding.lower() == 'gb2312': if rfc3023_encoding.lower() == "gb2312":
rfc3023_encoding = 'gb18030' rfc3023_encoding = "gb18030"
if xml_encoding.lower() == 'gb2312': if xml_encoding.lower() == "gb2312":
xml_encoding = 'gb18030' xml_encoding = "gb18030"
# there are four encodings to keep track of: # there are four encodings to keep track of:
# - http_encoding is the encoding declared in the Content-Type HTTP header # - http_encoding is the encoding declared in the Content-Type HTTP header
# - xml_encoding is the encoding declared in the <?xml declaration # - xml_encoding is the encoding declared in the <?xml declaration
# - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data # - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
# - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications # - rfc3023_encoding is the actual encoding, as per RFC 3023
error = None # and a variety of other conflicting specifications
error: FeedparserError | None = None
if http_headers and (not acceptable_content_type): if http_headers and (not acceptable_content_type):
if 'content-type' in http_headers: if "content-type" in http_headers:
msg = '%s is not an accepted media type' % http_headers['content-type'] msg = "%s is not an accepted media type" % http_headers["content-type"]
else: else:
msg = 'no Content-type specified' msg = "no Content-type specified"
error = NonXMLContentType(msg) error = NonXMLContentType(msg)
# determine character encoding # determine character encoding
known_encoding = 0 known_encoding = False
tried_encodings = [] tried_encodings = []
# try: HTTP encoding, declared XML encoding, encoding sniffed from BOM # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding, for encoding_to_try in (
lazy_chardet_encoding, 'utf-8', 'windows-1252', 'iso-8859-2'): rfc3023_encoding,
if callable(proposed_encoding): xml_encoding,
proposed_encoding = proposed_encoding(data) bom_encoding,
lazy_chardet_encoding,
"utf-8",
"windows-1252",
"iso-8859-2",
):
if callable(encoding_to_try):
proposed_encoding = encoding_to_try(data)
else:
proposed_encoding = encoding_to_try
if not proposed_encoding: if not proposed_encoding:
continue continue
if proposed_encoding in tried_encodings: if proposed_encoding in tried_encodings:
continue continue
tried_encodings.append(proposed_encoding) tried_encodings.append(proposed_encoding)
try: try:
data = data.decode(proposed_encoding) text = data.decode(proposed_encoding)
except (UnicodeDecodeError, LookupError): except (UnicodeDecodeError, LookupError):
pass continue
else:
known_encoding = 1 known_encoding = True
if not json: if not json:
# Update the encoding in the opening XML processing instruction. # Update the encoding in the opening XML processing instruction.
new_declaration = '''<?xml version='1.0' encoding='utf-8'?>''' new_declaration = """<?xml version='1.0' encoding='utf-8'?>"""
if RE_XML_DECLARATION.search(data): if RE_XML_DECLARATION.search(text):
data = RE_XML_DECLARATION.sub(new_declaration, data) text = RE_XML_DECLARATION.sub(new_declaration, text)
else: else:
data = new_declaration + '\n' + data text = new_declaration + "\n" + text
data = data.encode('utf-8') data = text.encode("utf-8")
break break
# if still no luck, give up # if still no luck, give up
if not known_encoding: if not known_encoding:
error = CharacterEncodingUnknown( error = CharacterEncodingUnknown(
'document encoding unknown, I tried ' + "document encoding unknown, I tried "
'%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' % + "%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked"
(rfc3023_encoding, xml_encoding)) % (rfc3023_encoding, xml_encoding)
rfc3023_encoding = '' )
rfc3023_encoding = ""
elif proposed_encoding != rfc3023_encoding: elif proposed_encoding != rfc3023_encoding:
error = CharacterEncodingOverride( error = CharacterEncodingOverride(
'document declared as %s, but parsed as %s' % "document declared as %s, but parsed as %s"
(rfc3023_encoding, proposed_encoding)) % (rfc3023_encoding, proposed_encoding)
)
rfc3023_encoding = proposed_encoding rfc3023_encoding = proposed_encoding
result['content-type'] = http_content_type # for selecting the parser result["content-type"] = http_content_type # for selecting the parser
result['encoding'] = rfc3023_encoding result["encoding"] = rfc3023_encoding
if error: if error:
result['bozo'] = True result["bozo"] = True
result['bozo_exception'] = error result["bozo_exception"] = error
return data return data
# How much to read from a binary file in order to detect encoding.
# In initial tests, 4k was enough for ~160 mostly-English feeds;
# 64k seems like a safe margin.
CONVERT_FILE_PREFIX_LEN = 2**16
# How much to read from a text file, and use as a UTF-8 bytes prefix.
# Note that no encoding detection is needed in this case.
CONVERT_FILE_STR_PREFIX_LEN = 2**13
CONVERT_FILE_TEST_CHUNK_LEN = 2**16
def convert_file_to_utf8(
    http_headers, file, result, optimistic_encoding_detection=True
):
    """Stream-oriented counterpart of convert_to_utf8().

    Rather than loading the whole file in memory, hand back a factory for
    text streams that decode the file on the fly. This should use much
    less memory, since the full contents are not (repeatedly) converted
    from bytes to str and back.

    Only a prefix of the contents is used for encoding detection.
    In rare cases the wrong encoding may be detected for that prefix;
    pass optimistic_encoding_detection=False to detect on the entire
    contents instead (equivalent to a plain convert_to_utf8() call).

    Args:
        http_headers (dict): The response headers.
        file (IO[bytes] or IO[str]): A read()-able (binary) stream.
        result (dict): The result dictionary.
        optimistic_encoding_detection (bool):
            If true, use only a prefix of the file content to detect encoding.

    Returns:
        StreamFactory: a stream factory, with the detected encoding set, if any

    """
    # The implementation deliberately delegates to convert_to_utf8():
    # its logic is too involved to re-implement safely for streams.

    # Text streams need no encoding detection, but a bytes prefix is still
    # run through convert_to_utf8() for its side effects on result
    # (e.g. sniffing / setting result['content-type']).
    if isinstance(file.read(0), str):
        text_head = file.read(CONVERT_FILE_STR_PREFIX_LEN)
        prefix = convert_to_utf8(http_headers, text_head.encode("utf-8"), result)
        result["encoding"] = "utf-8"
        return StreamFactory(prefix, file, "utf-8")

    remaining = file

    if optimistic_encoding_detection:
        prefix = convert_file_prefix_to_utf8(http_headers, file, result)
        factory = StreamFactory(prefix, file, result.get("encoding"))

        # Make sure the whole file decodes before trusting the factory.
        # Otherwise feedparser.parse() could raise UnicodeDecodeError
        # instead of reporting CharacterEncodingOverride through
        # bozo_exception, which would break the 6.x API.
        try:
            probe = factory.get_text_file()
        except MissingEncoding:
            return factory

        try:
            # chunked reads keep memory usage bounded
            while probe.read(CONVERT_FILE_TEST_CHUNK_LEN):
                pass
        except UnicodeDecodeError:
            # detection on the prefix was wrong; redo it on everything
            remaining = factory.get_binary_file()
        else:
            return factory

    # For BytesIO this should not increase memory usage, because BytesIO
    # does copy-on-write; https://bugs.python.org/issue22003
    data = convert_to_utf8(http_headers, remaining.read(), result)

    # data already holds everything, so it serves as the prefix
    # and the remaining stream is empty
    return StreamFactory(data, io.BytesIO(b""), result.get("encoding"))
def convert_file_prefix_to_utf8(
    http_headers,
    file: typing.IO[bytes],
    result,
    *,
    prefix_len: int = CONVERT_FILE_PREFIX_LEN,
    read_to_ascii_len: int = 2**8,
) -> bytes:
    """Run convert_to_utf8() on just the prefix of a binary file.

    result is updated the way convert_to_utf8() would update it.
    The converted prefix is returned as bytes.

    """

    def rank(candidate):
        # Sort key for failed attempts; the highest-ranked one is kept.
        outcome = candidate[-1]
        exc = outcome.get("bozo_exception")
        if isinstance(exc, NonXMLContentType):
            exc_score = 20
        elif isinstance(exc, CharacterEncodingOverride):
            exc_score = 10
        else:
            exc_score = 0
        # prefer utf- encodings to anything else
        return (exc_score, outcome.get("encoding").startswith("utf-"))

    # convert_to_utf8() detects the wrong encoding when the prefix ends
    # in the middle of a code point:
    #
    #   '😀'.encode('utf-8')      -> utf-8
    #   '😀'.encode('utf-8')[:-1] -> windows-1252 + bozo

    head = file.read(prefix_len - 1)

    # ending right after an ASCII byte makes it more likely
    # that the prefix stops on a code point boundary
    head += read_to_after_ascii_byte(file, read_to_ascii_len)

    # try up to 4 times, extending the prefix by one byte each time,
    # so that we eventually land on a code point boundary
    failures = []
    for attempt in range(4):
        extra = file.read(1)

        # end of file reached, and at least one attempt was already made
        if attempt != 0 and not extra:
            break

        head += extra

        fake_result: typing.Any = {}
        converted_prefix = convert_to_utf8(http_headers, head, fake_result)

        # detection succeeded; use this attempt
        if not fake_result.get("bozo"):
            break

        failures.append((file.tell(), converted_prefix, fake_result))

    # every attempt was bozo: settle for the "best" one
    # and rewind the file to where that attempt stopped reading
    else:
        offset, converted_prefix, fake_result = sorted(failures, key=rank)[-1]
        file.seek(offset)

    result.update(fake_result)
    return converted_prefix
def read_to_after_ascii_byte(file: typing.IO[bytes], max_len: int) -> bytes:
    """Read single bytes from file until just after the first ASCII byte.

    At most max_len bytes are read. The bytes read so far are returned
    when an ASCII byte (below 0x80) is found or the end of file is hit.
    If max_len bytes are read without finding an ASCII byte, the file is
    moved back to its starting offset and b"" is returned.

    """
    start = file.tell()
    pieces = []

    for _ in range(max_len):
        current = file.read(1)
        if current:
            pieces.append(current)

        # stop at end of file, or right after an ASCII character
        if not current or current < b"\x80":
            return b"".join(pieces)

    # no ASCII character within max_len bytes: undo the reads
    file.seek(start)
    return b""
class MissingEncoding(io.UnsupportedOperation):
    """Raised when a text stream is requested but no encoding is available."""
class StreamFactory:
    """Create streams over a prefix plus a stream whose encoding *may* be known.

    Text streams decode the underlying binary stream on the fly.
    When the underlying stream is seekable, the get_{text,binary}_file()
    methods may be called more than once.

    """

    def __init__(self, prefix: bytes, file, encoding=None):
        self.prefix = prefix
        self.file = ResetFileWrapper(file)
        self.encoding = encoding
        # False until the first stream is handed out; from then on,
        # the underlying file is rewound before each new stream
        self.should_reset = False

    def get_text_file(self, fallback_encoding=None, errors="strict"):
        """Return a text stream; raise MissingEncoding if no encoding is set."""
        chosen = self.encoding or fallback_encoding
        if chosen is None:
            raise MissingEncoding("cannot create text stream without encoding")

        underlying_is_text = isinstance(self.file.read(0), str)
        if underlying_is_text:
            head = self.prefix.decode(chosen)
            tail = self.file
        else:
            # the prefix is always decoded as utf-8;
            # only the rest of the stream uses the chosen encoding
            head = self.prefix.decode("utf-8", errors)
            tail = codecs.getreader(chosen)(self.file, errors)

        stitched = PrefixFileWrapper(head, tail)
        self.reset()
        return stitched

    def get_binary_file(self):
        """Return a binary stream; raise if the underlying stream is text."""
        underlying_is_text = isinstance(self.file.read(0), str)
        if underlying_is_text:
            raise io.UnsupportedOperation(
                "underlying stream is text, not binary"
            ) from None

        stitched = PrefixFileWrapper(self.prefix, self.file)
        self.reset()
        return stitched

    def get_file(self):
        """Return a text stream if possible, otherwise a binary stream."""
        try:
            stream = self.get_text_file()
        except MissingEncoding:
            stream = self.get_binary_file()
        return stream

    def reset(self):
        """Rewind the underlying file, except on the very first call."""
        rewind_needed = self.should_reset
        if rewind_needed:
            self.file.reset()
        self.should_reset = True
class ResetFileWrapper:
    """Remember a file's current position so it can be read again.

    Calling reset() on a seekable file moves it back to the position
    it had when the wrapper was created.

    """

    def __init__(self, file):
        try:
            start = file.tell()
        except OSError:
            # the position is not available for this stream
            start = None
        self.file = file
        self.file_initial_offset = start

    def read(self, size=-1):
        """Read from the wrapped file."""
        data = self.file.read(size)
        return data

    def reset(self):
        """Seek the wrapped file back to its initial offset."""
        # raises io.UnsupportedOperation if the underlying stream is not seekable
        initial = self.file_initial_offset
        self.file.seek(initial)
class PrefixFileWrapper:
    """Stitch a (possibly modified) prefix and a file into a new file object.

    The prefix is served first; once it is used up, reads continue
    from the file at its current position. The prefix must have the same
    type (str or bytes) as what the file's read() returns.

    >>> file = io.StringIO('abcdef')
    >>> file.read(2)
    'ab'
    >>> wrapped = PrefixFileWrapper(file.read(2).upper(), file)
    >>> wrapped.read()
    'CDef'

    """

    def __init__(self, prefix, file):
        self.prefix = prefix
        self.file = file
        # total number of items (characters or bytes) returned so far
        self.offset = 0

    def read(self, size=-1):
        """Read up to size items; read everything if size is negative or None.

        Returns a str or bytes object, matching the wrapped file.
        Fewer than size items are returned only at the end of the file.

        """
        if size is None:
            size = -1

        # read(0) gives an empty value of the file's own type (str or bytes)
        buffer = self.file.read(0)

        if self.offset < len(self.prefix):
            if size < 0:
                # only the part of the prefix not yet returned;
                # using the whole prefix here would repeat data
                # after an earlier partial read
                chunk = self.prefix[self.offset :]
            else:
                chunk = self.prefix[self.offset : self.offset + size]
                size -= len(chunk)
            buffer += chunk
            self.offset += len(chunk)

        while True:
            chunk = self.file.read(size)
            if not chunk:
                break
            buffer += chunk
            self.offset += len(chunk)

            # Stop once the request is satisfied, or after the single
            # read-everything call. Checking after the subtraction also
            # stops if the file returned more than requested; a negative
            # size must never be passed on, as it means "read everything".
            size -= len(chunk)
            if size <= 0:
                break

        return buffer

    def close(self):
        """Do nothing; the underlying stream is left open."""
        # do not touch the underlying stream
        pass

View file

@ -1,5 +1,5 @@
# Exceptions used throughout feedparser # Exceptions used throughout feedparser
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -27,11 +27,11 @@
# POSSIBILITY OF SUCH DAMAGE. # POSSIBILITY OF SUCH DAMAGE.
__all__ = [ __all__ = [
'FeedparserError', "FeedparserError",
'CharacterEncodingOverride', "CharacterEncodingOverride",
'CharacterEncodingUnknown', "CharacterEncodingUnknown",
'NonXMLContentType', "NonXMLContentType",
'UndeclaredNamespace', "UndeclaredNamespace",
] ]

View file

@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -28,36 +28,49 @@
import html.entities import html.entities
import re import re
import sgmllib3k as sgmllib # These items must all be imported into this module due to .__code__ replacements.
from .sgml import ( # noqa: F401
attrfind,
charref,
endbracket,
entityref,
incomplete,
interesting,
sgmllib,
shorttag,
shorttagopen,
starttagopen,
tagfind,
)
_cp1252 = { _cp1252 = {
128: '\u20ac', # euro sign 128: "\u20ac", # euro sign
130: '\u201a', # single low-9 quotation mark 130: "\u201a", # single low-9 quotation mark
131: '\u0192', # latin small letter f with hook 131: "\u0192", # latin small letter f with hook
132: '\u201e', # double low-9 quotation mark 132: "\u201e", # double low-9 quotation mark
133: '\u2026', # horizontal ellipsis 133: "\u2026", # horizontal ellipsis
134: '\u2020', # dagger 134: "\u2020", # dagger
135: '\u2021', # double dagger 135: "\u2021", # double dagger
136: '\u02c6', # modifier letter circumflex accent 136: "\u02c6", # modifier letter circumflex accent
137: '\u2030', # per mille sign 137: "\u2030", # per mille sign
138: '\u0160', # latin capital letter s with caron 138: "\u0160", # latin capital letter s with caron
139: '\u2039', # single left-pointing angle quotation mark 139: "\u2039", # single left-pointing angle quotation mark
140: '\u0152', # latin capital ligature oe 140: "\u0152", # latin capital ligature oe
142: '\u017d', # latin capital letter z with caron 142: "\u017d", # latin capital letter z with caron
145: '\u2018', # left single quotation mark 145: "\u2018", # left single quotation mark
146: '\u2019', # right single quotation mark 146: "\u2019", # right single quotation mark
147: '\u201c', # left double quotation mark 147: "\u201c", # left double quotation mark
148: '\u201d', # right double quotation mark 148: "\u201d", # right double quotation mark
149: '\u2022', # bullet 149: "\u2022", # bullet
150: '\u2013', # en dash 150: "\u2013", # en dash
151: '\u2014', # em dash 151: "\u2014", # em dash
152: '\u02dc', # small tilde 152: "\u02dc", # small tilde
153: '\u2122', # trade mark sign 153: "\u2122", # trade mark sign
154: '\u0161', # latin small letter s with caron 154: "\u0161", # latin small letter s with caron
155: '\u203a', # single right-pointing angle quotation mark 155: "\u203a", # single right-pointing angle quotation mark
156: '\u0153', # latin small ligature oe 156: "\u0153", # latin small ligature oe
158: '\u017e', # latin small letter z with caron 158: "\u017e", # latin small letter z with caron
159: '\u0178', # latin capital letter y with diaeresis 159: "\u0178", # latin capital letter y with diaeresis
} }
@ -65,28 +78,28 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
special = re.compile("""[<>'"]""") special = re.compile("""[<>'"]""")
bare_ampersand = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)") bare_ampersand = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
elements_no_end_tag = { elements_no_end_tag = {
'area', "area",
'base', "base",
'basefont', "basefont",
'br', "br",
'col', "col",
'command', "command",
'embed', "embed",
'frame', "frame",
'hr', "hr",
'img', "img",
'input', "input",
'isindex', "isindex",
'keygen', "keygen",
'link', "link",
'meta', "meta",
'param', "param",
'source', "source",
'track', "track",
'wbr', "wbr",
} }
def __init__(self, encoding=None, _type='application/xhtml+xml'): def __init__(self, encoding=None, _type="application/xhtml+xml"):
if encoding: if encoding:
self.encoding = encoding self.encoding = encoding
self._type = _type self._type = _type
@ -105,9 +118,9 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
tag = match.group(1) tag = match.group(1)
if tag in self.elements_no_end_tag: if tag in self.elements_no_end_tag:
return '<' + tag + ' />' return "<" + tag + " />"
else: else:
return '<' + tag + '></' + tag + '>' return "<" + tag + "></" + tag + ">"
# By declaring these methods and overriding their compiled code # By declaring these methods and overriding their compiled code
# with the code from sgmllib, the original code will execute in # with the code from sgmllib, the original code will execute in
@ -128,8 +141,8 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
def parse_starttag(self, i): def parse_starttag(self, i):
j = self.__parse_starttag(i) j = self.__parse_starttag(i)
if self._type == 'application/xhtml+xml': if self._type == "application/xhtml+xml":
if j > 2 and self.rawdata[j-2:j] == '/>': if j > 2 and self.rawdata[j - 2 : j] == "/>":
self.unknown_endtag(self.lasttag) self.unknown_endtag(self.lasttag)
return j return j
@ -139,10 +152,10 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
:rtype: None :rtype: None
""" """
data = re.sub(r'<!((?!DOCTYPE|--|\[))', r'&lt;!\1', data, re.IGNORECASE) data = re.sub(r"<!((?!DOCTYPE|--|\[))", r"&lt;!\1", data, re.IGNORECASE)
data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data) data = re.sub(r"<([^<>\s]+?)\s*/>", self._shorttag_replace, data)
data = data.replace('&#39;', "'") data = data.replace("&#39;", "'")
data = data.replace('&#34;', '"') data = data.replace("&#34;", '"')
super().feed(data) super().feed(data)
super().close() super().close()
@ -160,8 +173,7 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
# *attrs* into a dictionary, then convert it back to a list. # *attrs* into a dictionary, then convert it back to a list.
attrs_d = {k.lower(): v for k, v in attrs} attrs_d = {k.lower(): v for k, v in attrs}
attrs = [ attrs = [
(k, k in ('rel', 'type') and v.lower() or v) (k, k in ("rel", "type") and v.lower() or v) for k, v in attrs_d.items()
for k, v in attrs_d.items()
] ]
attrs.sort() attrs.sort()
return attrs return attrs
@ -177,22 +189,19 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
# attrs is a list of (attr, value) tuples # attrs is a list of (attr, value) tuples
# e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')] # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
uattrs = [] uattrs = []
strattrs = '' strattrs = ""
if attrs: if attrs:
for key, value in attrs: for key, value in attrs:
value = value.replace('>', '&gt;') value = value.replace(">", "&gt;")
value = value.replace('<', '&lt;') value = value.replace("<", "&lt;")
value = value.replace('"', '&quot;') value = value.replace('"', "&quot;")
value = self.bare_ampersand.sub("&amp;", value) value = self.bare_ampersand.sub("&amp;", value)
uattrs.append((key, value)) uattrs.append((key, value))
strattrs = ''.join( strattrs = "".join(f' {key}="{value}"' for key, value in uattrs)
' %s="%s"' % (key, value)
for key, value in uattrs
)
if tag in self.elements_no_end_tag: if tag in self.elements_no_end_tag:
self.pieces.append('<%s%s />' % (tag, strattrs)) self.pieces.append(f"<{tag}{strattrs} />")
else: else:
self.pieces.append('<%s%s>' % (tag, strattrs)) self.pieces.append(f"<{tag}{strattrs}>")
def unknown_endtag(self, tag): def unknown_endtag(self, tag):
""" """
@ -214,15 +223,15 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
# Called for each character reference, e.g. '&#160;' will extract '160' # Called for each character reference, e.g. '&#160;' will extract '160'
# Reconstruct the original character reference. # Reconstruct the original character reference.
ref = ref.lower() ref = ref.lower()
if ref.startswith('x'): if ref.startswith("x"):
value = int(ref[1:], 16) value = int(ref[1:], 16)
else: else:
value = int(ref) value = int(ref)
if value in _cp1252: if value in _cp1252:
self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:]) self.pieces.append("&#%s;" % hex(ord(_cp1252[value]))[1:])
else: else:
self.pieces.append('&#%s;' % ref) self.pieces.append("&#%s;" % ref)
def handle_entityref(self, ref): def handle_entityref(self, ref):
""" """
@ -232,10 +241,10 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
# Called for each entity reference, e.g. '&copy;' will extract 'copy' # Called for each entity reference, e.g. '&copy;' will extract 'copy'
# Reconstruct the original entity reference. # Reconstruct the original entity reference.
if ref in html.entities.name2codepoint or ref == 'apos': if ref in html.entities.name2codepoint or ref == "apos":
self.pieces.append('&%s;' % ref) self.pieces.append("&%s;" % ref)
else: else:
self.pieces.append('&amp;%s' % ref) self.pieces.append("&amp;%s" % ref)
def handle_data(self, text): def handle_data(self, text):
""" """
@ -256,7 +265,7 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
# Called for HTML comments, e.g. <!-- insert Javascript code here --> # Called for HTML comments, e.g. <!-- insert Javascript code here -->
# Reconstruct the original comment. # Reconstruct the original comment.
self.pieces.append('<!--%s-->' % text) self.pieces.append("<!--%s-->" % text)
def handle_pi(self, text): def handle_pi(self, text):
""" """
@ -266,7 +275,7 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
# Called for each processing instruction, e.g. <?instruction> # Called for each processing instruction, e.g. <?instruction>
# Reconstruct original processing instruction. # Reconstruct original processing instruction.
self.pieces.append('<?%s>' % text) self.pieces.append("<?%s>" % text)
def handle_decl(self, text): def handle_decl(self, text):
""" """
@ -278,9 +287,9 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
# <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
# "http://www.w3.org/TR/html4/loose.dtd"> # "http://www.w3.org/TR/html4/loose.dtd">
# Reconstruct original DOCTYPE # Reconstruct original DOCTYPE
self.pieces.append('<!%s>' % text) self.pieces.append("<!%s>" % text)
_new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match _new_declname_match = re.compile(r"[a-zA-Z][-_.a-zA-Z0-9:]*\s*").match
def _scan_name(self, i, declstartpos): def _scan_name(self, i, declstartpos):
""" """
@ -311,7 +320,7 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
:rtype: str :rtype: str
""" """
return '&#%s;' % name return "&#%s;" % name
def convert_entityref(self, name): def convert_entityref(self, name):
""" """
@ -319,7 +328,7 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
:rtype: str :rtype: str
""" """
return '&%s;' % name return "&%s;" % name
def output(self): def output(self):
"""Return processed HTML as a single string. """Return processed HTML as a single string.
@ -327,7 +336,7 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
:rtype: str :rtype: str
""" """
return ''.join(self.pieces) return "".join(self.pieces)
def parse_declaration(self, i): def parse_declaration(self, i):
""" """
@ -339,5 +348,5 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
return sgmllib.SGMLParser.parse_declaration(self, i) return sgmllib.SGMLParser.parse_declaration(self, i)
except (AssertionError, sgmllib.SGMLParseError): except (AssertionError, sgmllib.SGMLParseError):
# Escape the doctype declaration and continue parsing. # Escape the doctype declaration and continue parsing.
self.handle_data('&lt;') self.handle_data("&lt;")
return i+1 return i + 1

View file

@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -25,203 +25,54 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE. # POSSIBILITY OF SUCH DAMAGE.
import base64 from __future__ import annotations
import datetime
import gzip import typing
import io
import re import requests
import struct
import urllib.parse
import urllib.request
import zlib
from .datetimes import _parse_date from .datetimes import _parse_date
from .urls import convert_to_idn
# HTTP "Accept" header to send to servers when downloading feeds.
ACCEPT_HEADER: str = (
"application/atom+xml"
",application/rdf+xml"
",application/rss+xml"
",application/x-netcdf"
",application/xml"
";q=0.9,text/xml"
";q=0.2,*/*"
";q=0.1"
)
# HTTP "Accept" header to send to servers when downloading feeds. If you don't def get(url: str, result: dict[str, typing.Any]) -> bytes:
# want to send an Accept header, set this to None.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
class URLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPRedirectHandler, urllib.request.HTTPDefaultErrorHandler):
def http_error_default(self, req, fp, code, msg, headers):
# The default implementation just raises HTTPError.
# Forget that.
fp.status = code
return fp
def http_error_301(self, req, fp, code, msg, hdrs):
result = urllib.request.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, hdrs)
if not result:
return fp
result.status = code
result.newurl = result.geturl()
return result
# The default implementations in urllib.request.HTTPRedirectHandler
# are identical, so hardcoding a http_error_301 call above
# won't affect anything
http_error_300 = http_error_301
http_error_302 = http_error_301
http_error_303 = http_error_301
http_error_307 = http_error_301
def http_error_401(self, req, fp, code, msg, headers):
# Check if
# - server requires digest auth, AND
# - we tried (unsuccessfully) with basic auth, AND
# If all conditions hold, parse authentication information
# out of the Authorization header we sent the first time
# (for the username and password) and the WWW-Authenticate
# header the server sent back (for the realm) and retry
# the request with the appropriate digest auth headers instead.
# This evil genius hack has been brought to you by Aaron Swartz.
host = urllib.parse.urlparse(req.get_full_url())[1]
if 'Authorization' not in req.headers or 'WWW-Authenticate' not in headers:
return self.http_error_default(req, fp, code, msg, headers)
auth = base64.decodebytes(req.headers['Authorization'].split(' ')[1].encode()).decode()
user, passw = auth.split(':')
realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
self.add_password(realm, host, user, passw)
retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
self.reset_retry_count()
return retry
def _build_urllib2_request(url, agent, accept_header, etag, modified, referrer, auth, request_headers):
request = urllib.request.Request(url)
request.add_header('User-Agent', agent)
if etag:
request.add_header('If-None-Match', etag)
if isinstance(modified, str):
modified = _parse_date(modified)
elif isinstance(modified, datetime.datetime):
modified = modified.utctimetuple()
if modified:
# format into an RFC 1123-compliant timestamp. We can't use
# time.strftime() since the %a and %b directives can be affected
# by the current locale, but RFC 2616 states that dates must be
# in English.
short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
if referrer:
request.add_header('Referer', referrer)
request.add_header('Accept-encoding', 'gzip, deflate')
if auth:
request.add_header('Authorization', 'Basic %s' % auth)
if accept_header:
request.add_header('Accept', accept_header)
# use this for whatever -- cookies, special headers, etc
# [('Cookie','Something'),('x-special-header','Another Value')]
for header_name, header_value in request_headers.items():
request.add_header(header_name, header_value)
request.add_header('A-IM', 'feed') # RFC 3229 support
return request
def get(url, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, result=None):
if handlers is None:
handlers = []
elif not isinstance(handlers, list):
handlers = [handlers]
if request_headers is None:
request_headers = {}
# Deal with the feed URI scheme
if url.startswith('feed:http'):
url = url[5:]
elif url.startswith('feed:'):
url = 'http:' + url[5:]
if not agent:
from . import USER_AGENT from . import USER_AGENT
agent = USER_AGENT agent = USER_AGENT
# Test for inline user:password credentials for HTTP basic auth
auth = None
if not url.startswith('ftp:'):
url_pieces = urllib.parse.urlparse(url)
if url_pieces.username:
new_pieces = list(url_pieces)
new_pieces[1] = url_pieces.hostname
if url_pieces.port:
new_pieces[1] = f'{url_pieces.hostname}:{url_pieces.port}'
url = urllib.parse.urlunparse(new_pieces)
auth = base64.standard_b64encode(f'{url_pieces.username}:{url_pieces.password}'.encode()).decode()
# iri support
if not isinstance(url, bytes):
url = convert_to_idn(url)
# Prevent UnicodeEncodeErrors caused by Unicode characters in the path.
bits = []
for c in url:
try: try:
c.encode('ascii') response = requests.get(
except UnicodeEncodeError: url,
bits.append(urllib.parse.quote(c)) headers={"User-Agent": agent, "Accept": ACCEPT_HEADER},
else: timeout=10,
bits.append(c) )
url = ''.join(bits) except requests.RequestException as exception:
result["bozo"] = True
result["bozo_exception"] = exception
return b""
# try to open with urllib2 (to use optional headers) # Lowercase the HTTP header keys for comparisons per RFC 2616.
request = _build_urllib2_request(url, agent, ACCEPT_HEADER, etag, modified, referrer, auth, request_headers) result["headers"] = {k.lower(): v for k, v in response.headers.items()}
opener = urllib.request.build_opener(*tuple(handlers + [URLHandler()]))
opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
f = opener.open(request)
data = f.read()
f.close()
# lowercase all of the HTTP headers for comparisons per RFC 2616
result['headers'] = {k.lower(): v for k, v in f.headers.items()}
# if feed is gzip-compressed, decompress it
if data and 'gzip' in result['headers'].get('content-encoding', ''):
try:
data = gzip.GzipFile(fileobj=io.BytesIO(data)).read()
except (EOFError, IOError, struct.error) as e:
# IOError can occur if the gzip header is bad.
# struct.error can occur if the data is damaged.
result['bozo'] = True
result['bozo_exception'] = e
if isinstance(e, struct.error):
# A gzip header was found but the data is corrupt.
# Ideally, we should re-request the feed without the
# 'Accept-encoding: gzip' header, but we don't.
data = None
elif data and 'deflate' in result['headers'].get('content-encoding', ''):
try:
data = zlib.decompress(data)
except zlib.error:
try:
# The data may have no headers and no checksum.
data = zlib.decompress(data, -15)
except zlib.error as e:
result['bozo'] = True
result['bozo_exception'] = e
# save HTTP headers # save HTTP headers
if 'etag' in result['headers']: if "etag" in result["headers"]:
etag = result['headers'].get('etag', '') result["etag"] = result["headers"]["etag"]
if isinstance(etag, bytes): if "last-modified" in result["headers"]:
etag = etag.decode('utf-8', 'ignore') modified = result["headers"]["last-modified"]
if etag:
result['etag'] = etag
if 'last-modified' in result['headers']:
modified = result['headers'].get('last-modified', '')
if modified: if modified:
result['modified'] = modified result["modified"] = modified
result['modified_parsed'] = _parse_date(modified) result["modified_parsed"] = _parse_date(modified)
if isinstance(f.url, bytes): result["href"] = response.url
result['href'] = f.url.decode('utf-8', 'ignore') result["status"] = response.status_code
else: return response.content
result['href'] = f.url
result['status'] = getattr(f, 'status', None) or 200
# Stop processing if the server sent HTTP 304 Not Modified.
if getattr(f, 'code', 0) == 304:
result['version'] = ''
result['debug_message'] = 'The feed has not changed since you last checked, ' + \
'so the server sent no data. This is a feature, not a bug!'
return data

View file

@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -30,14 +30,20 @@ import binascii
import copy import copy
import html.entities import html.entities
import re import re
from typing import Dict
import xml.sax.saxutils import xml.sax.saxutils
from typing import Dict
from .html import _cp1252 from .html import _cp1252
from .namespaces import _base, cc, dc, georss, itunes, mediarss, psc from .namespaces import _base, cc, dc, georss, itunes, mediarss, psc
from .sanitizer import sanitize_html, HTMLSanitizer from .sanitizer import HTMLSanitizer, sanitize_html
from .util import FeedParserDict
from .urls import _urljoin, make_safe_absolute_uri, resolve_relative_uris from .urls import _urljoin, make_safe_absolute_uri, resolve_relative_uris
from .util import FeedParserDict
email_pattern = re.compile(
r"(([a-zA-Z0-9_.+-]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)"
r"|(([a-zA-Z0-9-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(]?))"
r"(\?subject=\S+)?"
)
class XMLParserMixin( class XMLParserMixin(
@ -50,117 +56,118 @@ class XMLParserMixin(
psc.Namespace, psc.Namespace,
): ):
namespaces = { namespaces = {
'': '', "": "",
'http://backend.userland.com/rss': '', "http://backend.userland.com/rss": "",
'http://blogs.law.harvard.edu/tech/rss': '', "http://blogs.law.harvard.edu/tech/rss": "",
'http://purl.org/rss/1.0/': '', "http://purl.org/rss/1.0/": "",
'http://my.netscape.com/rdf/simple/0.9/': '', "http://my.netscape.com/rdf/simple/0.9/": "",
'http://example.com/newformat#': '', "http://example.com/newformat#": "",
'http://example.com/necho': '', "http://example.com/necho": "",
'http://purl.org/echo/': '', "http://purl.org/echo/": "",
'uri/of/echo/namespace#': '', "uri/of/echo/namespace#": "",
'http://purl.org/pie/': '', "http://purl.org/pie/": "",
'http://purl.org/atom/ns#': '', "http://purl.org/atom/ns#": "",
'http://www.w3.org/2005/Atom': '', "http://www.w3.org/2005/Atom": "",
'http://purl.org/rss/1.0/modules/rss091#': '', "http://purl.org/rss/1.0/modules/rss091#": "",
"http://webns.net/mvcb/": "admin",
'http://webns.net/mvcb/': 'admin', "http://purl.org/rss/1.0/modules/aggregation/": "ag",
'http://purl.org/rss/1.0/modules/aggregation/': 'ag', "http://purl.org/rss/1.0/modules/annotate/": "annotate",
'http://purl.org/rss/1.0/modules/annotate/': 'annotate', "http://media.tangent.org/rss/1.0/": "audio",
'http://media.tangent.org/rss/1.0/': 'audio', "http://backend.userland.com/blogChannelModule": "blogChannel",
'http://backend.userland.com/blogChannelModule': 'blogChannel', "http://creativecommons.org/ns#license": "cc",
'http://creativecommons.org/ns#license': 'cc', "http://web.resource.org/cc/": "cc",
'http://web.resource.org/cc/': 'cc', "http://cyber.law.harvard.edu/rss/creativeCommonsRssModule.html": (
'http://cyber.law.harvard.edu/rss/creativeCommonsRssModule.html': 'creativeCommons', "creativeCommons"
'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons', ),
'http://purl.org/rss/1.0/modules/company': 'co', "http://backend.userland.com/creativeCommonsRssModule": "creativeCommons",
'http://purl.org/rss/1.0/modules/content/': 'content', "http://purl.org/rss/1.0/modules/company": "co",
'http://my.theinfo.org/changed/1.0/rss/': 'cp', "http://purl.org/rss/1.0/modules/content/": "content",
'http://purl.org/dc/elements/1.1/': 'dc', "http://my.theinfo.org/changed/1.0/rss/": "cp",
'http://purl.org/dc/terms/': 'dcterms', "http://purl.org/dc/elements/1.1/": "dc",
'http://purl.org/rss/1.0/modules/email/': 'email', "http://purl.org/dc/terms/": "dcterms",
'http://purl.org/rss/1.0/modules/event/': 'ev', "http://purl.org/rss/1.0/modules/email/": "email",
'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner', "http://purl.org/rss/1.0/modules/event/": "ev",
'http://freshmeat.net/rss/fm/': 'fm', "http://rssnamespace.org/feedburner/ext/1.0": "feedburner",
'http://xmlns.com/foaf/0.1/': 'foaf', "http://freshmeat.net/rss/fm/": "fm",
'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo', "http://xmlns.com/foaf/0.1/": "foaf",
'http://www.georss.org/georss': 'georss', "http://www.w3.org/2003/01/geo/wgs84_pos#": "geo",
'http://www.opengis.net/gml': 'gml', "http://www.georss.org/georss": "georss",
'http://postneo.com/icbm/': 'icbm', "http://www.opengis.net/gml": "gml",
'http://purl.org/rss/1.0/modules/image/': 'image', "http://postneo.com/icbm/": "icbm",
'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes', "http://purl.org/rss/1.0/modules/image/": "image",
'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes', "http://www.itunes.com/DTDs/PodCast-1.0.dtd": "itunes",
'http://purl.org/rss/1.0/modules/link/': 'l', "http://example.com/DTDs/PodCast-1.0.dtd": "itunes",
'http://search.yahoo.com/mrss': 'media', "http://purl.org/rss/1.0/modules/link/": "l",
"http://search.yahoo.com/mrss": "media",
# Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace # Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
'http://search.yahoo.com/mrss/': 'media', "http://search.yahoo.com/mrss/": "media",
'http://madskills.com/public/xml/rss/module/pingback/': 'pingback', "http://madskills.com/public/xml/rss/module/pingback/": "pingback",
'http://prismstandard.org/namespaces/1.2/basic/': 'prism', "http://prismstandard.org/namespaces/1.2/basic/": "prism",
'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf', "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
'http://www.w3.org/2000/01/rdf-schema#': 'rdfs', "http://www.w3.org/2000/01/rdf-schema#": "rdfs",
'http://purl.org/rss/1.0/modules/reference/': 'ref', "http://purl.org/rss/1.0/modules/reference/": "ref",
'http://purl.org/rss/1.0/modules/richequiv/': 'reqv', "http://purl.org/rss/1.0/modules/richequiv/": "reqv",
'http://purl.org/rss/1.0/modules/search/': 'search', "http://purl.org/rss/1.0/modules/search/": "search",
'http://purl.org/rss/1.0/modules/slash/': 'slash', "http://purl.org/rss/1.0/modules/slash/": "slash",
'http://schemas.xmlsoap.org/soap/envelope/': 'soap', "http://schemas.xmlsoap.org/soap/envelope/": "soap",
'http://purl.org/rss/1.0/modules/servicestatus/': 'ss', "http://purl.org/rss/1.0/modules/servicestatus/": "ss",
'http://hacks.benhammersley.com/rss/streaming/': 'str', "http://hacks.benhammersley.com/rss/streaming/": "str",
'http://purl.org/rss/1.0/modules/subscription/': 'sub', "http://purl.org/rss/1.0/modules/subscription/": "sub",
'http://purl.org/rss/1.0/modules/syndication/': 'sy', "http://purl.org/rss/1.0/modules/syndication/": "sy",
'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf', "http://schemas.pocketsoap.com/rss/myDescModule/": "szf",
'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo', "http://purl.org/rss/1.0/modules/taxonomy/": "taxo",
'http://purl.org/rss/1.0/modules/threading/': 'thr', "http://purl.org/rss/1.0/modules/threading/": "thr",
'http://purl.org/rss/1.0/modules/textinput/': 'ti', "http://purl.org/rss/1.0/modules/textinput/": "ti",
'http://madskills.com/public/xml/rss/module/trackback/': 'trackback', "http://madskills.com/public/xml/rss/module/trackback/": "trackback",
'http://wellformedweb.org/commentAPI/': 'wfw', "http://wellformedweb.org/commentAPI/": "wfw",
'http://purl.org/rss/1.0/modules/wiki/': 'wiki', "http://purl.org/rss/1.0/modules/wiki/": "wiki",
'http://www.w3.org/1999/xhtml': 'xhtml', "http://www.w3.org/1999/xhtml": "xhtml",
'http://www.w3.org/1999/xlink': 'xlink', "http://www.w3.org/1999/xlink": "xlink",
'http://www.w3.org/XML/1998/namespace': 'xml', "http://www.w3.org/XML/1998/namespace": "xml",
'http://podlove.org/simple-chapters': 'psc', "http://podlove.org/simple-chapters": "psc",
} }
_matchnamespaces: Dict[str, str] = {} _matchnamespaces: Dict[str, str] = {}
can_be_relative_uri = { can_be_relative_uri = {
'comments', "comments",
'docs', "docs",
'href', "href",
'icon', "icon",
'id', "id",
'link', "link",
'logo', "logo",
'url', "url",
'wfw_comment', "wfw_comment",
'wfw_commentrss', "wfw_commentrss",
} }
can_contain_relative_uris = { can_contain_relative_uris = {
'content', "content",
'copyright', "copyright",
'description', "description",
'info', "info",
'rights', "rights",
'subtitle', "subtitle",
'summary', "summary",
'tagline', "tagline",
'title', "title",
} }
can_contain_dangerous_markup = { can_contain_dangerous_markup = {
'content', "content",
'copyright', "copyright",
'description', "description",
'info', "info",
'rights', "rights",
'subtitle', "subtitle",
'summary', "summary",
'tagline', "tagline",
'title', "title",
} }
html_types = { html_types = {
'application/xhtml+xml', "application/xhtml+xml",
'text/html', "text/html",
} }
def __init__(self): def __init__(self):
@ -169,7 +176,7 @@ class XMLParserMixin(
self._matchnamespaces[k.lower()] = v self._matchnamespaces[k.lower()] = v
self.feeddata = FeedParserDict() # feed-level data self.feeddata = FeedParserDict() # feed-level data
self.entries = [] # list of entry-level data self.entries = [] # list of entry-level data
self.version = '' # feed type/version, see SUPPORTED_VERSIONS self.version = "" # feed type/version, see SUPPORTED_VERSIONS
self.namespaces_in_use = {} # dictionary of namespaces defined by the feed self.namespaces_in_use = {} # dictionary of namespaces defined by the feed
self.resolve_relative_uris = False self.resolve_relative_uris = False
self.sanitize_html = False self.sanitize_html = False
@ -198,7 +205,7 @@ class XMLParserMixin(
self.depth = 0 self.depth = 0
self.hasContent = 0 self.hasContent = 0
if self.lang: if self.lang:
self.feeddata['language'] = self.lang.replace('_', '-') self.feeddata["language"] = self.lang.replace("_", "-")
# A map of the following form: # A map of the following form:
# { # {
@ -208,7 +215,7 @@ class XMLParserMixin(
# }, # },
# } # }
self.property_depth_map = {} self.property_depth_map = {}
super(XMLParserMixin, self).__init__() super().__init__()
def _normalize_attributes(self, kv): def _normalize_attributes(self, kv):
raise NotImplementedError raise NotImplementedError
@ -222,72 +229,80 @@ class XMLParserMixin(
# track xml:base and xml:lang # track xml:base and xml:lang
attrs_d = dict(attrs) attrs_d = dict(attrs)
baseuri = attrs_d.get('xml:base', attrs_d.get('base')) or self.baseuri baseuri = attrs_d.get("xml:base", attrs_d.get("base")) or self.baseuri
if isinstance(baseuri, bytes): if isinstance(baseuri, bytes):
baseuri = baseuri.decode(self.encoding, 'ignore') baseuri = baseuri.decode(self.encoding, "ignore")
# ensure that self.baseuri is always an absolute URI that # ensure that self.baseuri is always an absolute URI that
# uses a whitelisted URI scheme (e.g. not `javscript:`) # uses a whitelisted URI scheme (e.g. not `javscript:`)
if self.baseuri: if self.baseuri:
self.baseuri = make_safe_absolute_uri(self.baseuri, baseuri) or self.baseuri self.baseuri = make_safe_absolute_uri(self.baseuri, baseuri) or self.baseuri
else: else:
self.baseuri = _urljoin(self.baseuri, baseuri) self.baseuri = _urljoin(self.baseuri, baseuri)
lang = attrs_d.get('xml:lang', attrs_d.get('lang')) lang = attrs_d.get("xml:lang", attrs_d.get("lang"))
if lang == '': if lang == "":
# xml:lang could be explicitly set to '', we need to capture that # xml:lang could be explicitly set to '', we need to capture that
lang = None lang = None
elif lang is None: elif lang is None:
# if no xml:lang is specified, use parent lang # if no xml:lang is specified, use parent lang
lang = self.lang lang = self.lang
if lang: if lang:
if tag in ('feed', 'rss', 'rdf:RDF'): if tag in ("feed", "rss", "rdf:RDF"):
self.feeddata['language'] = lang.replace('_', '-') self.feeddata["language"] = lang.replace("_", "-")
self.lang = lang self.lang = lang
self.basestack.append(self.baseuri) self.basestack.append(self.baseuri)
self.langstack.append(lang) self.langstack.append(lang)
# track namespaces # track namespaces
for prefix, uri in attrs: for prefix, uri in attrs:
if prefix.startswith('xmlns:'): if prefix.startswith("xmlns:"):
self.track_namespace(prefix[6:], uri) self.track_namespace(prefix[6:], uri)
elif prefix == 'xmlns': elif prefix == "xmlns":
self.track_namespace(None, uri) self.track_namespace(None, uri)
# track inline content # track inline content
if self.incontent and not self.contentparams.get('type', 'xml').endswith('xml'): if self.incontent and not self.contentparams.get("type", "xml").endswith("xml"):
if tag in ('xhtml:div', 'div'): if tag in ("xhtml:div", "div"):
return # typepad does this 10/2007 return # typepad does this 10/2007
# element declared itself as escaped markup, but it isn't really # element declared itself as escaped markup, but it isn't really
self.contentparams['type'] = 'application/xhtml+xml' self.contentparams["type"] = "application/xhtml+xml"
if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml': if self.incontent and self.contentparams.get("type") == "application/xhtml+xml":
if tag.find(':') != -1: if tag.find(":") != -1:
prefix, tag = tag.split(':', 1) prefix, tag = tag.split(":", 1)
namespace = self.namespaces_in_use.get(prefix, '') namespace = self.namespaces_in_use.get(prefix, "")
if tag == 'math' and namespace == 'http://www.w3.org/1998/Math/MathML': if tag == "math" and namespace == "http://www.w3.org/1998/Math/MathML":
attrs.append(('xmlns', namespace)) attrs.append(("xmlns", namespace))
if tag == 'svg' and namespace == 'http://www.w3.org/2000/svg': if tag == "svg" and namespace == "http://www.w3.org/2000/svg":
attrs.append(('xmlns', namespace)) attrs.append(("xmlns", namespace))
if tag == 'svg': if tag == "svg":
self.svgOK += 1 self.svgOK += 1
return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0) return self.handle_data(f"<{tag}{self.strattrs(attrs)}>", escape=0)
# match namespaces # match namespaces
if tag.find(':') != -1: if tag.find(":") != -1:
prefix, suffix = tag.split(':', 1) prefix, suffix = tag.split(":", 1)
else: else:
prefix, suffix = '', tag prefix, suffix = "", tag
prefix = self.namespacemap.get(prefix, prefix) prefix = self.namespacemap.get(prefix, prefix)
if prefix: if prefix:
prefix = prefix + '_' prefix = prefix + "_"
# Special hack for better tracking of empty textinput/image elements in # Special hack for better tracking of empty textinput/image elements in
# illformed feeds. # illformed feeds.
if (not prefix) and tag not in ('title', 'link', 'description', 'name'): if (not prefix) and tag not in ("title", "link", "description", "name"):
self.intextinput = 0 self.intextinput = 0
if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'): if (not prefix) and tag not in (
"title",
"link",
"description",
"url",
"href",
"width",
"height",
):
self.inimage = 0 self.inimage = 0
# call special handler (if defined) or default handler # call special handler (if defined) or default handler
methodname = '_start_' + prefix + suffix methodname = "_start_" + prefix + suffix
try: try:
method = getattr(self, methodname) method = getattr(self, methodname)
return method(attrs_d) return method(attrs_d)
@ -305,18 +320,18 @@ class XMLParserMixin(
def unknown_endtag(self, tag): def unknown_endtag(self, tag):
# match namespaces # match namespaces
if tag.find(':') != -1: if tag.find(":") != -1:
prefix, suffix = tag.split(':', 1) prefix, suffix = tag.split(":", 1)
else: else:
prefix, suffix = '', tag prefix, suffix = "", tag
prefix = self.namespacemap.get(prefix, prefix) prefix = self.namespacemap.get(prefix, prefix)
if prefix: if prefix:
prefix = prefix + '_' prefix = prefix + "_"
if suffix == 'svg' and self.svgOK: if suffix == "svg" and self.svgOK:
self.svgOK -= 1 self.svgOK -= 1
# call special handler (if defined) or default handler # call special handler (if defined) or default handler
methodname = '_end_' + prefix + suffix methodname = "_end_" + prefix + suffix
try: try:
if self.svgOK: if self.svgOK:
raise AttributeError() raise AttributeError()
@ -326,14 +341,14 @@ class XMLParserMixin(
self.pop(prefix + suffix) self.pop(prefix + suffix)
# track inline content # track inline content
if self.incontent and not self.contentparams.get('type', 'xml').endswith('xml'): if self.incontent and not self.contentparams.get("type", "xml").endswith("xml"):
# element declared itself as escaped markup, but it isn't really # element declared itself as escaped markup, but it isn't really
if tag in ('xhtml:div', 'div'): if tag in ("xhtml:div", "div"):
return # typepad does this 10/2007 return # typepad does this 10/2007
self.contentparams['type'] = 'application/xhtml+xml' self.contentparams["type"] = "application/xhtml+xml"
if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml': if self.incontent and self.contentparams.get("type") == "application/xhtml+xml":
tag = tag.split(':')[-1] tag = tag.split(":")[-1]
self.handle_data('</%s>' % tag, escape=0) self.handle_data("</%s>" % tag, escape=0)
# track xml:base and xml:lang going out of scope # track xml:base and xml:lang going out of scope
if self.basestack: if self.basestack:
@ -352,33 +367,33 @@ class XMLParserMixin(
if not self.elementstack: if not self.elementstack:
return return
ref = ref.lower() ref = ref.lower()
if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'): if ref in ("34", "38", "39", "60", "62", "x22", "x26", "x27", "x3c", "x3e"):
text = '&#%s;' % ref text = "&#%s;" % ref
else: else:
if ref[0] == 'x': if ref[0] == "x":
c = int(ref[1:], 16) c = int(ref[1:], 16)
else: else:
c = int(ref) c = int(ref)
text = chr(c).encode('utf-8') text = chr(c).encode("utf-8")
self.elementstack[-1][2].append(text) self.elementstack[-1][2].append(text)
def handle_entityref(self, ref): def handle_entityref(self, ref):
# Called for each entity reference, e.g. for '&copy;', ref is 'copy' # Called for each entity reference, e.g. for '&copy;', ref is 'copy'
if not self.elementstack: if not self.elementstack:
return return
if ref in ('lt', 'gt', 'quot', 'amp', 'apos'): if ref in ("lt", "gt", "quot", "amp", "apos"):
text = '&%s;' % ref text = "&%s;" % ref
elif ref in self.entities: elif ref in self.entities:
text = self.entities[ref] text = self.entities[ref]
if text.startswith('&#') and text.endswith(';'): if text.startswith("&#") and text.endswith(";"):
return self.handle_entityref(text) return self.handle_entityref(text)
else: else:
try: try:
html.entities.name2codepoint[ref] html.entities.name2codepoint[ref]
except KeyError: except KeyError:
text = '&%s;' % ref text = "&%s;" % ref
else: else:
text = chr(html.entities.name2codepoint[ref]).encode('utf-8') text = chr(html.entities.name2codepoint[ref]).encode("utf-8")
self.elementstack[-1][2].append(text) self.elementstack[-1][2].append(text)
def handle_data(self, text, escape=1): def handle_data(self, text, escape=1):
@ -386,7 +401,7 @@ class XMLParserMixin(
# not containing any character or entity references # not containing any character or entity references
if not self.elementstack: if not self.elementstack:
return return
if escape and self.contentparams.get('type') == 'application/xhtml+xml': if escape and self.contentparams.get("type") == "application/xhtml+xml":
text = xml.sax.saxutils.escape(text) text = xml.sax.saxutils.escape(text)
self.elementstack[-1][2].append(text) self.elementstack[-1][2].append(text)
@ -403,18 +418,18 @@ class XMLParserMixin(
def parse_declaration(self, i): def parse_declaration(self, i):
# Override internal declaration handler to handle CDATA blocks. # Override internal declaration handler to handle CDATA blocks.
if self.rawdata[i:i+9] == '<![CDATA[': if self.rawdata[i : i + 9] == "<![CDATA[":
k = self.rawdata.find(']]>', i) k = self.rawdata.find("]]>", i)
if k == -1: if k == -1:
# CDATA block began but didn't finish # CDATA block began but didn't finish
k = len(self.rawdata) k = len(self.rawdata)
return k return k
self.handle_data(xml.sax.saxutils.escape(self.rawdata[i+9:k]), 0) self.handle_data(xml.sax.saxutils.escape(self.rawdata[i + 9 : k]), 0)
return k+3 return k + 3
else: else:
k = self.rawdata.find('>', i) k = self.rawdata.find(">", i)
if k >= 0: if k >= 0:
return k+1 return k + 1
else: else:
# We have an incomplete CDATA block. # We have an incomplete CDATA block.
return k return k
@ -422,35 +437,35 @@ class XMLParserMixin(
@staticmethod @staticmethod
def map_content_type(content_type): def map_content_type(content_type):
content_type = content_type.lower() content_type = content_type.lower()
if content_type == 'text' or content_type == 'plain': if content_type == "text" or content_type == "plain":
content_type = 'text/plain' content_type = "text/plain"
elif content_type == 'html': elif content_type == "html":
content_type = 'text/html' content_type = "text/html"
elif content_type == 'xhtml': elif content_type == "xhtml":
content_type = 'application/xhtml+xml' content_type = "application/xhtml+xml"
return content_type return content_type
def track_namespace(self, prefix, uri): def track_namespace(self, prefix, uri):
loweruri = uri.lower() loweruri = uri.lower()
if not self.version: if not self.version:
if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/'): if (prefix, loweruri) == (None, "http://my.netscape.com/rdf/simple/0.9/"):
self.version = 'rss090' self.version = "rss090"
elif loweruri == 'http://purl.org/rss/1.0/': elif loweruri == "http://purl.org/rss/1.0/":
self.version = 'rss10' self.version = "rss10"
elif loweruri == 'http://www.w3.org/2005/atom': elif loweruri == "http://www.w3.org/2005/atom":
self.version = 'atom10' self.version = "atom10"
if loweruri.find('backend.userland.com/rss') != -1: if loweruri.find("backend.userland.com/rss") != -1:
# match any backend.userland.com namespace # match any backend.userland.com namespace
uri = 'http://backend.userland.com/rss' uri = "http://backend.userland.com/rss"
loweruri = uri loweruri = uri
if loweruri in self._matchnamespaces: if loweruri in self._matchnamespaces:
self.namespacemap[prefix] = self._matchnamespaces[loweruri] self.namespacemap[prefix] = self._matchnamespaces[loweruri]
self.namespaces_in_use[self._matchnamespaces[loweruri]] = uri self.namespaces_in_use[self._matchnamespaces[loweruri]] = uri
else: else:
self.namespaces_in_use[prefix or ''] = uri self.namespaces_in_use[prefix or ""] = uri
def resolve_uri(self, uri): def resolve_uri(self, uri):
return _urljoin(self.baseuri or '', uri) return _urljoin(self.baseuri or "", uri)
@staticmethod @staticmethod
def decode_entities(element, data): def decode_entities(element, data):
@ -458,8 +473,8 @@ class XMLParserMixin(
@staticmethod @staticmethod
def strattrs(attrs): def strattrs(attrs):
return ''.join( return "".join(
' %s="%s"' % (t[0], xml.sax.saxutils.escape(t[1], {'"': '&quot;'})) ' {}="{}"'.format(t[0], xml.sax.saxutils.escape(t[1], {'"': "&quot;"}))
for t in attrs for t in attrs
) )
@ -475,11 +490,14 @@ class XMLParserMixin(
element, expecting_text, pieces = self.elementstack.pop() element, expecting_text, pieces = self.elementstack.pop()
# Ensure each piece is a str for Python 3 # Ensure each piece is a str for Python 3
for (i, v) in enumerate(pieces): for i, v in enumerate(pieces):
if isinstance(v, bytes): if isinstance(v, bytes):
pieces[i] = v.decode('utf-8') pieces[i] = v.decode("utf-8")
if self.version == 'atom10' and self.contentparams.get('type', 'text') == 'application/xhtml+xml': if (
self.version == "atom10"
and self.contentparams.get("type", "text") == "application/xhtml+xml"
):
# remove enclosing child element, but only if it is a <div> and # remove enclosing child element, but only if it is a <div> and
# only if all the remaining content is nested underneath it. # only if all the remaining content is nested underneath it.
# This means that the divs would be retained in the following: # This means that the divs would be retained in the following:
@ -488,76 +506,95 @@ class XMLParserMixin(
del pieces[-1] del pieces[-1]
while pieces and len(pieces) > 1 and not pieces[0].strip(): while pieces and len(pieces) > 1 and not pieces[0].strip():
del pieces[0] del pieces[0]
if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1] == '</div>': if (
pieces
and (pieces[0] == "<div>" or pieces[0].startswith("<div "))
and pieces[-1] == "</div>"
):
depth = 0 depth = 0
for piece in pieces[:-1]: for piece in pieces[:-1]:
if piece.startswith('</'): if piece.startswith("</"):
depth -= 1 depth -= 1
if depth == 0: if depth == 0:
break break
elif piece.startswith('<') and not piece.endswith('/>'): elif piece.startswith("<") and not piece.endswith("/>"):
depth += 1 depth += 1
else: else:
pieces = pieces[1:-1] pieces = pieces[1:-1]
output = ''.join(pieces) output = "".join(pieces)
if strip_whitespace: if strip_whitespace:
output = output.strip() output = output.strip()
if not expecting_text: if not expecting_text:
return output return output
# decode base64 content # decode base64 content
if base64 and self.contentparams.get('base64', 0): if base64 and self.contentparams.get("base64", 0):
try: try:
output = base64.decodebytes(output.encode('utf8')).decode('utf8') output = base64.decodebytes(output.encode("utf8")).decode("utf8")
except (binascii.Error, binascii.Incomplete, UnicodeDecodeError): except (binascii.Error, binascii.Incomplete, UnicodeDecodeError):
pass pass
# resolve relative URIs # resolve relative URIs
if (element in self.can_be_relative_uri) and output: if (element in self.can_be_relative_uri) and output:
# do not resolve guid elements with isPermalink="false" # do not resolve guid elements with isPermalink="false"
if not element == 'id' or self.guidislink: if not element == "id" or self.guidislink:
output = self.resolve_uri(output) output = self.resolve_uri(output)
# decode entities within embedded markup # decode entities within embedded markup
if not self.contentparams.get('base64', 0): if not self.contentparams.get("base64", 0):
output = self.decode_entities(element, output) output = self.decode_entities(element, output)
# some feed formats require consumers to guess # some feed formats require consumers to guess
# whether the content is html or plain text # whether the content is html or plain text
if not self.version.startswith('atom') and self.contentparams.get('type') == 'text/plain': if (
not self.version.startswith("atom")
and self.contentparams.get("type") == "text/plain"
):
if self.looks_like_html(output): if self.looks_like_html(output):
self.contentparams['type'] = 'text/html' self.contentparams["type"] = "text/html"
# remove temporary cruft from contentparams # remove temporary cruft from contentparams
try: try:
del self.contentparams['mode'] del self.contentparams["mode"]
except KeyError: except KeyError:
pass pass
try: try:
del self.contentparams['base64'] del self.contentparams["base64"]
except KeyError: except KeyError:
pass pass
is_htmlish = self.map_content_type(self.contentparams.get('type', 'text/html')) in self.html_types is_htmlish = (
self.map_content_type(self.contentparams.get("type", "text/html"))
in self.html_types
)
# resolve relative URIs within embedded markup # resolve relative URIs within embedded markup
if is_htmlish and self.resolve_relative_uris: if is_htmlish and self.resolve_relative_uris:
if element in self.can_contain_relative_uris: if element in self.can_contain_relative_uris:
output = resolve_relative_uris(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html')) output = resolve_relative_uris(
output,
self.baseuri,
self.encoding,
self.contentparams.get("type", "text/html"),
)
# sanitize embedded markup # sanitize embedded markup
if is_htmlish and self.sanitize_html: if is_htmlish and self.sanitize_html:
if element in self.can_contain_dangerous_markup: if element in self.can_contain_dangerous_markup:
output = sanitize_html(output, self.encoding, self.contentparams.get('type', 'text/html')) output = sanitize_html(
output, self.encoding, self.contentparams.get("type", "text/html")
)
if self.encoding and isinstance(output, bytes): if self.encoding and isinstance(output, bytes):
output = output.decode(self.encoding, 'ignore') output = output.decode(self.encoding, "ignore")
# address common error where people take data that is already # address common error where people take data that is already
# utf-8, presume that it is iso-8859-1, and re-encode it. # utf-8, presume that it is iso-8859-1, and re-encode it.
if self.encoding in ('utf-8', 'utf-8_INVALID_PYTHON_3') and not isinstance(output, bytes): if self.encoding in ("utf-8", "utf-8_INVALID_PYTHON_3") and not isinstance(
output, bytes
):
try: try:
output = output.encode('iso-8859-1').decode('utf-8') output = output.encode("iso-8859-1").decode("utf-8")
except (UnicodeEncodeError, UnicodeDecodeError): except (UnicodeEncodeError, UnicodeDecodeError):
pass pass
@ -567,65 +604,74 @@ class XMLParserMixin(
# categories/tags/keywords/whatever are handled in _end_category or # categories/tags/keywords/whatever are handled in _end_category or
# _end_tags or _end_itunes_keywords # _end_tags or _end_itunes_keywords
if element in ('category', 'tags', 'itunes_keywords'): if element in ("category", "tags", "itunes_keywords"):
return output return output
if element == 'title' and -1 < self.title_depth <= self.depth: if element == "title" and -1 < self.title_depth <= self.depth:
return output return output
# store output in appropriate place(s) # store output in appropriate place(s)
if self.inentry and not self.insource: if self.inentry and not self.insource:
if element == 'content': if element == "content":
self.entries[-1].setdefault(element, []) self.entries[-1].setdefault(element, [])
contentparams = copy.deepcopy(self.contentparams) contentparams = copy.deepcopy(self.contentparams)
contentparams['value'] = output contentparams["value"] = output
self.entries[-1][element].append(contentparams) self.entries[-1][element].append(contentparams)
elif element == 'link': elif element == "link":
if not self.inimage: if not self.inimage:
# query variables in urls in link elements are improperly # query variables in urls in link elements are improperly
# converted from `?a=1&b=2` to `?a=1&b;=2` as if they're # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
# unhandled character references. fix this special case. # unhandled character references. fix this special case.
output = output.replace('&amp;', '&') output = output.replace("&amp;", "&")
output = re.sub("&([A-Za-z0-9_]+);", r"&\g<1>", output) output = re.sub("&([A-Za-z0-9_]+);", r"&\g<1>", output)
self.entries[-1][element] = output self.entries[-1][element] = output
if output: if output:
self.entries[-1]['links'][-1]['href'] = output self.entries[-1]["links"][-1]["href"] = output
else: else:
if element == 'description': if element == "description":
element = 'summary' element = "summary"
old_value_depth = self.property_depth_map.setdefault(self.entries[-1], {}).get(element) old_value_depth = self.property_depth_map.setdefault(
self.entries[-1], {}
).get(element)
if old_value_depth is None or self.depth <= old_value_depth: if old_value_depth is None or self.depth <= old_value_depth:
self.property_depth_map[self.entries[-1]][element] = self.depth self.property_depth_map[self.entries[-1]][element] = self.depth
self.entries[-1][element] = output self.entries[-1][element] = output
if self.incontent: if self.incontent:
contentparams = copy.deepcopy(self.contentparams) contentparams = copy.deepcopy(self.contentparams)
contentparams['value'] = output contentparams["value"] = output
self.entries[-1][element + '_detail'] = contentparams self.entries[-1][element + "_detail"] = contentparams
elif self.infeed or self.insource: # and (not self.intextinput) and (not self.inimage): elif (
self.infeed or self.insource
): # and (not self.intextinput) and (not self.inimage):
context = self._get_context() context = self._get_context()
if element == 'description': if element == "description":
element = 'subtitle' element = "subtitle"
context[element] = output context[element] = output
if element == 'link': if element == "link":
# fix query variables; see above for the explanation # fix query variables; see above for the explanation
output = re.sub("&([A-Za-z0-9_]+);", r"&\g<1>", output) output = re.sub("&([A-Za-z0-9_]+);", r"&\g<1>", output)
context[element] = output context[element] = output
context['links'][-1]['href'] = output context["links"][-1]["href"] = output
elif self.incontent: elif self.incontent:
contentparams = copy.deepcopy(self.contentparams) contentparams = copy.deepcopy(self.contentparams)
contentparams['value'] = output contentparams["value"] = output
context[element + '_detail'] = contentparams context[element + "_detail"] = contentparams
return output return output
def push_content(self, tag, attrs_d, default_content_type, expecting_text): def push_content(self, tag, attrs_d, default_content_type, expecting_text):
self.incontent += 1 self.incontent += 1
if self.lang: if self.lang:
self.lang = self.lang.replace('_', '-') self.lang = self.lang.replace("_", "-")
self.contentparams = FeedParserDict({ self.contentparams = FeedParserDict(
'type': self.map_content_type(attrs_d.get('type', default_content_type)), {
'language': self.lang, "type": self.map_content_type(
'base': self.baseuri}) attrs_d.get("type", default_content_type)
self.contentparams['base64'] = self._is_base64(attrs_d, self.contentparams) ),
"language": self.lang,
"base": self.baseuri,
}
)
self.contentparams["base64"] = self._is_base64(attrs_d, self.contentparams)
self.push(tag, expecting_text) self.push(tag, expecting_text)
def pop_content(self, tag): def pop_content(self, tag):
@ -646,55 +692,61 @@ class XMLParserMixin(
""" """
# must have a close tag or an entity reference to qualify # must have a close tag or an entity reference to qualify
if not (re.search(r'</(\w+)>', s) or re.search(r'&#?\w+;', s)): if not (re.search(r"</(\w+)>", s) or re.search(r"&#?\w+;", s)):
return False return False
# all tags must be in a restricted subset of valid HTML tags # all tags must be in a restricted subset of valid HTML tags
if any((t for t in re.findall(r'</?(\w+)', s) if t.lower() not in HTMLSanitizer.acceptable_elements)): if any(
t
for t in re.findall(r"</?(\w+)", s)
if t.lower() not in HTMLSanitizer.acceptable_elements
):
return False return False
# all entities must have been defined as valid HTML entities # all entities must have been defined as valid HTML entities
if any((e for e in re.findall(r'&(\w+);', s) if e not in html.entities.entitydefs)): if any(
e for e in re.findall(r"&(\w+);", s) if e not in html.entities.entitydefs
):
return False return False
return True return True
def _map_to_standard_prefix(self, name):
    """Return *name* with its namespace prefix translated via ``self.namespacemap``.

    Only the text before the first ``:`` is treated as the prefix. Names
    without a colon, and prefixes missing from the map, are returned
    unchanged.
    """
    prefix, colon, suffix = name.partition(":")
    if not colon:
        return name
    return self.namespacemap.get(prefix, prefix) + ":" + suffix
def _get_attribute(self, attrs_d, name):
    """Look up *name* in *attrs_d* after normalizing its namespace prefix.

    Returns ``None`` when the normalized name is not present.
    """
    return attrs_d.get(self._map_to_standard_prefix(name))
def _is_base64(self, attrs_d, contentparams):
    """Return 1 if the current content should be treated as base64, else 0.

    An explicit ``mode="base64"`` attribute always wins. Otherwise textual
    and XML media types (``text/*``, ``*+xml``, ``*/xml``) are not base64
    and every other type is.
    """
    if attrs_d.get("mode", "") == "base64":
        return 1
    # NOTE(review): the ``contentparams`` argument is unused; the type is
    # read from ``self.contentparams``. Kept as-is for existing callers.
    content_type = self.contentparams["type"]
    if content_type.startswith("text/") or content_type.endswith(("+xml", "/xml")):
        return 0
    return 1
@staticmethod
def _enforce_href(attrs_d):
    """Collapse ``url``/``uri``/``href`` attributes into a single ``href``.

    The first of ``url``, ``uri``, ``href`` that is present supplies the
    value. When that value is truthy, ``url`` and ``uri`` are removed and
    ``href`` is set. *attrs_d* is modified in place and also returned.
    """
    href = attrs_d.get("url", attrs_d.get("uri", attrs_d.get("href", None)))
    if href:
        attrs_d.pop("url", None)
        attrs_d.pop("uri", None)
        attrs_d["href"] = href
    return attrs_d
def _save(self, key, value, overwrite=False): def _save(self, key, value, overwrite=False):
@ -707,37 +759,37 @@ class XMLParserMixin(
def _get_context(self):
    """Return the dict that parsed values should currently be stored in.

    Checked in order: source data, the feed's image (only if one has been
    created), the feed's textinput, the most recent entry, and finally the
    feed-level data.
    """
    if self.insource:
        return self.sourcedata
    if self.inimage and "image" in self.feeddata:
        return self.feeddata["image"]
    if self.intextinput:
        return self.feeddata["textinput"]
    if self.inentry:
        return self.entries[-1]
    return self.feeddata
def _save_author(self, key, value, prefix="author"):
    """Record one field (*key* = *value*) of the current author-like person.

    The value is stored in ``<prefix>_detail`` of the current context and in
    the last entry of the context's ``authors`` list. Both containers are
    created when missing.
    """
    context = self._get_context()
    context.setdefault(prefix + "_detail", FeedParserDict())
    context[prefix + "_detail"][key] = value
    self._sync_author_detail()
    context.setdefault("authors", [FeedParserDict()])
    context["authors"][-1][key] = value
def _save_contributor(self, key, value):
    """Store *key* = *value* on the last contributor of the current context.

    The ``contributors`` list is created with one empty entry when missing.
    """
    context = self._get_context()
    context.setdefault("contributors", [FeedParserDict()])
    context["contributors"][-1][key] = value
def _sync_author_detail(self, key='author'): def _sync_author_detail(self, key="author"):
context = self._get_context() context = self._get_context()
detail = context.get('%ss' % key, [FeedParserDict()])[-1] detail = context.get("%ss" % key, [FeedParserDict()])[-1]
if detail: if detail:
name = detail.get('name') name = detail.get("name")
email = detail.get('email') email = detail.get("email")
if name and email: if name and email:
context[key] = '%s (%s)' % (name, email) context[key] = f"{name} ({email})"
elif name: elif name:
context[key] = name context[key] = name
elif email: elif email:
@ -746,31 +798,31 @@ class XMLParserMixin(
author, email = context.get(key), None author, email = context.get(key), None
if not author: if not author:
return return
emailmatch = re.search(r"(([a-zA-Z0-9_.+-]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(]?))(\?subject=\S+)?", author) emailmatch = email_pattern.search(author)
if emailmatch: if emailmatch:
email = emailmatch.group(0) email = emailmatch.group(0)
# probably a better way to do the following, but it passes # probably a better way to do the following, but it passes
# all the tests # all the tests
author = author.replace(email, '') author = author.replace(email, "")
author = author.replace('()', '') author = author.replace("()", "")
author = author.replace('<>', '') author = author.replace("<>", "")
author = author.replace('&lt;&gt;', '') author = author.replace("&lt;&gt;", "")
author = author.strip() author = author.strip()
if author and (author[0] == '('): if author and (author[0] == "("):
author = author[1:] author = author[1:]
if author and (author[-1] == ')'): if author and (author[-1] == ")"):
author = author[:-1] author = author[:-1]
author = author.strip() author = author.strip()
if author or email: if author or email:
context.setdefault('%s_detail' % key, detail) context.setdefault("%s_detail" % key, detail)
if author: if author:
detail['name'] = author detail["name"] = author
if email: if email:
detail['email'] = email detail["email"] = email
def _add_tag(self, term, scheme, label): def _add_tag(self, term, scheme, label):
context = self._get_context() context = self._get_context()
tags = context.setdefault('tags', []) tags = context.setdefault("tags", [])
if (not term) and (not scheme) and (not label): if (not term) and (not scheme) and (not label):
return return
value = FeedParserDict(term=term, scheme=scheme, label=label) value = FeedParserDict(term=term, scheme=scheme, label=label)
@ -781,8 +833,8 @@ class XMLParserMixin(
# This is a completely-made up element. Its semantics are determined # This is a completely-made up element. Its semantics are determined
# only by a single feed that precipitated bug report 392 on Google Code. # only by a single feed that precipitated bug report 392 on Google Code.
# In short, this is junk code. # In short, this is junk code.
self.push('tags', 1) self.push("tags", 1)
def _end_tags(self):
    """Split the popped ``tags`` text on commas and add each piece as a tag."""
    for term in self.pop("tags").split(","):
        self._add_tag(term.strip(), None, None)

View file

@ -1,5 +1,5 @@
# Support for the Atom, RSS, RDF, and CDF feed formats # Support for the Atom, RSS, RDF, and CDF feed formats
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -33,7 +33,7 @@ from ..urls import make_safe_absolute_uri
from ..util import FeedParserDict from ..util import FeedParserDict
class Namespace(object): class Namespace:
"""Support for the Atom, RSS, RDF, and CDF feed formats. """Support for the Atom, RSS, RDF, and CDF feed formats.
The feed formats all share common elements, some of which have conflicting The feed formats all share common elements, some of which have conflicting
@ -42,452 +42,490 @@ class Namespace(object):
""" """
supported_namespaces = { supported_namespaces = {
'': '', "": "",
'http://backend.userland.com/rss': '', "http://backend.userland.com/rss": "",
'http://blogs.law.harvard.edu/tech/rss': '', "http://blogs.law.harvard.edu/tech/rss": "",
'http://purl.org/rss/1.0/': '', "http://purl.org/rss/1.0/": "",
'http://my.netscape.com/rdf/simple/0.9/': '', "http://my.netscape.com/rdf/simple/0.9/": "",
'http://example.com/newformat#': '', "http://example.com/newformat#": "",
'http://example.com/necho': '', "http://example.com/necho": "",
'http://purl.org/echo/': '', "http://purl.org/echo/": "",
'uri/of/echo/namespace#': '', "uri/of/echo/namespace#": "",
'http://purl.org/pie/': '', "http://purl.org/pie/": "",
'http://purl.org/atom/ns#': '', "http://purl.org/atom/ns#": "",
'http://www.w3.org/2005/Atom': '', "http://www.w3.org/2005/Atom": "",
'http://purl.org/rss/1.0/modules/rss091#': '', "http://purl.org/rss/1.0/modules/rss091#": "",
} }
def _start_rss(self, attrs_d):
    """Make sure ``self.version`` names an RSS flavour.

    If we're here then this is an RSS feed. A version that already starts
    with ``rss`` is kept. A missing version, or one that starts with
    something else, is a mistake and is corrected from the ``version``
    attribute.
    """
    if self.version and self.version.startswith("rss"):
        return
    versionmap = {
        "0.91": "rss091u",
        "0.92": "rss092",
        "0.93": "rss093",
        "0.94": "rss094",
    }
    attr_version = attrs_d.get("version", "")
    if attr_version in versionmap:
        self.version = versionmap[attr_version]
    elif attr_version.startswith("2."):
        self.version = "rss20"
    else:
        self.version = "rss"
def _start_channel(self, attrs_d):
    """Enter feed scope and apply the attribute handling in ``_cdf_common``."""
    self.infeed = 1
    self._cdf_common(attrs_d)
def _cdf_common(self, attrs_d):
    """Turn ``lastmod`` and ``href`` attributes into modified/link values.

    For each attribute present, the matching start handler is called with no
    attributes, the attribute value is written into the last slot of the
    element on top of ``self.elementstack``, and the end handler is called.
    """
    if "lastmod" in attrs_d:
        self._start_modified({})
        self.elementstack[-1][-1] = attrs_d["lastmod"]
        self._end_modified()
    if "href" in attrs_d:
        self._start_link({})
        self.elementstack[-1][-1] = attrs_d["href"]
        self._end_link()
def _start_feed(self, attrs_d):
    """Enter feed scope and, if no version is known yet, record the Atom version.

    Versions 0.1, 0.2 and 0.3 map to ``atom01``/``atom02``/``atom03``;
    anything else (including a missing attribute) becomes ``atom``.
    """
    self.infeed = 1
    if self.version:
        return
    versionmap = {"0.1": "atom01", "0.2": "atom02", "0.3": "atom03"}
    self.version = versionmap.get(attrs_d.get("version"), "atom")
def _end_channel(self):
    """Leave feed scope."""
    self.infeed = 0


_end_feed = _end_channel
def _start_image(self, attrs_d):
    """Enter image scope; outside an entry, make sure the context has an ``image`` dict."""
    context = self._get_context()
    if not self.inentry:
        context.setdefault("image", FeedParserDict())
    self.inimage = 1
    self.title_depth = -1
    self.push("image", 0)


def _end_image(self):
    """Pop the ``image`` element and leave image scope."""
    self.pop("image")
    self.inimage = 0
def _start_textinput(self, attrs_d):
    """Enter textinput scope, creating the context's ``textinput`` dict if needed."""
    context = self._get_context()
    context.setdefault("textinput", FeedParserDict())
    self.intextinput = 1
    self.title_depth = -1
    self.push("textinput", 0)


_start_textInput = _start_textinput


def _end_textinput(self):
    """Pop the ``textinput`` element and leave textinput scope."""
    self.pop("textinput")
    self.intextinput = 0


_end_textInput = _end_textinput
def _start_author(self, attrs_d):
    """Enter author scope and append a fresh entry to the context's ``authors``."""
    self.inauthor = 1
    self.push("author", 1)
    # Append a new FeedParserDict when expecting an author
    context = self._get_context()
    context.setdefault("authors", [])
    context["authors"].append(FeedParserDict())


_start_managingeditor = _start_author


def _end_author(self):
    """Pop the ``author`` element, leave author scope and sync the author detail."""
    self.pop("author")
    self.inauthor = 0
    self._sync_author_detail()


_end_managingeditor = _end_author
def _start_contributor(self, attrs_d):
    """Enter contributor scope and append a fresh entry to ``contributors``."""
    self.incontributor = 1
    context = self._get_context()
    context.setdefault("contributors", [])
    context["contributors"].append(FeedParserDict())
    self.push("contributor", 0)


def _end_contributor(self):
    """Pop the ``contributor`` element and leave contributor scope."""
    self.pop("contributor")
    self.incontributor = 0
def _start_name(self, attrs_d):
    """Begin a ``name`` element."""
    self.push("name", 0)


def _end_name(self):
    """Store the popped name on whichever person or textinput is being parsed.

    Precedence: publisher, author, contributor, textinput. Outside all of
    these scopes the value is discarded.
    """
    value = self.pop("name")
    if self.inpublisher:
        self._save_author("name", value, "publisher")
    elif self.inauthor:
        self._save_author("name", value)
    elif self.incontributor:
        self._save_contributor("name", value)
    elif self.intextinput:
        self._get_context()["name"] = value
def _start_width(self, attrs_d):
    """Begin a ``width`` element."""
    self.push("width", 0)


def _end_width(self):
    """Store the popped width as an int on the current context when in an image.

    Values that cannot be converted to ``int`` are stored as 0.
    """
    value = self.pop("width")
    try:
        value = int(value)
    except (TypeError, ValueError):
        # TypeError covers a non-string result from pop() (e.g. None).
        value = 0
    if self.inimage:
        self._get_context()["width"] = value
def _start_height(self, attrs_d):
    """Begin a ``height`` element."""
    self.push("height", 0)


def _end_height(self):
    """Store the popped height as an int on the current context when in an image.

    Values that cannot be converted to ``int`` are stored as 0.
    """
    value = self.pop("height")
    try:
        value = int(value)
    except (TypeError, ValueError):
        # TypeError covers a non-string result from pop() (e.g. None).
        value = 0
    if self.inimage:
        self._get_context()["height"] = value
def _start_url(self, attrs_d):
    """Begin a URL-valued element; its text is collected under ``href``."""
    self.push("href", 1)


_start_homepage = _start_url
_start_uri = _start_url


def _end_url(self):
    """Store the popped URL on the current author or contributor, if any."""
    value = self.pop("href")
    if self.inauthor:
        self._save_author("href", value)
    elif self.incontributor:
        self._save_contributor("href", value)


_end_homepage = _end_url
_end_uri = _end_url
def _start_email(self, attrs_d):
    """Begin an ``email`` element."""
    self.push("email", 0)


def _end_email(self):
    """Store the popped email on the current publisher, author or contributor.

    Precedence: publisher, author, contributor. Outside these scopes the
    value is discarded.
    """
    value = self.pop("email")
    if self.inpublisher:
        self._save_author("email", value, "publisher")
    elif self.inauthor:
        self._save_author("email", value)
    elif self.incontributor:
        self._save_contributor("email", value)
def _start_subtitle(self, attrs_d):
    """Begin a ``subtitle`` content element (default type ``text/plain``)."""
    self.push_content("subtitle", attrs_d, "text/plain", 1)


_start_tagline = _start_subtitle


def _end_subtitle(self):
    """Finish the ``subtitle`` content element."""
    self.pop_content("subtitle")


_end_tagline = _end_subtitle
def _start_rights(self, attrs_d):
    """Begin a ``rights`` content element (default type ``text/plain``)."""
    self.push_content("rights", attrs_d, "text/plain", 1)


_start_copyright = _start_rights


def _end_rights(self):
    """Finish the ``rights`` content element."""
    self.pop_content("rights")


_end_copyright = _end_rights
def _start_item(self, attrs_d):
    """Open a new entry and reset the per-entry parsing state.

    An ``rdf:about`` attribute, when present, becomes the entry's ``id``.
    """
    self.entries.append(FeedParserDict())
    self.push("item", 0)
    self.inentry = 1
    self.guidislink = 0
    self.title_depth = -1
    entry_id = self._get_attribute(attrs_d, "rdf:about")
    if entry_id:
        self._get_context()["id"] = entry_id
    self._cdf_common(attrs_d)


_start_entry = _start_item


def _end_item(self):
    """Close the current entry and clear the entry-scoped flags."""
    self.pop("item")
    self.inentry = 0
    self.hasContent = 0


_end_entry = _end_item
def _start_language(self, attrs_d):
    """Begin a ``language`` element."""
    self.push("language", 1)


def _end_language(self):
    """Remember the popped language value as ``self.lang``."""
    self.lang = self.pop("language")
def _start_webmaster(self, attrs_d):
    """Begin a webmaster element; its text is collected under ``publisher``."""
    self.push("publisher", 1)


def _end_webmaster(self):
    """Pop the ``publisher`` value and sync the publisher detail."""
    self.pop("publisher")
    self._sync_author_detail("publisher")
def _start_published(self, attrs_d):
    """Begin a ``published`` date element."""
    self.push("published", 1)


_start_issued = _start_published
_start_pubdate = _start_published


def _end_published(self):
    """Save the parsed form of the popped date as ``published_parsed``."""
    value = self.pop("published")
    self._save("published_parsed", _parse_date(value), overwrite=True)


_end_issued = _end_published
_end_pubdate = _end_published
def _start_updated(self, attrs_d):
    """Begin an ``updated`` date element."""
    self.push("updated", 1)


_start_modified = _start_updated
_start_lastbuilddate = _start_updated


def _end_updated(self):
    """Save the parsed form of the popped date as ``updated_parsed``."""
    value = self.pop("updated")
    self._save("updated_parsed", _parse_date(value), overwrite=True)


_end_modified = _end_updated
_end_lastbuilddate = _end_updated
def _start_created(self, attrs_d):
    """Begin a ``created`` date element."""
    self.push("created", 1)


def _end_created(self):
    """Save the parsed form of the popped date as ``created_parsed``."""
    value = self.pop("created")
    self._save("created_parsed", _parse_date(value), overwrite=True)
def _start_expirationdate(self, attrs_d):
    """Begin an expiration date element; its text is collected under ``expired``."""
    self.push("expired", 1)


def _end_expirationdate(self):
    """Save the parsed form of the popped date as ``expired_parsed``."""
    value = self.pop("expired")
    self._save("expired_parsed", _parse_date(value), overwrite=True)
def _start_category(self, attrs_d):
    """Add a tag built from the element's attributes and begin collecting its text.

    ``scheme`` falls back to the ``domain`` attribute.
    """
    term = attrs_d.get("term")
    scheme = attrs_d.get("scheme", attrs_d.get("domain"))
    label = attrs_d.get("label")
    self._add_tag(term, scheme, label)
    self.push("category", 1)


_start_keywords = _start_category


def _end_category(self):
    """Use the element text as the term of the last tag, or as a new tag.

    Empty text is ignored. If the most recent tag has no term yet, the text
    fills it in; otherwise a new tag with only a term is added.
    """
    value = self.pop("category")
    if not value:
        return
    tags = self._get_context()["tags"]
    if tags and not tags[-1]["term"]:
        tags[-1]["term"] = value
    else:
        self._add_tag(value, None, None)


_end_keywords = _end_category
def _start_cloud(self, attrs_d):
    """Store a copy of the element's attributes as ``cloud`` on the current context."""
    self._get_context()["cloud"] = FeedParserDict(attrs_d)
def _start_link(self, attrs_d):
    """Record a link element on the current context.

    Fills in default ``rel`` and ``type`` attributes on *attrs_d* (in
    place), normalizes and resolves its ``href``, and appends the link to
    the context's ``links`` list unless parsing an image inside an entry.
    A link with an ``href`` sets ``context["link"]`` when it is an
    ``alternate`` link of an HTML type. A link without one pushes ``link``
    so that its value can be collected.
    """
    attrs_d.setdefault("rel", "alternate")
    if attrs_d["rel"] == "self":
        attrs_d.setdefault("type", "application/atom+xml")
    else:
        attrs_d.setdefault("type", "text/html")
    context = self._get_context()
    attrs_d = self._enforce_href(attrs_d)
    if "href" in attrs_d:
        attrs_d["href"] = self.resolve_uri(attrs_d["href"])
    expecting_text = self.infeed or self.inentry or self.insource
    context.setdefault("links", [])
    if not (self.inentry and self.inimage):
        context["links"].append(FeedParserDict(attrs_d))
    if "href" in attrs_d:
        if (
            attrs_d.get("rel") == "alternate"
            and self.map_content_type(attrs_d.get("type")) in self.html_types
        ):
            context["link"] = attrs_d["href"]
    else:
        self.push("link", expecting_text)


def _end_link(self):
    """Pop the ``link`` element."""
    self.pop("link")
def _start_guid(self, attrs_d):
    """Begin an id element and note whether it may double as the link.

    ``ispermalink`` counts as true when absent or exactly ``"true"``.
    """
    self.guidislink = attrs_d.get("ispermalink", "true") == "true"
    self.push("id", 1)


_start_id = _start_guid


def _end_guid(self):
    """Save the ``guidislink`` flag and, when allowed, the id as the link."""
    value = self.pop("id")
    self._save("guidislink", self.guidislink and "link" not in self._get_context())
    if self.guidislink:
        # guid acts as link, but only if 'ispermalink' is not present or is 'true',
        # and only if the item doesn't already have a link element
        self._save("link", value)


_end_id = _end_guid
def _start_title(self, attrs_d):
    """Begin a ``title`` content element.

    While ``self.svgOK`` is set the tag is passed to ``unknown_starttag``
    instead of being treated as a feed or entry title.
    """
    if self.svgOK:
        return self.unknown_starttag("title", list(attrs_d.items()))
    expecting_text = self.infeed or self.inentry or self.insource
    self.push_content("title", attrs_d, "text/plain", expecting_text)


def _end_title(self):
    """Finish a ``title`` element and remember the depth of a non-empty title."""
    if self.svgOK:
        return
    value = self.pop_content("title")
    if not value:
        return
    self.title_depth = self.depth
def _start_description(self, attrs_d):
    """Begin a ``description`` element.

    If the context already has a summary and no content has been seen, the
    element is handled as content instead; ``_summaryKey`` records that
    choice for ``_end_description``.
    """
    context = self._get_context()
    if "summary" in context and not self.hasContent:
        self._summaryKey = "content"
        self._start_content(attrs_d)
    else:
        expecting_text = self.infeed or self.inentry or self.insource
        self.push_content("description", attrs_d, "text/html", expecting_text)


def _start_abstract(self, attrs_d):
    """Begin an abstract; collected as ``description`` with type ``text/plain``."""
    expecting_text = self.infeed or self.inentry or self.insource
    self.push_content("description", attrs_d, "text/plain", expecting_text)


def _end_description(self):
    """Finish a description, routing to content when it was started as content."""
    if self._summaryKey == "content":
        self._end_content()
    else:
        self.pop_content("description")
    self._summaryKey = None


_end_abstract = _end_description
def _start_info(self, attrs_d):
    """Begin an ``info`` content element (default type ``text/plain``)."""
    self.push_content("info", attrs_d, "text/plain", 1)


_start_feedburner_browserfriendly = _start_info


def _end_info(self):
    """Finish the ``info`` content element."""
    self.pop_content("info")


_end_feedburner_browserfriendly = _end_info
def _start_generator(self, attrs_d):
    """Store the generator's attributes as ``generator_detail`` and collect its text.

    Non-empty attributes have their URL attribute normalized to ``href`` and
    resolved before being stored.
    """
    if attrs_d:
        attrs_d = self._enforce_href(attrs_d)
        if "href" in attrs_d:
            attrs_d["href"] = self.resolve_uri(attrs_d["href"])
    self._get_context()["generator_detail"] = FeedParserDict(attrs_d)
    self.push("generator", 1)


def _end_generator(self):
    """Copy the popped generator text into ``generator_detail["name"]`` if it exists."""
    value = self.pop("generator")
    context = self._get_context()
    if "generator_detail" in context:
        context["generator_detail"]["name"] = value
def _start_summary(self, attrs_d):
    """Begin a ``summary`` element.

    If the context already has a summary and no content has been seen, the
    element is handled as content; ``_summaryKey`` records which path was
    taken for ``_end_summary``.
    """
    context = self._get_context()
    if "summary" in context and not self.hasContent:
        self._summaryKey = "content"
        self._start_content(attrs_d)
    else:
        self._summaryKey = "summary"
        self.push_content(self._summaryKey, attrs_d, "text/plain", 1)


def _end_summary(self):
    """Finish a summary, routing to content when it was started as content."""
    if self._summaryKey == "content":
        self._end_content()
    else:
        self.pop_content(self._summaryKey or "summary")
    self._summaryKey = None
def _start_enclosure(self, attrs_d):
    """Append the enclosure to the context's ``links`` with ``rel="enclosure"``."""
    attrs_d = self._enforce_href(attrs_d)
    context = self._get_context()
    attrs_d["rel"] = "enclosure"
    context.setdefault("links", []).append(FeedParserDict(attrs_d))
def _start_source(self, attrs_d):
    """Enter source scope and begin collecting the source's text."""
    if "url" in attrs_d:
        # This means that we're processing a source element from an RSS 2.0 feed
        self.sourcedata["href"] = attrs_d["url"]
    self.push("source", 1)
    self.insource = 1
    self.title_depth = -1


def _end_source(self):
    """Leave source scope and attach a copy of the source data to the context.

    ``insource`` is cleared first, so ``_get_context()`` no longer returns
    the source data itself. Non-empty element text becomes the source's
    ``title``. ``self.sourcedata`` is emptied afterwards.
    """
    self.insource = 0
    value = self.pop("source")
    if value:
        self.sourcedata["title"] = value
    self._get_context()["source"] = copy.deepcopy(self.sourcedata)
    self.sourcedata.clear()
def _start_content(self, attrs_d):
    """Begin a ``content`` element (default type ``text/plain``).

    A non-empty ``src`` attribute is recorded in ``self.contentparams``.
    """
    self.hasContent = 1
    self.push_content("content", attrs_d, "text/plain", 1)
    src = attrs_d.get("src")
    if src:
        self.contentparams["src"] = src
    self.push("content", 1)


def _start_body(self, attrs_d):
    """Begin a body element, collected as ``content`` of type ``application/xhtml+xml``."""
    self.push_content("content", attrs_d, "application/xhtml+xml", 1)


_start_xhtml_body = _start_body


def _start_content_encoded(self, attrs_d):
    """Begin encoded content, collected as ``content`` of type ``text/html``."""
    self.hasContent = 1
    self.push_content("content", attrs_d, "text/html", 1)


_start_fullitem = _start_content_encoded
def _end_content(self):
    """Finish a ``content`` element and, for text or HTML types, save it as the summary.

    The type check uses ``self.contentparams`` as it is before
    ``pop_content`` runs.
    """
    content_type = self.map_content_type(self.contentparams.get("type"))
    copy_to_summary = content_type in ({"text/plain"} | self.html_types)
    value = self.pop_content("content")
    if copy_to_summary:
        self._save("summary", value)


_end_body = _end_content
_end_xhtml_body = _end_content
@ -495,12 +533,12 @@ class Namespace(object):
_end_fullitem = _end_content _end_fullitem = _end_content
def _start_newlocation(self, attrs_d):
    """Begin a ``newlocation`` element."""
    self.push("newlocation", 1)


def _end_newlocation(self):
    """Store the popped URL, made absolute against the base URI, on the feed.

    Nothing is stored unless the current context is the feed-level data.
    """
    url = self.pop("newlocation")
    context = self._get_context()
    # don't set newlocation if the context isn't right
    if context is not self.feeddata:
        return
    context["newlocation"] = make_safe_absolute_uri(self.baseuri, url.strip())

View file

@ -1,5 +1,5 @@
# Support for the administrative elements extension # Support for the administrative elements extension
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -29,25 +29,25 @@
from ..util import FeedParserDict from ..util import FeedParserDict
class Namespace(object): class Namespace:
# RDF Site Summary 1.0 Modules: Administrative # RDF Site Summary 1.0 Modules: Administrative
# http://web.resource.org/rss/1.0/modules/admin/ # http://web.resource.org/rss/1.0/modules/admin/
supported_namespaces = { supported_namespaces = {
'http://webns.net/mvcb/': 'admin', "http://webns.net/mvcb/": "admin",
} }
def _start_admin_generatoragent(self, attrs_d): def _start_admin_generatoragent(self, attrs_d):
self.push('generator', 1) self.push("generator", 1)
value = self._get_attribute(attrs_d, 'rdf:resource') value = self._get_attribute(attrs_d, "rdf:resource")
if value: if value:
self.elementstack[-1][2].append(value) self.elementstack[-1][2].append(value)
self.pop('generator') self.pop("generator")
self._get_context()['generator_detail'] = FeedParserDict({'href': value}) self._get_context()["generator_detail"] = FeedParserDict({"href": value})
def _start_admin_errorreportsto(self, attrs_d): def _start_admin_errorreportsto(self, attrs_d):
self.push('errorreportsto', 1) self.push("errorreportsto", 1)
value = self._get_attribute(attrs_d, 'rdf:resource') value = self._get_attribute(attrs_d, "rdf:resource")
if value: if value:
self.elementstack[-1][2].append(value) self.elementstack[-1][2].append(value)
self.pop('errorreportsto') self.pop("errorreportsto")

View file

@ -1,5 +1,5 @@
# Support for the Creative Commons licensing extensions # Support for the Creative Commons licensing extensions
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -29,41 +29,42 @@
from ..util import FeedParserDict from ..util import FeedParserDict
class Namespace(object): class Namespace:
supported_namespaces = { supported_namespaces = {
# RDF-based namespace # RDF-based namespace
'http://creativecommons.org/ns#license': 'cc', "http://creativecommons.org/ns#license": "cc",
# Old RDF-based namespace # Old RDF-based namespace
'http://web.resource.org/cc/': 'cc', "http://web.resource.org/cc/": "cc",
# RSS-based namespace # RSS-based namespace
'http://cyber.law.harvard.edu/rss/creativeCommonsRssModule.html': 'creativecommons', "http://cyber.law.harvard.edu/rss/creativeCommonsRssModule.html": (
"creativecommons"
),
# Old RSS-based namespace # Old RSS-based namespace
'http://backend.userland.com/creativeCommonsRssModule': 'creativecommons', "http://backend.userland.com/creativeCommonsRssModule": "creativecommons",
} }
def _start_cc_license(self, attrs_d): def _start_cc_license(self, attrs_d):
context = self._get_context() context = self._get_context()
value = self._get_attribute(attrs_d, 'rdf:resource') value = self._get_attribute(attrs_d, "rdf:resource")
attrs_d = FeedParserDict() attrs_d = FeedParserDict()
attrs_d['rel'] = 'license' attrs_d["rel"] = "license"
if value: if value:
attrs_d['href'] = value attrs_d["href"] = value
context.setdefault('links', []).append(attrs_d) context.setdefault("links", []).append(attrs_d)
def _start_creativecommons_license(self, attrs_d): def _start_creativecommons_license(self, attrs_d):
self.push('license', 1) self.push("license", 1)
_start_creativeCommons_license = _start_creativecommons_license _start_creativeCommons_license = _start_creativecommons_license
def _end_creativecommons_license(self): def _end_creativecommons_license(self):
value = self.pop('license') value = self.pop("license")
context = self._get_context() context = self._get_context()
attrs_d = FeedParserDict() attrs_d = FeedParserDict()
attrs_d['rel'] = 'license' attrs_d["rel"] = "license"
if value: if value:
attrs_d['href'] = value attrs_d["href"] = value
context.setdefault('links', []).append(attrs_d) context.setdefault("links", []).append(attrs_d)
del context['license'] del context["license"]
_end_creativeCommons_license = _end_creativecommons_license _end_creativeCommons_license = _end_creativecommons_license

View file

@ -1,5 +1,5 @@
# Support for the Dublin Core metadata extensions # Support for the Dublin Core metadata extensions
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -30,10 +30,10 @@ from ..datetimes import _parse_date
from ..util import FeedParserDict from ..util import FeedParserDict
class Namespace(object): class Namespace:
supported_namespaces = { supported_namespaces = {
'http://purl.org/dc/elements/1.1/': 'dc', "http://purl.org/dc/elements/1.1/": "dc",
'http://purl.org/dc/terms/': 'dcterms', "http://purl.org/dc/terms/": "dcterms",
} }
def _end_dc_author(self): def _end_dc_author(self):
@ -109,25 +109,29 @@ class Namespace(object):
self._start_updated(attrs_d) self._start_updated(attrs_d)
def _start_dcterms_valid(self, attrs_d): def _start_dcterms_valid(self, attrs_d):
self.push('validity', 1) self.push("validity", 1)
def _end_dcterms_valid(self): def _end_dcterms_valid(self):
for validity_detail in self.pop('validity').split(';'): for validity_detail in self.pop("validity").split(";"):
if '=' in validity_detail: if "=" in validity_detail:
key, value = validity_detail.split('=', 1) key, value = validity_detail.split("=", 1)
if key == 'start': if key == "start":
self._save('validity_start', value, overwrite=True) self._save("validity_start", value, overwrite=True)
self._save('validity_start_parsed', _parse_date(value), overwrite=True) self._save(
elif key == 'end': "validity_start_parsed", _parse_date(value), overwrite=True
self._save('validity_end', value, overwrite=True) )
self._save('validity_end_parsed', _parse_date(value), overwrite=True) elif key == "end":
self._save("validity_end", value, overwrite=True)
self._save(
"validity_end_parsed", _parse_date(value), overwrite=True
)
def _start_dc_contributor(self, attrs_d): def _start_dc_contributor(self, attrs_d):
self.incontributor = 1 self.incontributor = 1
context = self._get_context() context = self._get_context()
context.setdefault('contributors', []) context.setdefault("contributors", [])
context['contributors'].append(FeedParserDict()) context["contributors"].append(FeedParserDict())
self.push('name', 0) self.push("name", 0)
def _end_dc_contributor(self): def _end_dc_contributor(self):
self._end_name() self._end_name()

View file

@ -1,5 +1,5 @@
# Support for the GeoRSS format # Support for the GeoRSS format
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -26,27 +26,24 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE. # POSSIBILITY OF SUCH DAMAGE.
# Required for Python 3.6 compatibility.
from __future__ import generator_stop
from ..util import FeedParserDict from ..util import FeedParserDict
class Namespace(object): class Namespace:
supported_namespaces = { supported_namespaces = {
'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo', "http://www.w3.org/2003/01/geo/wgs84_pos#": "geo",
'http://www.georss.org/georss': 'georss', "http://www.georss.org/georss": "georss",
'http://www.opengis.net/gml': 'gml', "http://www.opengis.net/gml": "gml",
} }
def __init__(self): def __init__(self):
self.ingeometry = 0 self.ingeometry = 0
super(Namespace, self).__init__() super().__init__()
def _start_georssgeom(self, attrs_d): def _start_georssgeom(self, attrs_d):
self.push('geometry', 0) self.push("geometry", 0)
context = self._get_context() context = self._get_context()
context['where'] = FeedParserDict() context["where"] = FeedParserDict()
_start_georss_point = _start_georssgeom _start_georss_point = _start_georssgeom
_start_georss_line = _start_georssgeom _start_georss_line = _start_georssgeom
@ -55,76 +52,77 @@ class Namespace(object):
def _save_where(self, geometry): def _save_where(self, geometry):
context = self._get_context() context = self._get_context()
context['where'].update(geometry) context["where"].update(geometry)
def _end_georss_point(self): def _end_georss_point(self):
geometry = _parse_georss_point(self.pop('geometry')) geometry = _parse_georss_point(self.pop("geometry"))
if geometry: if geometry:
self._save_where(geometry) self._save_where(geometry)
def _end_georss_line(self): def _end_georss_line(self):
geometry = _parse_georss_line(self.pop('geometry')) geometry = _parse_georss_line(self.pop("geometry"))
if geometry: if geometry:
self._save_where(geometry) self._save_where(geometry)
def _end_georss_polygon(self): def _end_georss_polygon(self):
this = self.pop('geometry') this = self.pop("geometry")
geometry = _parse_georss_polygon(this) geometry = _parse_georss_polygon(this)
if geometry: if geometry:
self._save_where(geometry) self._save_where(geometry)
def _end_georss_box(self): def _end_georss_box(self):
geometry = _parse_georss_box(self.pop('geometry')) geometry = _parse_georss_box(self.pop("geometry"))
if geometry: if geometry:
self._save_where(geometry) self._save_where(geometry)
def _start_where(self, attrs_d): def _start_where(self, attrs_d):
self.push('where', 0) self.push("where", 0)
context = self._get_context() context = self._get_context()
context['where'] = FeedParserDict() context["where"] = FeedParserDict()
_start_georss_where = _start_where _start_georss_where = _start_where
def _parse_srs_attrs(self, attrs_d): def _parse_srs_attrs(self, attrs_d):
srs_name = attrs_d.get('srsname') srs_name = attrs_d.get("srsname")
try: try:
srs_dimension = int(attrs_d.get('srsdimension', '2')) srs_dimension = int(attrs_d.get("srsdimension", "2"))
except ValueError: except ValueError:
srs_dimension = 2 srs_dimension = 2
context = self._get_context() context = self._get_context()
if 'where' not in context: if "where" not in context:
context['where'] = {} context["where"] = {}
context['where']['srsName'] = srs_name context["where"]["srsName"] = srs_name
context['where']['srsDimension'] = srs_dimension context["where"]["srsDimension"] = srs_dimension
def _start_gml_point(self, attrs_d): def _start_gml_point(self, attrs_d):
self._parse_srs_attrs(attrs_d) self._parse_srs_attrs(attrs_d)
self.ingeometry = 1 self.ingeometry = 1
self.push('geometry', 0) self.push("geometry", 0)
def _start_gml_linestring(self, attrs_d): def _start_gml_linestring(self, attrs_d):
self._parse_srs_attrs(attrs_d) self._parse_srs_attrs(attrs_d)
self.ingeometry = 'linestring' self.ingeometry = "linestring"
self.push('geometry', 0) self.push("geometry", 0)
def _start_gml_polygon(self, attrs_d): def _start_gml_polygon(self, attrs_d):
self._parse_srs_attrs(attrs_d) self._parse_srs_attrs(attrs_d)
self.push('geometry', 0) self.push("geometry", 0)
def _start_gml_exterior(self, attrs_d): def _start_gml_exterior(self, attrs_d):
self.push('geometry', 0) self.push("geometry", 0)
def _start_gml_linearring(self, attrs_d): def _start_gml_linearring(self, attrs_d):
self.ingeometry = 'polygon' self.ingeometry = "polygon"
self.push('geometry', 0) self.push("geometry", 0)
def _start_gml_pos(self, attrs_d): def _start_gml_pos(self, attrs_d):
self.push('pos', 0) self.push("pos", 0)
def _end_gml_pos(self): def _end_gml_pos(self):
this = self.pop('pos') this = self.pop("pos")
context = self._get_context() context = self._get_context()
srs_name = context['where'].get('srsName') srs_name = context["where"].get("srsName")
srs_dimension = context['where'].get('srsDimension', 2) srs_dimension = context["where"].get("srsDimension", 2)
swap = True swap = True
if srs_name and "EPSG" in srs_name: if srs_name and "EPSG" in srs_name:
epsg = int(srs_name.split(":")[-1]) epsg = int(srs_name.split(":")[-1])
@ -134,25 +132,25 @@ class Namespace(object):
self._save_where(geometry) self._save_where(geometry)
def _start_gml_poslist(self, attrs_d): def _start_gml_poslist(self, attrs_d):
self.push('pos', 0) self.push("pos", 0)
def _end_gml_poslist(self): def _end_gml_poslist(self):
this = self.pop('pos') this = self.pop("pos")
context = self._get_context() context = self._get_context()
srs_name = context['where'].get('srsName') srs_name = context["where"].get("srsName")
srs_dimension = context['where'].get('srsDimension', 2) srs_dimension = context["where"].get("srsDimension", 2)
swap = True swap = True
if srs_name and "EPSG" in srs_name: if srs_name and "EPSG" in srs_name:
epsg = int(srs_name.split(":")[-1]) epsg = int(srs_name.split(":")[-1])
swap = bool(epsg in _geogCS) swap = bool(epsg in _geogCS)
geometry = _parse_poslist( geometry = _parse_poslist(this, self.ingeometry, swap=swap, dims=srs_dimension)
this, self.ingeometry, swap=swap, dims=srs_dimension)
if geometry: if geometry:
self._save_where(geometry) self._save_where(geometry)
def _end_geom(self): def _end_geom(self):
self.ingeometry = 0 self.ingeometry = 0
self.pop('geometry') self.pop("geometry")
_end_gml_point = _end_geom _end_gml_point = _end_geom
_end_gml_linestring = _end_geom _end_gml_linestring = _end_geom
_end_gml_linearring = _end_geom _end_gml_linearring = _end_geom
@ -160,19 +158,21 @@ class Namespace(object):
_end_gml_polygon = _end_geom _end_gml_polygon = _end_geom
def _end_where(self): def _end_where(self):
self.pop('where') self.pop("where")
_end_georss_where = _end_where _end_georss_where = _end_where
# GeoRSS geometry parsers. Each return a dict with 'type' and 'coordinates' # GeoRSS geometry parsers. Each return a dict with 'type' and 'coordinates'
# items, or None in the case of a parsing error. # items, or None in the case of a parsing error.
def _parse_poslist(value, geom_type, swap=True, dims=2): def _parse_poslist(value, geom_type, swap=True, dims=2):
if geom_type == 'linestring': if geom_type == "linestring":
return _parse_georss_line(value, swap, dims) return _parse_georss_line(value, swap, dims)
elif geom_type == 'polygon': elif geom_type == "polygon":
ring = _parse_georss_line(value, swap, dims) ring = _parse_georss_line(value, swap, dims)
return {'type': 'Polygon', 'coordinates': (ring['coordinates'],)} return {"type": "Polygon", "coordinates": (ring["coordinates"],)}
else: else:
return None return None
@ -180,10 +180,10 @@ def _parse_poslist(value, geom_type, swap=True, dims=2):
def _gen_georss_coords(value, swap=True, dims=2): def _gen_georss_coords(value, swap=True, dims=2):
# A generator of (lon, lat) pairs from a string of encoded GeoRSS # A generator of (lon, lat) pairs from a string of encoded GeoRSS
# coordinates. Converts to floats and swaps order. # coordinates. Converts to floats and swaps order.
latlons = (float(ll) for ll in value.replace(',', ' ').split()) latlons = (float(ll) for ll in value.replace(",", " ").split())
while True: while True:
try: try:
t = [next(latlons), next(latlons)][::swap and -1 or 1] t = [next(latlons), next(latlons)][:: swap and -1 or 1]
if dims == 3: if dims == 3:
t.append(next(latlons)) t.append(next(latlons))
yield tuple(t) yield tuple(t)
@ -196,7 +196,7 @@ def _parse_georss_point(value, swap=True, dims=2):
# whitespace. We'll also handle comma separators. # whitespace. We'll also handle comma separators.
try: try:
coords = list(_gen_georss_coords(value, swap, dims)) coords = list(_gen_georss_coords(value, swap, dims))
return {'type': 'Point', 'coordinates': coords[0]} return {"type": "Point", "coordinates": coords[0]}
except (IndexError, ValueError): except (IndexError, ValueError):
return None return None
@ -207,7 +207,7 @@ def _parse_georss_line(value, swap=True, dims=2):
# whitespace. There must be at least two pairs. # whitespace. There must be at least two pairs.
try: try:
coords = list(_gen_georss_coords(value, swap, dims)) coords = list(_gen_georss_coords(value, swap, dims))
return {'type': 'LineString', 'coordinates': coords} return {"type": "LineString", "coordinates": coords}
except (IndexError, ValueError): except (IndexError, ValueError):
return None return None
@ -223,7 +223,7 @@ def _parse_georss_polygon(value, swap=True, dims=2):
return None return None
if len(ring) < 4: if len(ring) < 4:
return None return None
return {'type': 'Polygon', 'coordinates': (ring,)} return {"type": "Polygon", "coordinates": (ring,)}
def _parse_georss_box(value, swap=True, dims=2): def _parse_georss_box(value, swap=True, dims=2):
@ -233,7 +233,7 @@ def _parse_georss_box(value, swap=True, dims=2):
# first pair is the lower corner, the second is the upper corner. # first pair is the lower corner, the second is the upper corner.
try: try:
coords = list(_gen_georss_coords(value, swap, dims)) coords = list(_gen_georss_coords(value, swap, dims))
return {'type': 'Box', 'coordinates': tuple(coords)} return {"type": "Box", "coordinates": tuple(coords)}
except (IndexError, ValueError): except (IndexError, ValueError):
return None return None
@ -241,38 +241,443 @@ def _parse_georss_box(value, swap=True, dims=2):
# The list of EPSG codes for geographic (latitude/longitude) coordinate # The list of EPSG codes for geographic (latitude/longitude) coordinate
# systems to support decoding of GeoRSS GML profiles. # systems to support decoding of GeoRSS GML profiles.
_geogCS = [ _geogCS = [
3819, 3821, 3824, 3889, 3906, 4001, 4002, 4003, 4004, 4005, 4006, 4007, 4008, 3819,
4009, 4010, 4011, 4012, 4013, 4014, 4015, 4016, 4018, 4019, 4020, 4021, 4022, 3821,
4023, 4024, 4025, 4027, 4028, 4029, 4030, 4031, 4032, 4033, 4034, 4035, 4036, 3824,
4041, 4042, 4043, 4044, 4045, 4046, 4047, 4052, 4053, 4054, 4055, 4075, 4081, 3889,
4120, 4121, 4122, 4123, 4124, 4125, 4126, 4127, 4128, 4129, 4130, 4131, 4132, 3906,
4133, 4134, 4135, 4136, 4137, 4138, 4139, 4140, 4141, 4142, 4143, 4144, 4145, 4001,
4146, 4147, 4148, 4149, 4150, 4151, 4152, 4153, 4154, 4155, 4156, 4157, 4158, 4002,
4159, 4160, 4161, 4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 4170, 4171, 4003,
4172, 4173, 4174, 4175, 4176, 4178, 4179, 4180, 4181, 4182, 4183, 4184, 4185, 4004,
4188, 4189, 4190, 4191, 4192, 4193, 4194, 4195, 4196, 4197, 4198, 4199, 4200, 4005,
4201, 4202, 4203, 4204, 4205, 4206, 4207, 4208, 4209, 4210, 4211, 4212, 4213, 4006,
4214, 4215, 4216, 4218, 4219, 4220, 4221, 4222, 4223, 4224, 4225, 4226, 4227, 4007,
4228, 4229, 4230, 4231, 4232, 4233, 4234, 4235, 4236, 4237, 4238, 4239, 4240, 4008,
4241, 4242, 4243, 4244, 4245, 4246, 4247, 4248, 4249, 4250, 4251, 4252, 4253, 4009,
4254, 4255, 4256, 4257, 4258, 4259, 4260, 4261, 4262, 4263, 4264, 4265, 4266, 4010,
4267, 4268, 4269, 4270, 4271, 4272, 4273, 4274, 4275, 4276, 4277, 4278, 4279, 4011,
4280, 4281, 4282, 4283, 4284, 4285, 4286, 4287, 4288, 4289, 4291, 4292, 4293, 4012,
4294, 4295, 4296, 4297, 4298, 4299, 4300, 4301, 4302, 4303, 4304, 4306, 4307, 4013,
4308, 4309, 4310, 4311, 4312, 4313, 4314, 4315, 4316, 4317, 4318, 4319, 4322, 4014,
4324, 4326, 4463, 4470, 4475, 4483, 4490, 4555, 4558, 4600, 4601, 4602, 4603, 4015,
4604, 4605, 4606, 4607, 4608, 4609, 4610, 4611, 4612, 4613, 4614, 4615, 4616, 4016,
4617, 4618, 4619, 4620, 4621, 4622, 4623, 4624, 4625, 4626, 4627, 4628, 4629, 4018,
4630, 4631, 4632, 4633, 4634, 4635, 4636, 4637, 4638, 4639, 4640, 4641, 4642, 4019,
4643, 4644, 4645, 4646, 4657, 4658, 4659, 4660, 4661, 4662, 4663, 4664, 4665, 4020,
4666, 4667, 4668, 4669, 4670, 4671, 4672, 4673, 4674, 4675, 4676, 4677, 4678, 4021,
4679, 4680, 4681, 4682, 4683, 4684, 4685, 4686, 4687, 4688, 4689, 4690, 4691, 4022,
4692, 4693, 4694, 4695, 4696, 4697, 4698, 4699, 4700, 4701, 4702, 4703, 4704, 4023,
4705, 4706, 4707, 4708, 4709, 4710, 4711, 4712, 4713, 4714, 4715, 4716, 4717, 4024,
4718, 4719, 4720, 4721, 4722, 4723, 4724, 4725, 4726, 4727, 4728, 4729, 4730, 4025,
4731, 4732, 4733, 4734, 4735, 4736, 4737, 4738, 4739, 4740, 4741, 4742, 4743, 4027,
4744, 4745, 4746, 4747, 4748, 4749, 4750, 4751, 4752, 4753, 4754, 4755, 4756, 4028,
4757, 4758, 4759, 4760, 4761, 4762, 4763, 4764, 4765, 4801, 4802, 4803, 4804, 4029,
4805, 4806, 4807, 4808, 4809, 4810, 4811, 4813, 4814, 4815, 4816, 4817, 4818, 4030,
4819, 4820, 4821, 4823, 4824, 4901, 4902, 4903, 4904, 4979, 4031,
4032,
4033,
4034,
4035,
4036,
4041,
4042,
4043,
4044,
4045,
4046,
4047,
4052,
4053,
4054,
4055,
4075,
4081,
4120,
4121,
4122,
4123,
4124,
4125,
4126,
4127,
4128,
4129,
4130,
4131,
4132,
4133,
4134,
4135,
4136,
4137,
4138,
4139,
4140,
4141,
4142,
4143,
4144,
4145,
4146,
4147,
4148,
4149,
4150,
4151,
4152,
4153,
4154,
4155,
4156,
4157,
4158,
4159,
4160,
4161,
4162,
4163,
4164,
4165,
4166,
4167,
4168,
4169,
4170,
4171,
4172,
4173,
4174,
4175,
4176,
4178,
4179,
4180,
4181,
4182,
4183,
4184,
4185,
4188,
4189,
4190,
4191,
4192,
4193,
4194,
4195,
4196,
4197,
4198,
4199,
4200,
4201,
4202,
4203,
4204,
4205,
4206,
4207,
4208,
4209,
4210,
4211,
4212,
4213,
4214,
4215,
4216,
4218,
4219,
4220,
4221,
4222,
4223,
4224,
4225,
4226,
4227,
4228,
4229,
4230,
4231,
4232,
4233,
4234,
4235,
4236,
4237,
4238,
4239,
4240,
4241,
4242,
4243,
4244,
4245,
4246,
4247,
4248,
4249,
4250,
4251,
4252,
4253,
4254,
4255,
4256,
4257,
4258,
4259,
4260,
4261,
4262,
4263,
4264,
4265,
4266,
4267,
4268,
4269,
4270,
4271,
4272,
4273,
4274,
4275,
4276,
4277,
4278,
4279,
4280,
4281,
4282,
4283,
4284,
4285,
4286,
4287,
4288,
4289,
4291,
4292,
4293,
4294,
4295,
4296,
4297,
4298,
4299,
4300,
4301,
4302,
4303,
4304,
4306,
4307,
4308,
4309,
4310,
4311,
4312,
4313,
4314,
4315,
4316,
4317,
4318,
4319,
4322,
4324,
4326,
4463,
4470,
4475,
4483,
4490,
4555,
4558,
4600,
4601,
4602,
4603,
4604,
4605,
4606,
4607,
4608,
4609,
4610,
4611,
4612,
4613,
4614,
4615,
4616,
4617,
4618,
4619,
4620,
4621,
4622,
4623,
4624,
4625,
4626,
4627,
4628,
4629,
4630,
4631,
4632,
4633,
4634,
4635,
4636,
4637,
4638,
4639,
4640,
4641,
4642,
4643,
4644,
4645,
4646,
4657,
4658,
4659,
4660,
4661,
4662,
4663,
4664,
4665,
4666,
4667,
4668,
4669,
4670,
4671,
4672,
4673,
4674,
4675,
4676,
4677,
4678,
4679,
4680,
4681,
4682,
4683,
4684,
4685,
4686,
4687,
4688,
4689,
4690,
4691,
4692,
4693,
4694,
4695,
4696,
4697,
4698,
4699,
4700,
4701,
4702,
4703,
4704,
4705,
4706,
4707,
4708,
4709,
4710,
4711,
4712,
4713,
4714,
4715,
4716,
4717,
4718,
4719,
4720,
4721,
4722,
4723,
4724,
4725,
4726,
4727,
4728,
4729,
4730,
4731,
4732,
4733,
4734,
4735,
4736,
4737,
4738,
4739,
4740,
4741,
4742,
4743,
4744,
4745,
4746,
4747,
4748,
4749,
4750,
4751,
4752,
4753,
4754,
4755,
4756,
4757,
4758,
4759,
4760,
4761,
4762,
4763,
4764,
4765,
4801,
4802,
4803,
4804,
4805,
4806,
4807,
4808,
4809,
4810,
4811,
4813,
4814,
4815,
4816,
4817,
4818,
4819,
4820,
4821,
4823,
4824,
4901,
4902,
4903,
4904,
4979,
] ]

View file

@ -1,5 +1,5 @@
# Support for the iTunes format # Support for the iTunes format
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -29,13 +29,12 @@
from ..util import FeedParserDict from ..util import FeedParserDict
class Namespace(object): class Namespace:
supported_namespaces = { supported_namespaces = {
# Canonical namespace # Canonical namespace
'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes', "http://www.itunes.com/DTDs/PodCast-1.0.dtd": "itunes",
# Extra namespace # Extra namespace
'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes', "http://example.com/DTDs/PodCast-1.0.dtd": "itunes",
} }
def _start_itunes_author(self, attrs_d): def _start_itunes_author(self, attrs_d):
@ -73,37 +72,42 @@ class Namespace(object):
def _start_itunes_owner(self, attrs_d): def _start_itunes_owner(self, attrs_d):
self.inpublisher = 1 self.inpublisher = 1
self.push('publisher', 0) self.push("publisher", 0)
def _end_itunes_owner(self): def _end_itunes_owner(self):
self.pop('publisher') self.pop("publisher")
self.inpublisher = 0 self.inpublisher = 0
self._sync_author_detail('publisher') self._sync_author_detail("publisher")
def _end_itunes_keywords(self): def _end_itunes_keywords(self):
for term in self.pop('itunes_keywords').split(','): for term in self.pop("itunes_keywords").split(","):
if term.strip(): if term.strip():
self._add_tag(term.strip(), 'http://www.itunes.com/', None) self._add_tag(term.strip(), "http://www.itunes.com/", None)
def _start_itunes_category(self, attrs_d): def _start_itunes_category(self, attrs_d):
self._add_tag(attrs_d.get('text'), 'http://www.itunes.com/', None) self._add_tag(attrs_d.get("text"), "http://www.itunes.com/", None)
self.push('category', 1) self.push("category", 1)
def _start_itunes_image(self, attrs_d): def _start_itunes_image(self, attrs_d):
self.push('itunes_image', 0) self.push("itunes_image", 0)
if attrs_d.get('href'): if attrs_d.get("href"):
self._get_context()['image'] = FeedParserDict({'href': attrs_d.get('href')}) self._get_context()["image"] = FeedParserDict({"href": attrs_d.get("href")})
elif attrs_d.get('url'): elif attrs_d.get("url"):
self._get_context()['image'] = FeedParserDict({'href': attrs_d.get('url')}) self._get_context()["image"] = FeedParserDict({"href": attrs_d.get("url")})
_start_itunes_link = _start_itunes_image _start_itunes_link = _start_itunes_image
def _end_itunes_block(self): def _end_itunes_block(self):
value = self.pop('itunes_block', 0) value = self.pop("itunes_block", 0)
self._get_context()['itunes_block'] = (value == 'yes' or value == 'Yes') and 1 or 0 self._get_context()["itunes_block"] = (
(value == "yes" or value == "Yes") and 1 or 0
)
def _end_itunes_explicit(self): def _end_itunes_explicit(self):
value = self.pop('itunes_explicit', 0) value = self.pop("itunes_explicit", 0)
# Convert 'yes' -> True, 'clean' to False, and any other value to None # Convert 'yes' -> True, 'clean' to False, and any other value to None
# False and None both evaluate as False, so the difference can be ignored # False and None both evaluate as False, so the difference can be ignored
# by applications that only need to know if the content is explicit. # by applications that only need to know if the content is explicit.
self._get_context()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0] self._get_context()["itunes_explicit"] = (None, False, True)[
(value == "yes" and 2) or value == "clean" or 0
]

View file

@ -1,5 +1,5 @@
# Support for the Media RSS format # Support for the Media RSS format
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -29,24 +29,23 @@
from ..util import FeedParserDict from ..util import FeedParserDict
class Namespace(object): class Namespace:
supported_namespaces = { supported_namespaces = {
# Canonical namespace # Canonical namespace
'http://search.yahoo.com/mrss/': 'media', "http://search.yahoo.com/mrss/": "media",
# Old namespace (no trailing slash) # Old namespace (no trailing slash)
'http://search.yahoo.com/mrss': 'media', "http://search.yahoo.com/mrss": "media",
} }
def _start_media_category(self, attrs_d): def _start_media_category(self, attrs_d):
attrs_d.setdefault('scheme', 'http://search.yahoo.com/mrss/category_schema') attrs_d.setdefault("scheme", "http://search.yahoo.com/mrss/category_schema")
self._start_category(attrs_d) self._start_category(attrs_d)
def _end_media_category(self): def _end_media_category(self):
self._end_category() self._end_category()
def _end_media_keywords(self): def _end_media_keywords(self):
for term in self.pop('media_keywords').split(','): for term in self.pop("media_keywords").split(","):
if term.strip(): if term.strip():
self._add_tag(term.strip(), None, None) self._add_tag(term.strip(), None, None)
@ -64,26 +63,26 @@ class Namespace(object):
def _start_media_rating(self, attrs_d): def _start_media_rating(self, attrs_d):
context = self._get_context() context = self._get_context()
context.setdefault('media_rating', attrs_d) context.setdefault("media_rating", attrs_d)
self.push('rating', 1) self.push("rating", 1)
def _end_media_rating(self): def _end_media_rating(self):
rating = self.pop('rating') rating = self.pop("rating")
if rating is not None and rating.strip(): if rating is not None and rating.strip():
context = self._get_context() context = self._get_context()
context['media_rating']['content'] = rating context["media_rating"]["content"] = rating
def _start_media_credit(self, attrs_d): def _start_media_credit(self, attrs_d):
context = self._get_context() context = self._get_context()
context.setdefault('media_credit', []) context.setdefault("media_credit", [])
context['media_credit'].append(attrs_d) context["media_credit"].append(attrs_d)
self.push('credit', 1) self.push("credit", 1)
def _end_media_credit(self): def _end_media_credit(self):
credit = self.pop('credit') credit = self.pop("credit")
if credit is not None and credit.strip(): if credit is not None and credit.strip():
context = self._get_context() context = self._get_context()
context['media_credit'][-1]['content'] = credit context["media_credit"][-1]["content"] = credit
def _start_media_description(self, attrs_d): def _start_media_description(self, attrs_d):
self._start_description(attrs_d) self._start_description(attrs_d)
@ -93,49 +92,51 @@ class Namespace(object):
def _start_media_restriction(self, attrs_d): def _start_media_restriction(self, attrs_d):
context = self._get_context() context = self._get_context()
context.setdefault('media_restriction', attrs_d) context.setdefault("media_restriction", attrs_d)
self.push('restriction', 1) self.push("restriction", 1)
def _end_media_restriction(self): def _end_media_restriction(self):
restriction = self.pop('restriction') restriction = self.pop("restriction")
if restriction is not None and restriction.strip(): if restriction is not None and restriction.strip():
context = self._get_context() context = self._get_context()
context['media_restriction']['content'] = [cc.strip().lower() for cc in restriction.split(' ')] context["media_restriction"]["content"] = [
cc.strip().lower() for cc in restriction.split(" ")
]
def _start_media_license(self, attrs_d): def _start_media_license(self, attrs_d):
context = self._get_context() context = self._get_context()
context.setdefault('media_license', attrs_d) context.setdefault("media_license", attrs_d)
self.push('license', 1) self.push("license", 1)
def _end_media_license(self): def _end_media_license(self):
license_ = self.pop('license') license_ = self.pop("license")
if license_ is not None and license_.strip(): if license_ is not None and license_.strip():
context = self._get_context() context = self._get_context()
context['media_license']['content'] = license_ context["media_license"]["content"] = license_
def _start_media_content(self, attrs_d): def _start_media_content(self, attrs_d):
context = self._get_context() context = self._get_context()
context.setdefault('media_content', []) context.setdefault("media_content", [])
context['media_content'].append(attrs_d) context["media_content"].append(attrs_d)
def _start_media_thumbnail(self, attrs_d):
    """Handle the start of a ``thumbnail`` element.

    Ensures the context has a ``media_thumbnail`` list, pushes ``url``, and
    then appends ``attrs_d`` to that list.
    """
    ctx = self._get_context()
    if "media_thumbnail" not in ctx:
        ctx["media_thumbnail"] = []
    self.push("url", 1)  # new
    ctx["media_thumbnail"].append(attrs_d)
def _end_media_thumbnail(self):
    """Handle the end of a ``thumbnail`` element.

    When the popped ``url`` text is non-blank and the most recently added
    ``media_thumbnail`` entry has no ``url`` key, the text is stored there.
    """
    url = self.pop("url")
    context = self._get_context()
    if url is None or not url.strip():
        return
    latest = context["media_thumbnail"][-1]
    if "url" not in latest:
        latest["url"] = url
def _start_media_player(self, attrs_d):
    """Handle the start of a ``player`` element.

    Pushes ``media_player`` and stores a ``FeedParserDict`` copy of
    ``attrs_d`` under ``media_player`` in the current context, replacing any
    earlier value.
    """
    self.push("media_player", 0)
    player = FeedParserDict(attrs_d)
    context = self._get_context()
    context["media_player"] = player
def _end_media_player(self):
    """Store the popped ``media_player`` value as the player's ``content``."""
    content = self.pop("media_player")
    self._get_context()["media_player"]["content"] = content

View file

@ -1,5 +1,5 @@
# Support for the Podlove Simple Chapters format # Support for the Podlove Simple Chapters format
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -32,36 +32,36 @@ import re
from .. import util from .. import util
class Namespace:
    """Element handlers for the Podlove Simple Chapters (``psc``) namespace."""

    supported_namespaces = {
        "http://podlove.org/simple-chapters": "psc",
    }

    def __init__(self):
        # Chapters are recorded only while this flag is True.
        self.psc_chapters_flag = False
        super().__init__()

    def _start_psc_chapters(self, attrs_d):
        """Start a chapter list unless the context already holds one."""
        context = self._get_context()
        if "psc_chapters" in context:
            return
        self.psc_chapters_flag = True
        attrs_d["chapters"] = []
        context["psc_chapters"] = util.FeedParserDict(attrs_d)

    def _end_psc_chapters(self):
        """Stop recording chapters."""
        self.psc_chapters_flag = False

    def _start_psc_chapter(self, attrs_d):
        """Record one chapter while a chapter list is being captured.

        Adds ``start_parsed`` (the parsed ``start`` attribute) to
        ``attrs_d`` before appending a copy to the chapter list.
        """
        if not self.psc_chapters_flag:
            return
        raw_start = self._get_attribute(attrs_d, "start")
        attrs_d["start_parsed"] = _parse_psc_chapter_start(raw_start)

        chapter_list = self._get_context()["psc_chapters"]["chapters"]
        chapter_list.append(util.FeedParserDict(attrs_d))
format_ = re.compile(r'^((\d{2}):)?(\d{2}):(\d{2})(\.(\d{3}))?$') format_ = re.compile(r"^((\d{2}):)?(\d{2}):(\d{2})(\.(\d{3}))?$")
def _parse_psc_chapter_start(start): def _parse_psc_chapter_start(start):
@ -71,4 +71,4 @@ def _parse_psc_chapter_start(start):
_, h, m, s, _, ms = m.groups() _, h, m, s, _, ms = m.groups()
h, m, s, ms = (int(h or 0), int(m), int(s), int(ms or 0)) h, m, s, ms = (int(h or 0), int(m), int(s), int(ms or 0))
return datetime.timedelta(0, h*60*60 + m*60 + s, ms*1000) return datetime.timedelta(0, h * 60 * 60 + m * 60 + s, ms * 1000)

View file

@ -34,37 +34,37 @@ from ..util import FeedParserDict
class JSONParser: class JSONParser:
VERSIONS = { VERSIONS = {
'https://jsonfeed.org/version/1': 'json1', "https://jsonfeed.org/version/1": "json1",
'https://jsonfeed.org/version/1.1': 'json11', "https://jsonfeed.org/version/1.1": "json11",
} }
FEED_FIELDS = ( FEED_FIELDS = (
('title', 'title'), ("title", "title"),
('icon', 'image'), ("icon", "image"),
('home_page_url', 'link'), ("home_page_url", "link"),
('description', 'description'), ("description", "description"),
) )
ITEM_FIELDS = ( ITEM_FIELDS = (
('title', 'title'), ("title", "title"),
('id', 'guid'), ("id", "guid"),
('url', 'link'), ("url", "link"),
('summary', 'summary'), ("summary", "summary"),
('external_url', 'source'), ("external_url", "source"),
) )
def __init__(self, baseuri=None, baselang=None, encoding=None):
    """Set up an empty parser state.

    :param baseuri: Base URI; any falsy value becomes ``""``.
    :param baselang: Language; any falsy value becomes ``None``.
    :param encoding: Character encoding; any falsy value becomes
        ``"utf-8"``.
    """
    # Empty result holders.
    self.version = None
    self.feeddata = FeedParserDict()
    self.namespacesInUse = []
    self.entries = []

    # Caller-supplied settings with their fallbacks.
    self.baseuri = baseuri if baseuri else ""
    self.lang = baselang if baselang else None
    self.encoding = encoding if encoding else "utf-8"  # character encoding
def feed(self, data): def feed(self, file):
data = json.loads(data) data = json.load(file)
v = data.get('version', '') v = data.get("version", "")
try: try:
self.version = self.VERSIONS[v] self.version = self.VERSIONS[v]
except KeyError: except KeyError:
@ -73,11 +73,11 @@ class JSONParser:
for src, dst in self.FEED_FIELDS: for src, dst in self.FEED_FIELDS:
if src in data: if src in data:
self.feeddata[dst] = data[src] self.feeddata[dst] = data[src]
if 'author' in data: if "author" in data:
self.parse_author(data['author'], self.feeddata) self.parse_author(data["author"], self.feeddata)
# TODO: hubs; expired has no RSS equivalent # TODO: hubs; expired has no RSS equivalent
self.entries = [self.parse_entry(e) for e in data['items']] self.entries = [self.parse_entry(e) for e in data["items"]]
def parse_entry(self, e):
    """Convert one JSON Feed item into an entry.

    :param e: The decoded JSON object of a single item.
    :return: A :class:`FeedParserDict` holding the mapped fields.
    """
    entry = FeedParserDict()
    # Straight copies: JSON Feed key -> entry key.
    for src, dst in self.ITEM_FIELDS:
        if src in e:
            entry[dst] = e[src]

    # Plain-text content takes precedence over HTML content.
    if "content_text" in e:
        entry["content"] = c = FeedParserDict()
        c["value"] = e["content_text"]
        c["type"] = "text"
    elif "content_html" in e:
        entry["content"] = c = FeedParserDict()
        c["value"] = sanitize_html(
            e["content_html"], self.encoding, "application/json"
        )
        c["type"] = "html"

    if "date_published" in e:
        entry["published"] = e["date_published"]
        entry["published_parsed"] = _parse_date(e["date_published"])
    # The guard must test the key that is read. It previously tested
    # "date_updated", so items carrying "date_modified" never got an
    # "updated" value and items carrying only "date_updated" raised
    # KeyError.
    if "date_modified" in e:
        entry["updated"] = e["date_modified"]
        entry["updated_parsed"] = _parse_date(e["date_modified"])

    if "tags" in e:
        entry["category"] = e["tags"]

    if "author" in e:
        self.parse_author(e["author"], entry)

    if "attachments" in e:
        entry["enclosures"] = [self.parse_attachment(a) for a in e["attachments"]]

    return entry
@staticmethod
def parse_author(parent, dest):
    """Copy author information from ``parent`` into ``dest``.

    ``dest["author_detail"]`` is always set to a new
    :class:`FeedParserDict`. A ``name`` is copied to both ``dest["author"]``
    and the detail dict; a ``url`` becomes ``email`` when it starts with
    ``mailto:`` (prefix removed) and ``href`` otherwise.
    """
    detail = FeedParserDict()
    dest["author_detail"] = detail

    if "name" in parent:
        name = parent["name"]
        dest["author"] = name
        detail["name"] = name

    if "url" not in parent:
        return
    if parent["url"].startswith("mailto:"):
        detail["email"] = parent["url"][len("mailto:"):]
    else:
        detail["href"] = parent["url"]
@staticmethod
def parse_attachment(attachment):
    """Build an enclosure dict from one attachment object.

    ``url`` and ``mime_type`` are read unconditionally (a missing key raises
    ``KeyError``); ``size_in_bytes`` is copied to ``length`` only when
    present.
    """
    enclosure = FeedParserDict()
    enclosure["href"] = attachment["url"]
    enclosure["type"] = attachment["mime_type"]
    try:
        size = attachment["size_in_bytes"]
    except KeyError:
        return enclosure
    enclosure["length"] = size
    return enclosure

View file

@ -1,5 +1,5 @@
# The loose feed parser that interfaces with an SGML parsing library # The loose feed parser that interfaces with an SGML parsing library
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -26,52 +26,50 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE. # POSSIBILITY OF SUCH DAMAGE.
class LooseXMLParser:
    """Helpers for the loose parser built on an SGML parsing library."""

    contentparams = None

    def __init__(self, baseuri=None, baselang=None, encoding=None, entities=None):
        self.baseuri = baseuri if baseuri else ""
        self.lang = baselang if baselang else None
        self.encoding = encoding if encoding else "utf-8"  # character encoding
        self.entities = entities if entities else {}
        super().__init__()

    @staticmethod
    def _normalize_attributes(kv):
        """Return ``(name, value)`` with the name lower-cased.

        The value is lower-cased as well for ``rel`` and ``type``.
        """
        name = kv[0].lower()
        value = kv[1]
        if name in ("rel", "type"):
            value = value.lower()
        # the sgml parser doesn't handle entities in attributes, nor
        # does it pass the attribute values through as unicode, while
        # strict xml parsers do -- account for this difference
        return name, value.replace("&amp;", "&")

    def decode_entities(self, element, data):
        """Normalize character references in ``data``.

        Numeric references for ``<``, ``>``, ``&``, ``"`` and ``'`` are first
        rewritten as named entities. When the content type does not end in
        ``xml``, the named entities and ``&#x2f;`` / ``&#x2F;`` are then
        replaced by literal characters. The replacement order is significant
        and is kept as-is.
        """
        numeric_to_named = (
            ("&#60;", "&lt;"),
            ("&#x3c;", "&lt;"),
            ("&#x3C;", "&lt;"),
            ("&#62;", "&gt;"),
            ("&#x3e;", "&gt;"),
            ("&#x3E;", "&gt;"),
            ("&#38;", "&amp;"),
            ("&#x26;", "&amp;"),
            ("&#34;", "&quot;"),
            ("&#x22;", "&quot;"),
            ("&#39;", "&apos;"),
            ("&#x27;", "&apos;"),
        )
        for reference, entity in numeric_to_named:
            data = data.replace(reference, entity)

        if self.contentparams.get("type", "xml").endswith("xml"):
            return data

        to_literal = (
            ("&lt;", "<"),
            ("&gt;", ">"),
            ("&amp;", "&"),
            ("&quot;", '"'),
            ("&apos;", "'"),
            ("&#x2f;", "/"),
            ("&#x2F;", "/"),
        )
        for entity, character in to_literal:
            data = data.replace(entity, character)
        return data

    @staticmethod
    def strattrs(attrs):
        """Serialize ``(name, value)`` pairs as markup attributes.

        Double quotes inside values are written as ``&quot;``.
        """
        rendered = []
        for name, value in attrs:
            rendered.append(' {}="{}"'.format(name, value.replace('"', "&quot;")))
        return "".join(rendered)

View file

@ -1,5 +1,5 @@
# The strict feed parser that interfaces with an XML parsing library # The strict feed parser that interfaces with an XML parsing library
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -34,15 +34,15 @@ class StrictXMLParser:
self.bozo = 0 self.bozo = 0
self.exc = None self.exc = None
self.decls = {} self.decls = {}
self.baseuri = baseuri or '' self.baseuri = baseuri or ""
self.lang = baselang self.lang = baselang
self.encoding = encoding self.encoding = encoding
super(StrictXMLParser, self).__init__() super().__init__()
@staticmethod @staticmethod
def _normalize_attributes(kv): def _normalize_attributes(kv):
k = kv[0].lower() k = kv[0].lower()
v = k in ('rel', 'type') and kv[1].lower() or kv[1] v = k in ("rel", "type") and kv[1].lower() or kv[1]
return k, v return k, v
def startPrefixMapping(self, prefix, uri): def startPrefixMapping(self, prefix, uri):
@ -51,23 +51,29 @@ class StrictXMLParser:
# Jython uses '' instead of None; standardize on None # Jython uses '' instead of None; standardize on None
prefix = prefix or None prefix = prefix or None
self.track_namespace(prefix, uri) self.track_namespace(prefix, uri)
if prefix and uri == 'http://www.w3.org/1999/xlink': if prefix and uri == "http://www.w3.org/1999/xlink":
self.decls['xmlns:' + prefix] = uri self.decls["xmlns:" + prefix] = uri
def startElementNS(self, name, qname, attrs): def startElementNS(self, name, qname, attrs):
namespace, localname = name namespace, localname = name
lowernamespace = str(namespace or '').lower() lowernamespace = str(namespace or "").lower()
if lowernamespace.find('backend.userland.com/rss') != -1: if lowernamespace.find("backend.userland.com/rss") != -1:
# match any backend.userland.com namespace # match any backend.userland.com namespace
namespace = 'http://backend.userland.com/rss' namespace = "http://backend.userland.com/rss"
lowernamespace = namespace lowernamespace = namespace
if qname and qname.find(':') > 0: if qname and qname.find(":") > 0:
givenprefix = qname.split(':')[0] givenprefix = qname.split(":")[0]
else: else:
givenprefix = None givenprefix = None
prefix = self._matchnamespaces.get(lowernamespace, givenprefix) prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
if givenprefix and (prefix is None or (prefix == '' and lowernamespace == '')) and givenprefix not in self.namespaces_in_use: if (
raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix) givenprefix
and (prefix is None or (prefix == "" and lowernamespace == ""))
and givenprefix not in self.namespaces_in_use
):
raise UndeclaredNamespace(
"'%s' is not associated with a namespace" % givenprefix
)
localname = str(localname).lower() localname = str(localname).lower()
# qname implementation is horribly broken in Python 2.1 (it # qname implementation is horribly broken in Python 2.1 (it
@ -78,24 +84,24 @@ class StrictXMLParser:
# at all). Thanks to MatejC for helping me test this and # at all). Thanks to MatejC for helping me test this and
# tirelessly telling me that it didn't work yet. # tirelessly telling me that it didn't work yet.
attrsD, self.decls = self.decls, {} attrsD, self.decls = self.decls, {}
if localname == 'math' and namespace == 'http://www.w3.org/1998/Math/MathML': if localname == "math" and namespace == "http://www.w3.org/1998/Math/MathML":
attrsD['xmlns'] = namespace attrsD["xmlns"] = namespace
if localname == 'svg' and namespace == 'http://www.w3.org/2000/svg': if localname == "svg" and namespace == "http://www.w3.org/2000/svg":
attrsD['xmlns'] = namespace attrsD["xmlns"] = namespace
if prefix: if prefix:
localname = prefix.lower() + ':' + localname localname = prefix.lower() + ":" + localname
elif namespace and not qname: # Expat elif namespace and not qname: # Expat
for name, value in self.namespaces_in_use.items(): for name, value in self.namespaces_in_use.items():
if name and value == namespace: if name and value == namespace:
localname = name + ':' + localname localname = name + ":" + localname
break break
for (namespace, attrlocalname), attrvalue in attrs.items(): for (namespace, attrlocalname), attrvalue in attrs.items():
lowernamespace = (namespace or '').lower() lowernamespace = (namespace or "").lower()
prefix = self._matchnamespaces.get(lowernamespace, '') prefix = self._matchnamespaces.get(lowernamespace, "")
if prefix: if prefix:
attrlocalname = prefix + ':' + attrlocalname attrlocalname = prefix + ":" + attrlocalname
attrsD[str(attrlocalname).lower()] = attrvalue attrsD[str(attrlocalname).lower()] = attrvalue
for qname in attrs.getQNames(): for qname in attrs.getQNames():
attrsD[str(qname).lower()] = attrs.getValueByQName(qname) attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
@ -107,18 +113,18 @@ class StrictXMLParser:
def endElementNS(self, name, qname):
    """Translate a namespaced end-tag event into ``unknown_endtag``.

    :param name: ``(namespace, localname)`` pair.
    :param qname: Qualified name as written in the document, if available.

    The tag handed to ``unknown_endtag`` is lower-cased and carries a
    ``prefix:`` taken from ``_matchnamespaces`` (falling back to the prefix
    written in ``qname``) or, when no qname is supplied, from
    ``namespaces_in_use``.
    """
    namespace, localname = name
    given_prefix = ""
    if qname and qname.find(":") > 0:
        given_prefix = qname.split(":")[0]

    prefix = self._matchnamespaces.get(str(namespace or "").lower(), given_prefix)
    if prefix:
        localname = prefix + ":" + localname
    elif namespace and not qname:  # Expat
        # First non-empty prefix currently bound to this exact namespace.
        bound = next(
            (
                candidate
                for candidate, uri in self.namespaces_in_use.items()
                if candidate and uri == namespace
            ),
            None,
        )
        if bound is not None:
            localname = bound + ":" + localname

    self.unknown_endtag(str(localname).lower())

0
lib/feedparser/py.typed Normal file
View file

File diff suppressed because it is too large Load diff

View file

@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -27,20 +27,20 @@
import re import re
import sgmllib # type: ignore[import] import sgmllib3k as sgmllib
__all__ = [ __all__ = [
'sgmllib', "sgmllib",
'charref', "charref",
'tagfind', "tagfind",
'attrfind', "attrfind",
'entityref', "entityref",
'incomplete', "incomplete",
'interesting', "interesting",
'shorttag', "shorttag",
'shorttagopen', "shorttagopen",
'starttagopen', "starttagopen",
'endbracket', "endbracket",
] ]
# sgmllib defines a number of module-level regular expressions that are # sgmllib defines a number of module-level regular expressions that are
@ -49,20 +49,20 @@ __all__ = [
# names, and the compiled code objects of several sgmllib.SGMLParser # names, and the compiled code objects of several sgmllib.SGMLParser
# methods are copied into _BaseHTMLProcessor so that they execute in # methods are copied into _BaseHTMLProcessor so that they execute in
# feedparser's scope instead of sgmllib's scope. # feedparser's scope instead of sgmllib's scope.
charref = re.compile(r'&#(\d+|[xX][0-9a-fA-F]+);') charref = re.compile(r"&#(\d+|[xX][0-9a-fA-F]+);")
tagfind = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*') tagfind = re.compile(r"[a-zA-Z][-_.:a-zA-Z0-9]*")
attrfind = re.compile( attrfind = re.compile(
r"""\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*""" r"""\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*"""
r"""('[^']*'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$()_#=~'"@]*))?""" r"""('[^']*'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$()_#=~'"@]*))?"""
) )
# Unfortunately, these must be copied over to prevent NameError exceptions # Unfortunately, these must be copied over to prevent NameError exceptions
entityref = sgmllib.entityref entityref = sgmllib.SGMLParser.entityref
incomplete = sgmllib.incomplete incomplete = sgmllib.SGMLParser.incomplete
interesting = sgmllib.interesting interesting = sgmllib.SGMLParser.interesting
shorttag = sgmllib.shorttag shorttag = sgmllib.SGMLParser.shorttag
shorttagopen = sgmllib.shorttagopen shorttagopen = sgmllib.SGMLParser.shorttagopen
starttagopen = sgmllib.starttagopen starttagopen = sgmllib.SGMLParser.starttagopen
class _EndBracketRegEx: class _EndBracketRegEx:
@ -70,12 +70,12 @@ class _EndBracketRegEx:
# Overriding the built-in sgmllib.endbracket regex allows the # Overriding the built-in sgmllib.endbracket regex allows the
# parser to find angle brackets embedded in element attributes. # parser to find angle brackets embedded in element attributes.
self.endbracket = re.compile( self.endbracket = re.compile(
r'(' r"("
r"""[^'"<>]""" r"""[^'"<>]"""
r"""|"[^"]*"(?=>|/|\s|\w+=)""" r"""|"[^"]*"(?=>|/|\s|\w+=)"""
r"""|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])""" r"""|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])"""
r"""|.*?(?=[<>]""" r"""|.*?(?=[<>]"""
r')' r")"
) )
def search(self, target, index=0): def search(self, target, index=0):

View file

@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -37,103 +37,116 @@ from .html import BaseHTMLProcessor
# https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme # https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
# Many more will likely need to be added! # Many more will likely need to be added!
# URI schemes that are allowed through; each scheme is listed exactly once
# ("mms" and "svn" were previously repeated in the second group).
ACCEPTABLE_URI_SCHEMES = (
    "file",
    "ftp",
    "gopher",
    "h323",
    "hdl",
    "http",
    "https",
    "imap",
    "magnet",
    "mailto",
    "mms",
    "news",
    "nntp",
    "prospero",
    "rsync",
    "rtsp",
    "rtspu",
    "sftp",
    "shttp",
    "sip",
    "sips",
    "snews",
    "svn",
    "svn+ssh",
    "telnet",
    "wais",
    # Additional common-but-unofficial schemes
    "aim",
    "callto",
    "cvs",
    "facetime",
    "feed",
    "git",
    "gtalk",
    "irc",
    "ircs",
    "irc6",
    "itms",
    "msnim",
    "skype",
    "ssh",
    "smb",
    "ymsg",
)
# Matches a leading "scheme://" plus any extra slashes that follow it, so the
# extras can be dropped. "-" is placed last in the character class: written
# as "+-." it formed a range from "+" to "." that also admitted ",".
_urifixer = re.compile("^([A-Za-z][A-Za-z0-9+.-]*://)(/*)(.*?)")


def _urljoin(base, uri):
    """Join ``uri`` onto ``base``.

    Surplus slashes directly after ``scheme://`` in ``uri`` are removed
    before joining.

    :return: The joined URI, or ``""`` when ``urllib.parse.urljoin`` raises
        ``ValueError``.
    """
    uri = _urifixer.sub(r"\1\3", uri)
    try:
        uri = urllib.parse.urljoin(base, uri)
    except ValueError:
        uri = ""
    return uri
def convert_to_idn(url):
    """Convert a URL to IDN notation"""
    # this function should only be called with a unicode string
    # strategy: if the host cannot be encoded in ascii, then
    # it'll be necessary to encode it in idn form
    parts = list(urllib.parse.urlsplit(url))
    netloc = parts[1]
    try:
        netloc.encode("ascii")
    except UnicodeEncodeError:
        pass
    else:
        return url

    # the url needs to be converted to idn notation
    pieces = netloc.rsplit(":", 1)
    port = pieces[1] if len(pieces) == 2 else ""
    encoded_host = ".".join(
        label.encode("idna").decode("utf-8") for label in pieces[0].split(".")
    )
    parts[1] = encoded_host + ":" + port if port else encoded_host
    return urllib.parse.urlunsplit(parts)
def make_safe_absolute_uri(base, rel=None):
    """Resolve ``rel`` against ``base``, allowing only acceptable schemes.

    :return: The resolved URI, or ``""`` when its scheme is not listed in
        ``ACCEPTABLE_URI_SCHEMES``. Without ``base`` the value of ``rel``
        (or ``""``) is returned unchecked; without ``rel`` the ``base`` is
        returned when it has no scheme or an acceptable one.
    """
    # bail if ACCEPTABLE_URI_SCHEMES is empty
    if not ACCEPTABLE_URI_SCHEMES:
        return _urljoin(base, rel or "")
    if not base:
        return rel or ""

    if rel:
        uri = _urljoin(base, rel)
        scheme = uri.strip().partition(":")[0]
        return uri if scheme in ACCEPTABLE_URI_SCHEMES else ""

    # Only a base was supplied: validate the base's own scheme.
    try:
        scheme = urllib.parse.urlparse(base)[0]
    except ValueError:
        return ""
    if scheme and scheme not in ACCEPTABLE_URI_SCHEMES:
        return ""
    return base
class RelativeURIResolver(BaseHTMLProcessor): class RelativeURIResolver(BaseHTMLProcessor):
relative_uris = { relative_uris = {
('a', 'href'), ("a", "href"),
('applet', 'codebase'), ("applet", "codebase"),
('area', 'href'), ("area", "href"),
('audio', 'src'), ("audio", "src"),
('blockquote', 'cite'), ("blockquote", "cite"),
('body', 'background'), ("body", "background"),
('del', 'cite'), ("del", "cite"),
('form', 'action'), ("form", "action"),
('frame', 'longdesc'), ("frame", "longdesc"),
('frame', 'src'), ("frame", "src"),
('iframe', 'longdesc'), ("iframe", "longdesc"),
('iframe', 'src'), ("iframe", "src"),
('head', 'profile'), ("head", "profile"),
('img', 'longdesc'), ("img", "longdesc"),
('img', 'src'), ("img", "src"),
('img', 'usemap'), ("img", "usemap"),
('input', 'src'), ("input", "src"),
('input', 'usemap'), ("input", "usemap"),
('ins', 'cite'), ("ins", "cite"),
('link', 'href'), ("link", "href"),
('object', 'classid'), ("object", "classid"),
('object', 'codebase'), ("object", "codebase"),
('object', 'data'), ("object", "data"),
('object', 'usemap'), ("object", "usemap"),
('q', 'cite'), ("q", "cite"),
('script', 'src'), ("script", "src"),
('source', 'src'), ("source", "src"),
('video', 'poster'), ("video", "poster"),
('video', 'src'), ("video", "src"),
} }
def __init__(self, baseuri, encoding, _type): def __init__(self, baseuri, encoding, _type):
@ -145,8 +158,14 @@ class RelativeURIResolver(BaseHTMLProcessor):
def unknown_starttag(self, tag, attrs):
    """Resolve URI-valued attributes, then defer to the parent handler.

    An attribute is resolved when ``(tag, attribute)`` is listed in
    ``relative_uris``.
    """
    resolved = []
    for key, value in self.normalize_attrs(attrs):
        if (tag, key) in self.relative_uris:
            # A falsy result from resolve_uri() keeps the original value.
            value = self.resolve_uri(value) or value
        resolved.append((key, value))
    super().unknown_starttag(tag, resolved)
def resolve_relative_uris(html_source, base_uri, encoding, type_): def resolve_relative_uris(html_source, base_uri, encoding, type_):

View file

@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org> # Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim # Copyright 2002-2008 Mark Pilgrim
# All rights reserved. # All rights reserved.
# #
@ -30,22 +30,22 @@ import warnings
class FeedParserDict(dict): class FeedParserDict(dict):
keymap = { keymap = {
'channel': 'feed', "channel": "feed",
'items': 'entries', "items": "entries",
'guid': 'id', "guid": "id",
'date': 'updated', "date": "updated",
'date_parsed': 'updated_parsed', "date_parsed": "updated_parsed",
'description': ['summary', 'subtitle'], "description": ["summary", "subtitle"],
'description_detail': ['summary_detail', 'subtitle_detail'], "description_detail": ["summary_detail", "subtitle_detail"],
'url': ['href'], "url": ["href"],
'modified': 'updated', "modified": "updated",
'modified_parsed': 'updated_parsed', "modified_parsed": "updated_parsed",
'issued': 'published', "issued": "published",
'issued_parsed': 'published_parsed', "issued_parsed": "published_parsed",
'copyright': 'rights', "copyright": "rights",
'copyright_detail': 'rights_detail', "copyright_detail": "rights_detail",
'tagline': 'subtitle', "tagline": "subtitle",
'tagline_detail': 'subtitle_detail', "tagline_detail": "subtitle_detail",
} }
def __getitem__(self, key, _stacklevel=2): def __getitem__(self, key, _stacklevel=2):
@ -53,28 +53,29 @@ class FeedParserDict(dict):
:return: A :class:`FeedParserDict`. :return: A :class:`FeedParserDict`.
""" """
if key == 'category': if key == "category":
try: try:
return dict.__getitem__(self, 'tags')[0]['term'] return dict.__getitem__(self, "tags")[0]["term"]
except IndexError: except IndexError:
raise KeyError("object doesn't have key 'category'") raise KeyError("object doesn't have key 'category'")
elif key == 'enclosures': elif key == "enclosures":
return [ return [
FeedParserDict([(name, value) for (name, value) in link.items() if name != 'rel']) FeedParserDict(
for link in dict.__getitem__(self, 'links') [(name, value) for (name, value) in link.items() if name != "rel"]
if link['rel'] == 'enclosure' )
for link in dict.__getitem__(self, "links")
if link["rel"] == "enclosure"
] ]
elif key == 'license': elif key == "license":
for link in dict.__getitem__(self, 'links'): for link in dict.__getitem__(self, "links"):
if link['rel'] == 'license' and 'href' in link: if link["rel"] == "license" and "href" in link:
return link['href'] return link["href"]
elif key == 'updated': elif key == "updated":
# Temporarily help developers out by keeping the old # Temporarily help developers out by keeping the old
# broken behavior that was reported in issue 310. # broken behavior that was reported in issue 310.
# This fix was proposed in issue 328. # This fix was proposed in issue 328.
if ( if not dict.__contains__(self, "updated") and dict.__contains__(
not dict.__contains__(self, 'updated') self, "published"
and dict.__contains__(self, 'published')
): ):
warnings.warn( warnings.warn(
"To avoid breaking existing software while " "To avoid breaking existing software while "
@ -85,12 +86,11 @@ class FeedParserDict(dict):
DeprecationWarning, DeprecationWarning,
stacklevel=_stacklevel, stacklevel=_stacklevel,
) )
return dict.__getitem__(self, 'published') return dict.__getitem__(self, "published")
return dict.__getitem__(self, 'updated') return dict.__getitem__(self, "updated")
elif key == 'updated_parsed': elif key == "updated_parsed":
if ( if not dict.__contains__(self, "updated_parsed") and dict.__contains__(
not dict.__contains__(self, 'updated_parsed') self, "published_parsed"
and dict.__contains__(self, 'published_parsed')
): ):
warnings.warn( warnings.warn(
"To avoid breaking existing software while " "To avoid breaking existing software while "
@ -101,8 +101,8 @@ class FeedParserDict(dict):
DeprecationWarning, DeprecationWarning,
stacklevel=_stacklevel, stacklevel=_stacklevel,
) )
return dict.__getitem__(self, 'published_parsed') return dict.__getitem__(self, "published_parsed")
return dict.__getitem__(self, 'updated_parsed') return dict.__getitem__(self, "updated_parsed")
else: else:
realkey = self.keymap.get(key, key) realkey = self.keymap.get(key, key)
if isinstance(realkey, list): if isinstance(realkey, list):
@ -114,7 +114,7 @@ class FeedParserDict(dict):
return dict.__getitem__(self, key) return dict.__getitem__(self, key)
def __contains__(self, key): def __contains__(self, key):
if key in ('updated', 'updated_parsed'): if key in ("updated", "updated_parsed"):
# Temporarily help developers out by keeping the old # Temporarily help developers out by keeping the old
# broken behavior that was reported in issue 310. # broken behavior that was reported in issue 310.
# This fix was proposed in issue 328. # This fix was proposed in issue 328.