mirror of
https://github.com/SickGear/SickGear.git
synced 2024-11-28 07:33:38 +00:00
Merge branch 'feature/UpdateFeedparser' into dev
This commit is contained in:
commit
b9cfd96e57
32 changed files with 391 additions and 193 deletions
|
@ -7,6 +7,7 @@
|
||||||
* Update Msgpack 1.0.0 (fa7d744) to 1.0.4 (b5acfd5)
|
* Update Msgpack 1.0.0 (fa7d744) to 1.0.4 (b5acfd5)
|
||||||
* Update certifi 2022.09.24 to 2022.12.07
|
* Update certifi 2022.09.24 to 2022.12.07
|
||||||
* Update diskcache 5.1.0 (40ce0de) to 5.4.0 (1cb1425)
|
* Update diskcache 5.1.0 (40ce0de) to 5.4.0 (1cb1425)
|
||||||
|
* Update feedparser 6.0.1 (98d189fa) to 6.0.10 (5fcb3ae)
|
||||||
* Update humanize 3.5.0 (b6b0ea5) to 4.0.0 (a1514eb)
|
* Update humanize 3.5.0 (b6b0ea5) to 4.0.0 (a1514eb)
|
||||||
* Update profilehooks module 1.12.0 (3ee1f60) to 1.12.1 (c3fc078)
|
* Update profilehooks module 1.12.0 (3ee1f60) to 1.12.1 (c3fc078)
|
||||||
* Update Rarfile 4.0 (55fe778) to 4.1a1 (8a72967)
|
* Update Rarfile 4.0 (55fe778) to 4.1a1 (8a72967)
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
@ -32,7 +32,7 @@ from .util import FeedParserDict
|
||||||
|
|
||||||
__author__ = 'Kurt McKee <contactme@kurtmckee.org>'
|
__author__ = 'Kurt McKee <contactme@kurtmckee.org>'
|
||||||
__license__ = 'BSD 2-clause'
|
__license__ = 'BSD 2-clause'
|
||||||
__version__ = '6.0.1'
|
__version__ = '6.0.10'
|
||||||
|
|
||||||
# HTTP "User-Agent" header to send to servers when downloading feeds.
|
# HTTP "User-Agent" header to send to servers when downloading feeds.
|
||||||
# If you are embedding feedparser in a larger application, you should
|
# If you are embedding feedparser in a larger application, you should
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# The public API for feedparser
|
# The public API for feedparser
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
@ -26,7 +26,11 @@
|
||||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
# POSSIBILITY OF SUCH DAMAGE.
|
# POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
import datetime
|
||||||
import io
|
import io
|
||||||
|
import time
|
||||||
|
from typing import Dict, List, Union
|
||||||
|
import urllib.error
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
import xml.sax
|
import xml.sax
|
||||||
|
|
||||||
|
@ -34,13 +38,12 @@ import sgmllib3k as sgmllib
|
||||||
|
|
||||||
from .datetimes import registerDateHandler, _parse_date
|
from .datetimes import registerDateHandler, _parse_date
|
||||||
from .encodings import convert_to_utf8
|
from .encodings import convert_to_utf8
|
||||||
from .exceptions import *
|
from .html import BaseHTMLProcessor
|
||||||
from .html import _BaseHTMLProcessor
|
|
||||||
from . import http
|
from . import http
|
||||||
from . import mixin
|
from .mixin import XMLParserMixin
|
||||||
from .mixin import _FeedParserMixin
|
from .parsers.loose import LooseXMLParser
|
||||||
from .parsers.loose import _LooseFeedParser
|
from .parsers.strict import StrictXMLParser
|
||||||
from .parsers.strict import _StrictFeedParser
|
from .parsers.json import JSONParser
|
||||||
from .sanitizer import replace_doctype
|
from .sanitizer import replace_doctype
|
||||||
from .urls import convert_to_idn, make_safe_absolute_uri
|
from .urls import convert_to_idn, make_safe_absolute_uri
|
||||||
from .util import FeedParserDict
|
from .util import FeedParserDict
|
||||||
|
@ -70,6 +73,7 @@ SUPPORTED_VERSIONS = {
|
||||||
'atom10': 'Atom 1.0',
|
'atom10': 'Atom 1.0',
|
||||||
'atom': 'Atom (unknown version)',
|
'atom': 'Atom (unknown version)',
|
||||||
'cdf': 'CDF',
|
'cdf': 'CDF',
|
||||||
|
'json1': 'JSON feed 1',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -136,20 +140,25 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
|
||||||
return url_file_stream_or_string
|
return url_file_stream_or_string
|
||||||
|
|
||||||
|
|
||||||
LooseFeedParser = type(
|
class LooseFeedParser(LooseXMLParser, XMLParserMixin, BaseHTMLProcessor):
|
||||||
'LooseFeedParser',
|
pass
|
||||||
(_LooseFeedParser, _FeedParserMixin, _BaseHTMLProcessor, object),
|
|
||||||
{},
|
|
||||||
)
|
|
||||||
|
|
||||||
StrictFeedParser = type(
|
class StrictFeedParser(StrictXMLParser, XMLParserMixin, xml.sax.handler.ContentHandler):
|
||||||
'StrictFeedParser',
|
pass
|
||||||
(_StrictFeedParser, _FeedParserMixin, xml.sax.handler.ContentHandler, object),
|
|
||||||
{},
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None, resolve_relative_uris=None, sanitize_html=None):
|
def parse(
|
||||||
|
url_file_stream_or_string,
|
||||||
|
etag: str = None,
|
||||||
|
modified: Union[str, datetime.datetime, time.struct_time] = None,
|
||||||
|
agent: str = None,
|
||||||
|
referrer: str = None,
|
||||||
|
handlers: List = None,
|
||||||
|
request_headers: Dict[str, str] = None,
|
||||||
|
response_headers: Dict[str, str] = None,
|
||||||
|
resolve_relative_uris: bool = None,
|
||||||
|
sanitize_html: bool = None,
|
||||||
|
) -> FeedParserDict:
|
||||||
"""Parse a feed from a URL, file, stream, or string.
|
"""Parse a feed from a URL, file, stream, or string.
|
||||||
|
|
||||||
:param url_file_stream_or_string:
|
:param url_file_stream_or_string:
|
||||||
|
@ -165,45 +174,46 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
|
||||||
When a URL is not passed the feed location to use in relative URL
|
When a URL is not passed the feed location to use in relative URL
|
||||||
resolution should be passed in the ``Content-Location`` response header
|
resolution should be passed in the ``Content-Location`` response header
|
||||||
(see ``response_headers`` below).
|
(see ``response_headers`` below).
|
||||||
|
:param etag:
|
||||||
:param str etag: HTTP ``ETag`` request header.
|
HTTP ``ETag`` request header.
|
||||||
:param modified: HTTP ``Last-Modified`` request header.
|
:param modified:
|
||||||
:type modified: :class:`str`, :class:`time.struct_time` 9-tuple, or
|
HTTP ``Last-Modified`` request header.
|
||||||
:class:`datetime.datetime`
|
:param agent:
|
||||||
:param str agent: HTTP ``User-Agent`` request header, which defaults to
|
HTTP ``User-Agent`` request header, which defaults to
|
||||||
the value of :data:`feedparser.USER_AGENT`.
|
the value of :data:`feedparser.USER_AGENT`.
|
||||||
:param referrer: HTTP ``Referer`` [sic] request header.
|
:param referrer:
|
||||||
|
HTTP ``Referer`` [sic] request header.
|
||||||
|
:param handlers:
|
||||||
|
A list of handlers that will be passed to urllib2.
|
||||||
:param request_headers:
|
:param request_headers:
|
||||||
A mapping of HTTP header name to HTTP header value to add to the
|
A mapping of HTTP header name to HTTP header value to add to the
|
||||||
request, overriding internally generated values.
|
request, overriding internally generated values.
|
||||||
:type request_headers: :class:`dict` mapping :class:`str` to :class:`str`
|
|
||||||
:param response_headers:
|
:param response_headers:
|
||||||
A mapping of HTTP header name to HTTP header value. Multiple values may
|
A mapping of HTTP header name to HTTP header value. Multiple values may
|
||||||
be joined with a comma. If a HTTP request was made, these headers
|
be joined with a comma. If a HTTP request was made, these headers
|
||||||
override any matching headers in the response. Otherwise this specifies
|
override any matching headers in the response. Otherwise this specifies
|
||||||
the entirety of the response headers.
|
the entirety of the response headers.
|
||||||
:type response_headers: :class:`dict` mapping :class:`str` to :class:`str`
|
:param resolve_relative_uris:
|
||||||
|
|
||||||
:param bool resolve_relative_uris:
|
|
||||||
Should feedparser attempt to resolve relative URIs absolute ones within
|
Should feedparser attempt to resolve relative URIs absolute ones within
|
||||||
HTML content? Defaults to the value of
|
HTML content? Defaults to the value of
|
||||||
:data:`feedparser.RESOLVE_RELATIVE_URIS`, which is ``True``.
|
:data:`feedparser.RESOLVE_RELATIVE_URIS`, which is ``True``.
|
||||||
:param bool sanitize_html:
|
:param sanitize_html:
|
||||||
Should feedparser skip HTML sanitization? Only disable this if you know
|
Should feedparser skip HTML sanitization? Only disable this if you know
|
||||||
what you are doing! Defaults to the value of
|
what you are doing! Defaults to the value of
|
||||||
:data:`feedparser.SANITIZE_HTML`, which is ``True``.
|
:data:`feedparser.SANITIZE_HTML`, which is ``True``.
|
||||||
|
|
||||||
:return: A :class:`FeedParserDict`.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if not agent or sanitize_html is None or resolve_relative_uris is None:
|
# Avoid a cyclic import.
|
||||||
import feedparser
|
|
||||||
if not agent:
|
if not agent:
|
||||||
|
import feedparser
|
||||||
agent = feedparser.USER_AGENT
|
agent = feedparser.USER_AGENT
|
||||||
if sanitize_html is None:
|
if sanitize_html is None:
|
||||||
sanitize_html = feedparser.SANITIZE_HTML
|
import feedparser
|
||||||
|
sanitize_html = bool(feedparser.SANITIZE_HTML)
|
||||||
if resolve_relative_uris is None:
|
if resolve_relative_uris is None:
|
||||||
resolve_relative_uris = feedparser.RESOLVE_RELATIVE_URIS
|
import feedparser
|
||||||
|
resolve_relative_uris = bool(feedparser.RESOLVE_RELATIVE_URIS)
|
||||||
|
|
||||||
result = FeedParserDict(
|
result = FeedParserDict(
|
||||||
bozo=False,
|
bozo=False,
|
||||||
|
@ -212,7 +222,14 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
|
||||||
headers={},
|
headers={},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
|
data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
|
||||||
|
except urllib.error.URLError as error:
|
||||||
|
result.update({
|
||||||
|
'bozo': True,
|
||||||
|
'bozo_exception': error,
|
||||||
|
})
|
||||||
|
return result
|
||||||
|
|
||||||
if not data:
|
if not data:
|
||||||
return result
|
return result
|
||||||
|
@ -221,8 +238,10 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
|
||||||
result['headers'].update(response_headers or {})
|
result['headers'].update(response_headers or {})
|
||||||
|
|
||||||
data = convert_to_utf8(result['headers'], data, result)
|
data = convert_to_utf8(result['headers'], data, result)
|
||||||
|
use_json_parser = result['content-type'] == 'application/json'
|
||||||
use_strict_parser = result['encoding'] and True or False
|
use_strict_parser = result['encoding'] and True or False
|
||||||
|
|
||||||
|
if not use_json_parser:
|
||||||
result['version'], data, entities = replace_doctype(data)
|
result['version'], data, entities = replace_doctype(data)
|
||||||
|
|
||||||
# Ensure that baseuri is an absolute URI using an acceptable URI scheme.
|
# Ensure that baseuri is an absolute URI using an acceptable URI scheme.
|
||||||
|
@ -235,36 +254,52 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
|
||||||
baselang = baselang.decode('utf-8', 'ignore')
|
baselang = baselang.decode('utf-8', 'ignore')
|
||||||
|
|
||||||
if not _XML_AVAILABLE:
|
if not _XML_AVAILABLE:
|
||||||
use_strict_parser = 0
|
use_strict_parser = False
|
||||||
if use_strict_parser:
|
feed_parser: Union[JSONParser, StrictFeedParser, LooseFeedParser]
|
||||||
# initialize the SAX parser
|
if use_json_parser:
|
||||||
feedparser = StrictFeedParser(baseuri, baselang, 'utf-8')
|
result['version'] = None
|
||||||
feedparser.resolve_relative_uris = resolve_relative_uris
|
feed_parser = JSONParser(baseuri, baselang, 'utf-8')
|
||||||
feedparser.sanitize_html = sanitize_html
|
try:
|
||||||
|
feed_parser.feed(data)
|
||||||
|
except Exception as e:
|
||||||
|
result['bozo'] = 1
|
||||||
|
result['bozo_exception'] = e
|
||||||
|
elif use_strict_parser:
|
||||||
|
# Initialize the SAX parser.
|
||||||
|
feed_parser = StrictFeedParser(baseuri, baselang, 'utf-8')
|
||||||
|
feed_parser.resolve_relative_uris = resolve_relative_uris
|
||||||
|
feed_parser.sanitize_html = sanitize_html
|
||||||
saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
|
saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
|
||||||
saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
|
saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
|
||||||
try:
|
try:
|
||||||
# disable downloading external doctype references, if possible
|
# Disable downloading external doctype references, if possible.
|
||||||
saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
|
saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
|
||||||
except xml.sax.SAXNotSupportedException:
|
except xml.sax.SAXNotSupportedException:
|
||||||
pass
|
pass
|
||||||
saxparser.setContentHandler(feedparser)
|
saxparser.setContentHandler(feed_parser)
|
||||||
saxparser.setErrorHandler(feedparser)
|
saxparser.setErrorHandler(feed_parser)
|
||||||
source = xml.sax.xmlreader.InputSource()
|
source = xml.sax.xmlreader.InputSource()
|
||||||
source.setByteStream(io.BytesIO(data))
|
source.setByteStream(io.BytesIO(data))
|
||||||
try:
|
try:
|
||||||
saxparser.parse(source)
|
saxparser.parse(source)
|
||||||
except xml.sax.SAXException as e:
|
except xml.sax.SAXException as e:
|
||||||
result['bozo'] = 1
|
result['bozo'] = 1
|
||||||
result['bozo_exception'] = feedparser.exc or e
|
result['bozo_exception'] = feed_parser.exc or e
|
||||||
use_strict_parser = 0
|
use_strict_parser = False
|
||||||
if not use_strict_parser:
|
|
||||||
feedparser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
|
# The loose XML parser will be tried if the JSON parser was not used,
|
||||||
feedparser.resolve_relative_uris = resolve_relative_uris
|
# and if the strict XML parser was not used (or if it failed).
|
||||||
feedparser.sanitize_html = sanitize_html
|
if not use_json_parser and not use_strict_parser:
|
||||||
feedparser.feed(data.decode('utf-8', 'replace'))
|
feed_parser = LooseFeedParser(baseuri, baselang, 'utf-8', entities)
|
||||||
result['feed'] = feedparser.feeddata
|
feed_parser.resolve_relative_uris = resolve_relative_uris
|
||||||
result['entries'] = feedparser.entries
|
feed_parser.sanitize_html = sanitize_html
|
||||||
result['version'] = result['version'] or feedparser.version
|
feed_parser.feed(data.decode('utf-8', 'replace'))
|
||||||
result['namespaces'] = feedparser.namespaces_in_use
|
|
||||||
|
result['feed'] = feed_parser.feeddata
|
||||||
|
result['entries'] = feed_parser.entries
|
||||||
|
result['version'] = result['version'] or feed_parser.version
|
||||||
|
if isinstance(feed_parser, JSONParser):
|
||||||
|
result['namespaces'] = {}
|
||||||
|
else:
|
||||||
|
result['namespaces'] = feed_parser.namespaces_in_use
|
||||||
return result
|
return result
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
@ -25,6 +25,8 @@
|
||||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
# POSSIBILITY OF SUCH DAMAGE.
|
# POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
from time import struct_time
|
||||||
|
from typing import Callable, List, Optional
|
||||||
from .asctime import _parse_date_asctime
|
from .asctime import _parse_date_asctime
|
||||||
from .greek import _parse_date_greek
|
from .greek import _parse_date_greek
|
||||||
from .hungarian import _parse_date_hungarian
|
from .hungarian import _parse_date_hungarian
|
||||||
|
@ -34,7 +36,7 @@ from .perforce import _parse_date_perforce
|
||||||
from .rfc822 import _parse_date_rfc822
|
from .rfc822 import _parse_date_rfc822
|
||||||
from .w3dtf import _parse_date_w3dtf
|
from .w3dtf import _parse_date_w3dtf
|
||||||
|
|
||||||
_date_handlers = []
|
_date_handlers: List[Callable[[str], Optional[struct_time]]] = []
|
||||||
|
|
||||||
|
|
||||||
def registerDateHandler(func):
|
def registerDateHandler(func):
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
@ -68,15 +68,7 @@ _iso8601_re = [
|
||||||
+ r'(\.(?P<fracsecond>\d+))?'
|
+ r'(\.(?P<fracsecond>\d+))?'
|
||||||
+ r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
|
+ r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
|
||||||
for tmpl in _iso8601_tmpl]
|
for tmpl in _iso8601_tmpl]
|
||||||
try:
|
|
||||||
del tmpl
|
|
||||||
except NameError:
|
|
||||||
pass
|
|
||||||
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
|
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
|
||||||
try:
|
|
||||||
del regex
|
|
||||||
except NameError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_date_iso8601(date_string):
|
def _parse_date_iso8601(date_string):
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
@ -25,7 +25,7 @@
|
||||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
# POSSIBILITY OF SUCH DAMAGE.
|
# POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
import email._parseaddr
|
import email.utils
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
@ -41,6 +41,6 @@ def _parse_date_perforce(date_string):
|
||||||
dow, year, month, day, hour, minute, second, tz = m.groups()
|
dow, year, month, day, hour, minute, second, tz = m.groups()
|
||||||
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
|
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
|
||||||
new_date_string = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
|
new_date_string = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
|
||||||
tm = email._parseaddr.parsedate_tz(new_date_string)
|
tm = email.utils.parsedate_tz(new_date_string)
|
||||||
if tm:
|
if tm:
|
||||||
return time.gmtime(email._parseaddr.mktime_tz(tm))
|
return time.gmtime(email.utils.mktime_tz(tm))
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# Character encoding routines
|
# Character encoding routines
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
@ -26,17 +26,16 @@
|
||||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
# POSSIBILITY OF SUCH DAMAGE.
|
# POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
import cgi
|
|
||||||
import codecs
|
import codecs
|
||||||
import re
|
import re
|
||||||
|
import typing as t
|
||||||
|
|
||||||
try:
|
try:
|
||||||
try:
|
try:
|
||||||
import cchardet as chardet
|
import cchardet as chardet # type: ignore[import]
|
||||||
except ImportError:
|
except ImportError:
|
||||||
import chardet
|
import chardet # type: ignore[no-redef]
|
||||||
except ImportError:
|
except ImportError:
|
||||||
chardet = None
|
|
||||||
lazy_chardet_encoding = None
|
lazy_chardet_encoding = None
|
||||||
else:
|
else:
|
||||||
def lazy_chardet_encoding(data):
|
def lazy_chardet_encoding(data):
|
||||||
|
@ -68,6 +67,30 @@ RE_XML_DECLARATION = re.compile(r'^<\?xml[^>]*?>')
|
||||||
RE_XML_PI_ENCODING = re.compile(br'^<\?.*encoding=[\'"](.*?)[\'"].*\?>')
|
RE_XML_PI_ENCODING = re.compile(br'^<\?.*encoding=[\'"](.*?)[\'"].*\?>')
|
||||||
|
|
||||||
|
|
||||||
|
def parse_content_type(line: str) -> t.Tuple[str, str]:
|
||||||
|
"""Parse an HTTP Content-Type header.
|
||||||
|
|
||||||
|
The return value will be a tuple of strings:
|
||||||
|
the MIME type, and the value of the "charset" (if any).
|
||||||
|
|
||||||
|
This is a custom replacement for Python's cgi.parse_header().
|
||||||
|
The cgi module will be removed in Python 3.13.
|
||||||
|
"""
|
||||||
|
|
||||||
|
chunks = line.split(";")
|
||||||
|
if not chunks:
|
||||||
|
return "", ""
|
||||||
|
|
||||||
|
mime_type = chunks[0].strip()
|
||||||
|
charset_value = ""
|
||||||
|
for chunk in chunks[1:]:
|
||||||
|
key, _, value = chunk.partition("=")
|
||||||
|
if key.strip().lower() == "charset":
|
||||||
|
charset_value = value.strip().strip("\"'")
|
||||||
|
|
||||||
|
return mime_type, charset_value
|
||||||
|
|
||||||
|
|
||||||
def convert_to_utf8(http_headers, data, result):
|
def convert_to_utf8(http_headers, data, result):
|
||||||
"""Detect and convert the character encoding to UTF-8.
|
"""Detect and convert the character encoding to UTF-8.
|
||||||
|
|
||||||
|
@ -156,10 +179,7 @@ def convert_to_utf8(http_headers, data, result):
|
||||||
try:
|
try:
|
||||||
if bom_encoding:
|
if bom_encoding:
|
||||||
tempdata = data.decode(bom_encoding).encode('utf-8')
|
tempdata = data.decode(bom_encoding).encode('utf-8')
|
||||||
except (UnicodeDecodeError, LookupError):
|
except UnicodeDecodeError:
|
||||||
# feedparser recognizes UTF-32 encodings that aren't
|
|
||||||
# available in Python 2.4 and 2.5, so it's possible to
|
|
||||||
# encounter a LookupError during decoding.
|
|
||||||
xml_encoding_match = None
|
xml_encoding_match = None
|
||||||
else:
|
else:
|
||||||
xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
|
xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
|
||||||
|
@ -181,15 +201,14 @@ def convert_to_utf8(http_headers, data, result):
|
||||||
# XML declaration encoding, and HTTP encoding, following the
|
# XML declaration encoding, and HTTP encoding, following the
|
||||||
# heuristic defined in RFC 3023.
|
# heuristic defined in RFC 3023.
|
||||||
http_content_type = http_headers.get('content-type') or ''
|
http_content_type = http_headers.get('content-type') or ''
|
||||||
http_content_type, params = cgi.parse_header(http_content_type)
|
http_content_type, http_encoding = parse_content_type(http_content_type)
|
||||||
http_encoding = params.get('charset', '').replace("'", "")
|
|
||||||
if isinstance(http_encoding, bytes):
|
|
||||||
http_encoding = http_encoding.decode('utf-8', 'ignore')
|
|
||||||
|
|
||||||
acceptable_content_type = 0
|
acceptable_content_type = 0
|
||||||
application_content_types = ('application/xml', 'application/xml-dtd',
|
application_content_types = ('application/xml', 'application/xml-dtd',
|
||||||
'application/xml-external-parsed-entity')
|
'application/xml-external-parsed-entity')
|
||||||
text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
|
text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
|
||||||
|
json_content_types = ('application/feed+json', 'application/json')
|
||||||
|
json = False
|
||||||
if (
|
if (
|
||||||
http_content_type in application_content_types
|
http_content_type in application_content_types
|
||||||
or (
|
or (
|
||||||
|
@ -208,6 +227,17 @@ def convert_to_utf8(http_headers, data, result):
|
||||||
):
|
):
|
||||||
acceptable_content_type = 1
|
acceptable_content_type = 1
|
||||||
rfc3023_encoding = http_encoding or 'us-ascii'
|
rfc3023_encoding = http_encoding or 'us-ascii'
|
||||||
|
elif (
|
||||||
|
http_content_type in json_content_types
|
||||||
|
or (
|
||||||
|
not http_content_type
|
||||||
|
and data and data.lstrip()[0] == '{'
|
||||||
|
)
|
||||||
|
):
|
||||||
|
http_content_type = json_content_types[0]
|
||||||
|
acceptable_content_type = 1
|
||||||
|
json = True
|
||||||
|
rfc3023_encoding = http_encoding or 'utf-8' # RFC 7159, 8.1.
|
||||||
elif http_content_type.startswith('text/'):
|
elif http_content_type.startswith('text/'):
|
||||||
rfc3023_encoding = http_encoding or 'us-ascii'
|
rfc3023_encoding = http_encoding or 'us-ascii'
|
||||||
elif http_headers and 'content-type' not in http_headers:
|
elif http_headers and 'content-type' not in http_headers:
|
||||||
|
@ -230,7 +260,7 @@ def convert_to_utf8(http_headers, data, result):
|
||||||
|
|
||||||
if http_headers and (not acceptable_content_type):
|
if http_headers and (not acceptable_content_type):
|
||||||
if 'content-type' in http_headers:
|
if 'content-type' in http_headers:
|
||||||
msg = '%s is not an XML media type' % http_headers['content-type']
|
msg = '%s is not an accepted media type' % http_headers['content-type']
|
||||||
else:
|
else:
|
||||||
msg = 'no Content-type specified'
|
msg = 'no Content-type specified'
|
||||||
error = NonXMLContentType(msg)
|
error = NonXMLContentType(msg)
|
||||||
|
@ -254,6 +284,7 @@ def convert_to_utf8(http_headers, data, result):
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
known_encoding = 1
|
known_encoding = 1
|
||||||
|
if not json:
|
||||||
# Update the encoding in the opening XML processing instruction.
|
# Update the encoding in the opening XML processing instruction.
|
||||||
new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
|
new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
|
||||||
if RE_XML_DECLARATION.search(data):
|
if RE_XML_DECLARATION.search(data):
|
||||||
|
@ -275,6 +306,7 @@ def convert_to_utf8(http_headers, data, result):
|
||||||
(rfc3023_encoding, proposed_encoding))
|
(rfc3023_encoding, proposed_encoding))
|
||||||
rfc3023_encoding = proposed_encoding
|
rfc3023_encoding = proposed_encoding
|
||||||
|
|
||||||
|
result['content-type'] = http_content_type # for selecting the parser
|
||||||
result['encoding'] = rfc3023_encoding
|
result['encoding'] = rfc3023_encoding
|
||||||
if error:
|
if error:
|
||||||
result['bozo'] = True
|
result['bozo'] = True
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# Exceptions used throughout feedparser
|
# Exceptions used throughout feedparser
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
@ -27,7 +27,7 @@
|
||||||
# POSSIBILITY OF SUCH DAMAGE.
|
# POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'ThingsNobodyCaresAboutButMe',
|
'FeedparserError',
|
||||||
'CharacterEncodingOverride',
|
'CharacterEncodingOverride',
|
||||||
'CharacterEncodingUnknown',
|
'CharacterEncodingUnknown',
|
||||||
'NonXMLContentType',
|
'NonXMLContentType',
|
||||||
|
@ -35,19 +35,19 @@ __all__ = [
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
class ThingsNobodyCaresAboutButMe(Exception):
|
class FeedparserError(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe):
|
class CharacterEncodingOverride(FeedparserError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe):
|
class CharacterEncodingUnknown(FeedparserError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class NonXMLContentType(ThingsNobodyCaresAboutButMe):
|
class NonXMLContentType(FeedparserError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
@ -61,7 +61,7 @@ _cp1252 = {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
|
class BaseHTMLProcessor(sgmllib.SGMLParser):
|
||||||
special = re.compile("""[<>'"]""")
|
special = re.compile("""[<>'"]""")
|
||||||
bare_ampersand = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
|
bare_ampersand = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
|
||||||
elements_no_end_tag = {
|
elements_no_end_tag = {
|
||||||
|
@ -91,11 +91,11 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
|
||||||
self.encoding = encoding
|
self.encoding = encoding
|
||||||
self._type = _type
|
self._type = _type
|
||||||
self.pieces = []
|
self.pieces = []
|
||||||
super(_BaseHTMLProcessor, self).__init__()
|
super().__init__()
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
self.pieces = []
|
self.pieces = []
|
||||||
super(_BaseHTMLProcessor, self).reset()
|
super().reset()
|
||||||
|
|
||||||
def _shorttag_replace(self, match):
|
def _shorttag_replace(self, match):
|
||||||
"""
|
"""
|
||||||
|
@ -118,23 +118,13 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
# Replace goahead with SGMLParser's goahead() code object.
|
# Replace goahead with SGMLParser's goahead() code object.
|
||||||
try:
|
|
||||||
goahead.__code__ = sgmllib.SGMLParser.goahead.__code__
|
goahead.__code__ = sgmllib.SGMLParser.goahead.__code__
|
||||||
except AttributeError:
|
|
||||||
# Python 2
|
|
||||||
# noinspection PyUnresolvedReferences
|
|
||||||
goahead.func_code = sgmllib.SGMLParser.goahead.func_code
|
|
||||||
|
|
||||||
def __parse_starttag(self, i):
|
def __parse_starttag(self, i):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
# Replace __parse_starttag with SGMLParser's parse_starttag() code object.
|
# Replace __parse_starttag with SGMLParser's parse_starttag() code object.
|
||||||
try:
|
|
||||||
__parse_starttag.__code__ = sgmllib.SGMLParser.parse_starttag.__code__
|
__parse_starttag.__code__ = sgmllib.SGMLParser.parse_starttag.__code__
|
||||||
except AttributeError:
|
|
||||||
# Python 2
|
|
||||||
# noinspection PyUnresolvedReferences
|
|
||||||
__parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code
|
|
||||||
|
|
||||||
def parse_starttag(self, i):
|
def parse_starttag(self, i):
|
||||||
j = self.__parse_starttag(i)
|
j = self.__parse_starttag(i)
|
||||||
|
@ -153,8 +143,8 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
|
||||||
data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
|
data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
|
||||||
data = data.replace(''', "'")
|
data = data.replace(''', "'")
|
||||||
data = data.replace('"', '"')
|
data = data.replace('"', '"')
|
||||||
super(_BaseHTMLProcessor, self).feed(data)
|
super().feed(data)
|
||||||
super(_BaseHTMLProcessor, self).close()
|
super().close()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def normalize_attrs(attrs):
|
def normalize_attrs(attrs):
|
||||||
|
@ -315,8 +305,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
|
||||||
# self.updatepos(declstartpos, i)
|
# self.updatepos(declstartpos, i)
|
||||||
return None, -1
|
return None, -1
|
||||||
|
|
||||||
@staticmethod
|
def convert_charref(self, name):
|
||||||
def convert_charref(name):
|
|
||||||
"""
|
"""
|
||||||
:type name: str
|
:type name: str
|
||||||
:rtype: str
|
:rtype: str
|
||||||
|
@ -324,8 +313,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
|
||||||
|
|
||||||
return '&#%s;' % name
|
return '&#%s;' % name
|
||||||
|
|
||||||
@staticmethod
|
def convert_entityref(self, name):
|
||||||
def convert_entityref(name):
|
|
||||||
"""
|
"""
|
||||||
:type name: str
|
:type name: str
|
||||||
:rtype: str
|
:rtype: str
|
||||||
|
@ -349,7 +337,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser, object):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return sgmllib.SGMLParser.parse_declaration(self, i)
|
return sgmllib.SGMLParser.parse_declaration(self, i)
|
||||||
except sgmllib.SGMLParseError:
|
except (AssertionError, sgmllib.SGMLParseError):
|
||||||
# Escape the doctype declaration and continue parsing.
|
# Escape the doctype declaration and continue parsing.
|
||||||
self.handle_data('<')
|
self.handle_data('<')
|
||||||
return i+1
|
return i+1
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
@ -44,7 +44,7 @@ from .urls import convert_to_idn
|
||||||
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
|
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
|
||||||
|
|
||||||
|
|
||||||
class _FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPRedirectHandler, urllib.request.HTTPDefaultErrorHandler):
|
class URLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPRedirectHandler, urllib.request.HTTPDefaultErrorHandler):
|
||||||
def http_error_default(self, req, fp, code, msg, headers):
|
def http_error_default(self, req, fp, code, msg, headers):
|
||||||
# The default implementation just raises HTTPError.
|
# The default implementation just raises HTTPError.
|
||||||
# Forget that.
|
# Forget that.
|
||||||
|
@ -53,6 +53,8 @@ class _FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPR
|
||||||
|
|
||||||
def http_error_301(self, req, fp, code, msg, hdrs):
|
def http_error_301(self, req, fp, code, msg, hdrs):
|
||||||
result = urllib.request.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, hdrs)
|
result = urllib.request.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, hdrs)
|
||||||
|
if not result:
|
||||||
|
return fp
|
||||||
result.status = code
|
result.status = code
|
||||||
result.newurl = result.geturl()
|
result.newurl = result.geturl()
|
||||||
return result
|
return result
|
||||||
|
@ -78,7 +80,7 @@ class _FeedURLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPR
|
||||||
host = urllib.parse.urlparse(req.get_full_url())[1]
|
host = urllib.parse.urlparse(req.get_full_url())[1]
|
||||||
if 'Authorization' not in req.headers or 'WWW-Authenticate' not in headers:
|
if 'Authorization' not in req.headers or 'WWW-Authenticate' not in headers:
|
||||||
return self.http_error_default(req, fp, code, msg, headers)
|
return self.http_error_default(req, fp, code, msg, headers)
|
||||||
auth = base64.decodebytes(req.headers['Authorization'].split(' ')[1].encode('utf8'))
|
auth = base64.decodebytes(req.headers['Authorization'].split(' ')[1].encode()).decode()
|
||||||
user, passw = auth.split(':')
|
user, passw = auth.split(':')
|
||||||
realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
|
realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
|
||||||
self.add_password(realm, host, user, passw)
|
self.add_password(realm, host, user, passw)
|
||||||
|
@ -145,15 +147,26 @@ def get(url, etag=None, modified=None, agent=None, referrer=None, handlers=None,
|
||||||
if url_pieces.port:
|
if url_pieces.port:
|
||||||
new_pieces[1] = f'{url_pieces.hostname}:{url_pieces.port}'
|
new_pieces[1] = f'{url_pieces.hostname}:{url_pieces.port}'
|
||||||
url = urllib.parse.urlunparse(new_pieces)
|
url = urllib.parse.urlunparse(new_pieces)
|
||||||
auth = base64.standard_b64encode(f'{url_pieces.username}:{url_pieces.password}').strip()
|
auth = base64.standard_b64encode(f'{url_pieces.username}:{url_pieces.password}'.encode()).decode()
|
||||||
|
|
||||||
# iri support
|
# iri support
|
||||||
if not isinstance(url, bytes):
|
if not isinstance(url, bytes):
|
||||||
url = convert_to_idn(url)
|
url = convert_to_idn(url)
|
||||||
|
|
||||||
|
# Prevent UnicodeEncodeErrors caused by Unicode characters in the path.
|
||||||
|
bits = []
|
||||||
|
for c in url:
|
||||||
|
try:
|
||||||
|
c.encode('ascii')
|
||||||
|
except UnicodeEncodeError:
|
||||||
|
bits.append(urllib.parse.quote(c))
|
||||||
|
else:
|
||||||
|
bits.append(c)
|
||||||
|
url = ''.join(bits)
|
||||||
|
|
||||||
# try to open with urllib2 (to use optional headers)
|
# try to open with urllib2 (to use optional headers)
|
||||||
request = _build_urllib2_request(url, agent, ACCEPT_HEADER, etag, modified, referrer, auth, request_headers)
|
request = _build_urllib2_request(url, agent, ACCEPT_HEADER, etag, modified, referrer, auth, request_headers)
|
||||||
opener = urllib.request.build_opener(*tuple(handlers + [_FeedURLHandler()]))
|
opener = urllib.request.build_opener(*tuple(handlers + [URLHandler()]))
|
||||||
opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
|
opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
|
||||||
f = opener.open(request)
|
f = opener.open(request)
|
||||||
data = f.read()
|
data = f.read()
|
||||||
|
@ -203,7 +216,7 @@ def get(url, etag=None, modified=None, agent=None, referrer=None, handlers=None,
|
||||||
result['href'] = f.url.decode('utf-8', 'ignore')
|
result['href'] = f.url.decode('utf-8', 'ignore')
|
||||||
else:
|
else:
|
||||||
result['href'] = f.url
|
result['href'] = f.url
|
||||||
result['status'] = getattr(f, 'status', 200)
|
result['status'] = getattr(f, 'status', None) or 200
|
||||||
|
|
||||||
# Stop processing if the server sent HTTP 304 Not Modified.
|
# Stop processing if the server sent HTTP 304 Not Modified.
|
||||||
if getattr(f, 'code', 0) == 304:
|
if getattr(f, 'code', 0) == 304:
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
@ -30,16 +30,17 @@ import binascii
|
||||||
import copy
|
import copy
|
||||||
import html.entities
|
import html.entities
|
||||||
import re
|
import re
|
||||||
|
from typing import Dict
|
||||||
import xml.sax.saxutils
|
import xml.sax.saxutils
|
||||||
|
|
||||||
from .html import _cp1252
|
from .html import _cp1252
|
||||||
from .namespaces import _base, cc, dc, georss, itunes, mediarss, psc
|
from .namespaces import _base, cc, dc, georss, itunes, mediarss, psc
|
||||||
from .sanitizer import _sanitize_html, _HTMLSanitizer
|
from .sanitizer import sanitize_html, HTMLSanitizer
|
||||||
from .util import FeedParserDict
|
from .util import FeedParserDict
|
||||||
from .urls import _urljoin, make_safe_absolute_uri, resolve_relative_uris
|
from .urls import _urljoin, make_safe_absolute_uri, resolve_relative_uris
|
||||||
|
|
||||||
|
|
||||||
class _FeedParserMixin(
|
class XMLParserMixin(
|
||||||
_base.Namespace,
|
_base.Namespace,
|
||||||
cc.Namespace,
|
cc.Namespace,
|
||||||
dc.Namespace,
|
dc.Namespace,
|
||||||
|
@ -118,7 +119,7 @@ class _FeedParserMixin(
|
||||||
'http://www.w3.org/XML/1998/namespace': 'xml',
|
'http://www.w3.org/XML/1998/namespace': 'xml',
|
||||||
'http://podlove.org/simple-chapters': 'psc',
|
'http://podlove.org/simple-chapters': 'psc',
|
||||||
}
|
}
|
||||||
_matchnamespaces = {}
|
_matchnamespaces: Dict[str, str] = {}
|
||||||
|
|
||||||
can_be_relative_uri = {
|
can_be_relative_uri = {
|
||||||
'comments',
|
'comments',
|
||||||
|
@ -170,6 +171,8 @@ class _FeedParserMixin(
|
||||||
self.entries = [] # list of entry-level data
|
self.entries = [] # list of entry-level data
|
||||||
self.version = '' # feed type/version, see SUPPORTED_VERSIONS
|
self.version = '' # feed type/version, see SUPPORTED_VERSIONS
|
||||||
self.namespaces_in_use = {} # dictionary of namespaces defined by the feed
|
self.namespaces_in_use = {} # dictionary of namespaces defined by the feed
|
||||||
|
self.resolve_relative_uris = False
|
||||||
|
self.sanitize_html = False
|
||||||
|
|
||||||
# the following are used internally to track state;
|
# the following are used internally to track state;
|
||||||
# this is really out of control and should be refactored
|
# this is really out of control and should be refactored
|
||||||
|
@ -193,6 +196,7 @@ class _FeedParserMixin(
|
||||||
self.svgOK = 0
|
self.svgOK = 0
|
||||||
self.title_depth = -1
|
self.title_depth = -1
|
||||||
self.depth = 0
|
self.depth = 0
|
||||||
|
self.hasContent = 0
|
||||||
if self.lang:
|
if self.lang:
|
||||||
self.feeddata['language'] = self.lang.replace('_', '-')
|
self.feeddata['language'] = self.lang.replace('_', '-')
|
||||||
|
|
||||||
|
@ -204,7 +208,7 @@ class _FeedParserMixin(
|
||||||
# },
|
# },
|
||||||
# }
|
# }
|
||||||
self.property_depth_map = {}
|
self.property_depth_map = {}
|
||||||
super(_FeedParserMixin, self).__init__()
|
super(XMLParserMixin, self).__init__()
|
||||||
|
|
||||||
def _normalize_attributes(self, kv):
|
def _normalize_attributes(self, kv):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
@ -506,9 +510,7 @@ class _FeedParserMixin(
|
||||||
if base64 and self.contentparams.get('base64', 0):
|
if base64 and self.contentparams.get('base64', 0):
|
||||||
try:
|
try:
|
||||||
output = base64.decodebytes(output.encode('utf8')).decode('utf8')
|
output = base64.decodebytes(output.encode('utf8')).decode('utf8')
|
||||||
except binascii.Error:
|
except (binascii.Error, binascii.Incomplete, UnicodeDecodeError):
|
||||||
pass
|
|
||||||
except binascii.Incomplete:
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# resolve relative URIs
|
# resolve relative URIs
|
||||||
|
@ -546,7 +548,7 @@ class _FeedParserMixin(
|
||||||
# sanitize embedded markup
|
# sanitize embedded markup
|
||||||
if is_htmlish and self.sanitize_html:
|
if is_htmlish and self.sanitize_html:
|
||||||
if element in self.can_contain_dangerous_markup:
|
if element in self.can_contain_dangerous_markup:
|
||||||
output = _sanitize_html(output, self.encoding, self.contentparams.get('type', 'text/html'))
|
output = sanitize_html(output, self.encoding, self.contentparams.get('type', 'text/html'))
|
||||||
|
|
||||||
if self.encoding and isinstance(output, bytes):
|
if self.encoding and isinstance(output, bytes):
|
||||||
output = output.decode(self.encoding, 'ignore')
|
output = output.decode(self.encoding, 'ignore')
|
||||||
|
@ -648,7 +650,7 @@ class _FeedParserMixin(
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# all tags must be in a restricted subset of valid HTML tags
|
# all tags must be in a restricted subset of valid HTML tags
|
||||||
if any((t for t in re.findall(r'</?(\w+)', s) if t.lower() not in _HTMLSanitizer.acceptable_elements)):
|
if any((t for t in re.findall(r'</?(\w+)', s) if t.lower() not in HTMLSanitizer.acceptable_elements)):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# all entities must have been defined as valid HTML entities
|
# all entities must have been defined as valid HTML entities
|
||||||
|
@ -744,7 +746,7 @@ class _FeedParserMixin(
|
||||||
author, email = context.get(key), None
|
author, email = context.get(key), None
|
||||||
if not author:
|
if not author:
|
||||||
return
|
return
|
||||||
emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
|
emailmatch = re.search(r"(([a-zA-Z0-9_.+-]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(]?))(\?subject=\S+)?", author)
|
||||||
if emailmatch:
|
if emailmatch:
|
||||||
email = emailmatch.group(0)
|
email = emailmatch.group(0)
|
||||||
# probably a better way to do the following, but it passes
|
# probably a better way to do the following, but it passes
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# Support for the Atom, RSS, RDF, and CDF feed formats
|
# Support for the Atom, RSS, RDF, and CDF feed formats
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
@ -259,6 +259,7 @@ class Namespace(object):
|
||||||
def _end_item(self):
|
def _end_item(self):
|
||||||
self.pop('item')
|
self.pop('item')
|
||||||
self.inentry = 0
|
self.inentry = 0
|
||||||
|
self.hasContent = 0
|
||||||
_end_entry = _end_item
|
_end_entry = _end_item
|
||||||
|
|
||||||
def _start_language(self, attrs_d):
|
def _start_language(self, attrs_d):
|
||||||
|
@ -388,7 +389,7 @@ class Namespace(object):
|
||||||
|
|
||||||
def _start_description(self, attrs_d):
|
def _start_description(self, attrs_d):
|
||||||
context = self._get_context()
|
context = self._get_context()
|
||||||
if 'summary' in context:
|
if 'summary' in context and not self.hasContent:
|
||||||
self._summaryKey = 'content'
|
self._summaryKey = 'content'
|
||||||
self._start_content(attrs_d)
|
self._start_content(attrs_d)
|
||||||
else:
|
else:
|
||||||
|
@ -429,7 +430,7 @@ class Namespace(object):
|
||||||
|
|
||||||
def _start_summary(self, attrs_d):
|
def _start_summary(self, attrs_d):
|
||||||
context = self._get_context()
|
context = self._get_context()
|
||||||
if 'summary' in context:
|
if 'summary' in context and not self.hasContent:
|
||||||
self._summaryKey = 'content'
|
self._summaryKey = 'content'
|
||||||
self._start_content(attrs_d)
|
self._start_content(attrs_d)
|
||||||
else:
|
else:
|
||||||
|
@ -466,6 +467,7 @@ class Namespace(object):
|
||||||
self.sourcedata.clear()
|
self.sourcedata.clear()
|
||||||
|
|
||||||
def _start_content(self, attrs_d):
|
def _start_content(self, attrs_d):
|
||||||
|
self.hasContent = 1
|
||||||
self.push_content('content', attrs_d, 'text/plain', 1)
|
self.push_content('content', attrs_d, 'text/plain', 1)
|
||||||
src = attrs_d.get('src')
|
src = attrs_d.get('src')
|
||||||
if src:
|
if src:
|
||||||
|
@ -477,6 +479,7 @@ class Namespace(object):
|
||||||
_start_xhtml_body = _start_body
|
_start_xhtml_body = _start_body
|
||||||
|
|
||||||
def _start_content_encoded(self, attrs_d):
|
def _start_content_encoded(self, attrs_d):
|
||||||
|
self.hasContent = 1
|
||||||
self.push_content('content', attrs_d, 'text/html', 1)
|
self.push_content('content', attrs_d, 'text/html', 1)
|
||||||
_start_fullitem = _start_content_encoded
|
_start_fullitem = _start_content_encoded
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# Support for the administrative elements extension
|
# Support for the administrative elements extension
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# Support for the Creative Commons licensing extensions
|
# Support for the Creative Commons licensing extensions
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# Support for the Dublin Core metadata extensions
|
# Support for the Dublin Core metadata extensions
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# Support for the GeoRSS format
|
# Support for the GeoRSS format
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
@ -91,6 +91,8 @@ class Namespace(object):
|
||||||
except ValueError:
|
except ValueError:
|
||||||
srs_dimension = 2
|
srs_dimension = 2
|
||||||
context = self._get_context()
|
context = self._get_context()
|
||||||
|
if 'where' not in context:
|
||||||
|
context['where'] = {}
|
||||||
context['where']['srsName'] = srs_name
|
context['where']['srsName'] = srs_name
|
||||||
context['where']['srsDimension'] = srs_dimension
|
context['where']['srsDimension'] = srs_dimension
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# Support for the iTunes format
|
# Support for the iTunes format
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# Support for the Media RSS format
|
# Support for the Media RSS format
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# Support for the Podlove Simple Chapters format
|
# Support for the Podlove Simple Chapters format
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
|
133
lib/feedparser/parsers/json.py
Normal file
133
lib/feedparser/parsers/json.py
Normal file
|
@ -0,0 +1,133 @@
|
||||||
|
# The JSON feed parser
|
||||||
|
# Copyright 2017 Beat Bolli
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# This file is a part of feedparser.
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
# are permitted provided that the following conditions are met:
|
||||||
|
#
|
||||||
|
# * Redistributions of source code must retain the above copyright notice,
|
||||||
|
# this list of conditions and the following disclaimer.
|
||||||
|
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
# this list of conditions and the following disclaimer in the documentation
|
||||||
|
# and/or other materials provided with the distribution.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
|
||||||
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
# POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
from ..datetimes import _parse_date
|
||||||
|
from ..sanitizer import sanitize_html
|
||||||
|
from ..util import FeedParserDict
|
||||||
|
|
||||||
|
|
||||||
|
class JSONParser:
    """Parse a JSON Feed document into feedparser-style feed and entry dicts.

    After :meth:`feed` has run, ``version`` holds the short version tag,
    ``feeddata`` the feed-level values and ``entries`` one dict per item.
    """

    # Maps the JSON Feed ``version`` URL to the short tag stored in ``version``.
    VERSIONS = {
        'https://jsonfeed.org/version/1': 'json1',
        'https://jsonfeed.org/version/1.1': 'json11',
    }

    # (source key in the JSON document, destination key in ``feeddata``)
    FEED_FIELDS = (
        ('title', 'title'),
        ('icon', 'image'),
        ('home_page_url', 'link'),
        ('description', 'description'),
    )

    # (source key in a JSON item, destination key in the entry dict)
    ITEM_FIELDS = (
        ('title', 'title'),
        ('id', 'guid'),
        ('url', 'link'),
        ('summary', 'summary'),
        ('external_url', 'source'),
    )

    def __init__(self, baseuri=None, baselang=None, encoding=None):
        """Store the parsing context and initialise empty results.

        :param baseuri: base URI of the document; stored as ``''`` when falsy
        :param baselang: default language; stored as ``None`` when falsy
        :param encoding: character encoding; defaults to ``'utf-8'``
        """
        self.baseuri = baseuri or ''
        self.lang = baselang or None
        self.encoding = encoding or 'utf-8'  # character encoding

        self.version = None
        self.feeddata = FeedParserDict()
        self.namespacesInUse = []
        self.entries = []

    def feed(self, data):
        """Parse ``data`` and populate ``version``, ``feeddata`` and ``entries``.

        :param data: the serialized JSON document, passed to ``json.loads``
        :raises ValueError: if the ``version`` value is not in ``VERSIONS``
        :raises KeyError: if the document has no ``items`` key
        """
        data = json.loads(data)

        v = data.get('version', '')
        try:
            self.version = self.VERSIONS[v]
        except KeyError:
            raise ValueError("Unrecognized JSONFeed version '%s'" % v)

        for src, dst in self.FEED_FIELDS:
            if src in data:
                self.feeddata[dst] = data[src]
        if 'author' in data:
            self.parse_author(data['author'], self.feeddata)
        # TODO: hubs; expired has no RSS equivalent

        self.entries = [self.parse_entry(e) for e in data['items']]

    def parse_entry(self, e):
        """Convert one JSON Feed item into an entry dict.

        :param e: the decoded item mapping
        :returns: a ``FeedParserDict`` holding the mapped values
        """
        entry = FeedParserDict()
        for src, dst in self.ITEM_FIELDS:
            if src in e:
                entry[dst] = e[src]

        # Plain-text content takes precedence; HTML content is sanitized.
        if 'content_text' in e:
            entry['content'] = c = FeedParserDict()
            c['value'] = e['content_text']
            c['type'] = 'text'
        elif 'content_html' in e:
            entry['content'] = c = FeedParserDict()
            c['value'] = sanitize_html(e['content_html'], self.encoding, 'application/json')
            c['type'] = 'html'

        if 'date_published' in e:
            entry['published'] = e['date_published']
            entry['published_parsed'] = _parse_date(e['date_published'])
        # JSON Feed names this field "date_modified". Testing for
        # "date_updated" (as before) skipped conformant items and raised
        # KeyError when only "date_updated" was present.
        if 'date_modified' in e:
            entry['updated'] = e['date_modified']
            entry['updated_parsed'] = _parse_date(e['date_modified'])

        if 'tags' in e:
            entry['category'] = e['tags']

        if 'author' in e:
            self.parse_author(e['author'], entry)

        if 'attachments' in e:
            entry['enclosures'] = [self.parse_attachment(a) for a in e['attachments']]

        return entry

    @staticmethod
    def parse_author(parent, dest):
        """Copy author details from ``parent`` into ``dest``.

        ``dest['author_detail']`` is always created. ``dest['author']`` is set
        only when ``parent`` has a ``name``. A ``url`` starting with
        ``mailto:`` is stored as ``email`` without that prefix; any other
        ``url`` is stored as ``href``.

        :param parent: the decoded author mapping
        :param dest: the feed or entry dict to update in place
        """
        dest['author_detail'] = detail = FeedParserDict()
        if 'name' in parent:
            dest['author'] = detail['name'] = parent['name']
        if 'url' in parent:
            if parent['url'].startswith('mailto:'):
                detail['email'] = parent['url'][7:]  # strip the 'mailto:' prefix
            else:
                detail['href'] = parent['url']

    @staticmethod
    def parse_attachment(attachment):
        """Convert one attachment mapping into an enclosure dict.

        :param attachment: mapping that must contain ``url`` and ``mime_type``
        :returns: a ``FeedParserDict`` with ``href``, ``type`` and, when
            ``size_in_bytes`` is present, ``length``
        :raises KeyError: if ``url`` or ``mime_type`` is missing
        """
        enc = FeedParserDict()
        enc['href'] = attachment['url']
        enc['type'] = attachment['mime_type']
        if 'size_in_bytes' in attachment:
            enc['length'] = attachment['size_in_bytes']
        return enc
|
|
@ -1,5 +1,5 @@
|
||||||
# The loose feed parser that interfaces with an SGML parsing library
|
# The loose feed parser that interfaces with an SGML parsing library
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
@ -26,7 +26,7 @@
|
||||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
# POSSIBILITY OF SUCH DAMAGE.
|
# POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
class _LooseFeedParser(object):
|
class LooseXMLParser:
|
||||||
contentparams = None
|
contentparams = None
|
||||||
|
|
||||||
def __init__(self, baseuri=None, baselang=None, encoding=None, entities=None):
|
def __init__(self, baseuri=None, baselang=None, encoding=None, entities=None):
|
||||||
|
@ -34,7 +34,7 @@ class _LooseFeedParser(object):
|
||||||
self.lang = baselang or None
|
self.lang = baselang or None
|
||||||
self.encoding = encoding or 'utf-8' # character encoding
|
self.encoding = encoding or 'utf-8' # character encoding
|
||||||
self.entities = entities or {}
|
self.entities = entities or {}
|
||||||
super(_LooseFeedParser, self).__init__()
|
super().__init__()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _normalize_attributes(kv):
|
def _normalize_attributes(kv):
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# The strict feed parser that interfaces with an XML parsing library
|
# The strict feed parser that interfaces with an XML parsing library
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
@ -29,7 +29,7 @@
|
||||||
from ..exceptions import UndeclaredNamespace
|
from ..exceptions import UndeclaredNamespace
|
||||||
|
|
||||||
|
|
||||||
class _StrictFeedParser(object):
|
class StrictXMLParser:
|
||||||
def __init__(self, baseuri, baselang, encoding):
|
def __init__(self, baseuri, baselang, encoding):
|
||||||
self.bozo = 0
|
self.bozo = 0
|
||||||
self.exc = None
|
self.exc = None
|
||||||
|
@ -37,7 +37,7 @@ class _StrictFeedParser(object):
|
||||||
self.baseuri = baseuri or ''
|
self.baseuri = baseuri or ''
|
||||||
self.lang = baselang
|
self.lang = baselang
|
||||||
self.encoding = encoding
|
self.encoding = encoding
|
||||||
super(_StrictFeedParser, self).__init__()
|
super(StrictXMLParser, self).__init__()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _normalize_attributes(kv):
|
def _normalize_attributes(kv):
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
@ -27,11 +27,11 @@
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from .html import _BaseHTMLProcessor
|
from .html import BaseHTMLProcessor
|
||||||
from .urls import make_safe_absolute_uri
|
from .urls import make_safe_absolute_uri
|
||||||
|
|
||||||
|
|
||||||
class _HTMLSanitizer(_BaseHTMLProcessor):
|
class HTMLSanitizer(BaseHTMLProcessor):
|
||||||
acceptable_elements = {
|
acceptable_elements = {
|
||||||
'a',
|
'a',
|
||||||
'abbr',
|
'abbr',
|
||||||
|
@ -732,14 +732,14 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, encoding=None, _type='application/xhtml+xml'):
|
def __init__(self, encoding=None, _type='application/xhtml+xml'):
|
||||||
super(_HTMLSanitizer, self).__init__(encoding, _type)
|
super().__init__(encoding, _type)
|
||||||
|
|
||||||
self.unacceptablestack = 0
|
self.unacceptablestack = 0
|
||||||
self.mathmlOK = 0
|
self.mathmlOK = 0
|
||||||
self.svgOK = 0
|
self.svgOK = 0
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
super(_HTMLSanitizer, self).reset()
|
super().reset()
|
||||||
self.unacceptablestack = 0
|
self.unacceptablestack = 0
|
||||||
self.mathmlOK = 0
|
self.mathmlOK = 0
|
||||||
self.svgOK = 0
|
self.svgOK = 0
|
||||||
|
@ -805,7 +805,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
|
||||||
if key == 'href':
|
if key == 'href':
|
||||||
value = make_safe_absolute_uri(value)
|
value = make_safe_absolute_uri(value)
|
||||||
clean_attrs.append((key, value))
|
clean_attrs.append((key, value))
|
||||||
super(_HTMLSanitizer, self).unknown_starttag(tag, clean_attrs)
|
super().unknown_starttag(tag, clean_attrs)
|
||||||
|
|
||||||
def unknown_endtag(self, tag):
|
def unknown_endtag(self, tag):
|
||||||
if tag not in self.acceptable_elements:
|
if tag not in self.acceptable_elements:
|
||||||
|
@ -820,7 +820,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
|
||||||
self.svgOK -= 1
|
self.svgOK -= 1
|
||||||
else:
|
else:
|
||||||
return
|
return
|
||||||
super(_HTMLSanitizer, self).unknown_endtag(tag)
|
super().unknown_endtag(tag)
|
||||||
|
|
||||||
def handle_pi(self, text):
|
def handle_pi(self, text):
|
||||||
pass
|
pass
|
||||||
|
@ -830,7 +830,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
|
||||||
|
|
||||||
def handle_data(self, text):
|
def handle_data(self, text):
|
||||||
if not self.unacceptablestack:
|
if not self.unacceptablestack:
|
||||||
super(_HTMLSanitizer, self).handle_data(text)
|
super().handle_data(text)
|
||||||
|
|
||||||
def sanitize_style(self, style):
|
def sanitize_style(self, style):
|
||||||
# disallow urls
|
# disallow urls
|
||||||
|
@ -865,7 +865,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
|
||||||
return ' '.join(clean)
|
return ' '.join(clean)
|
||||||
|
|
||||||
def parse_comment(self, i, report=1):
|
def parse_comment(self, i, report=1):
|
||||||
ret = super(_HTMLSanitizer, self).parse_comment(i, report)
|
ret = super().parse_comment(i, report)
|
||||||
if ret >= 0:
|
if ret >= 0:
|
||||||
return ret
|
return ret
|
||||||
# if ret == -1, this may be a malicious attempt to circumvent
|
# if ret == -1, this may be a malicious attempt to circumvent
|
||||||
|
@ -877,8 +877,8 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
|
||||||
return len(self.rawdata)
|
return len(self.rawdata)
|
||||||
|
|
||||||
|
|
||||||
def _sanitize_html(html_source, encoding, _type):
|
def sanitize_html(html_source, encoding, _type):
|
||||||
p = _HTMLSanitizer(encoding, _type)
|
p = HTMLSanitizer(encoding, _type)
|
||||||
html_source = html_source.replace('<![CDATA[', '<![CDATA[')
|
html_source = html_source.replace('<![CDATA[', '<![CDATA[')
|
||||||
p.feed(html_source)
|
p.feed(html_source)
|
||||||
data = p.output()
|
data = p.output()
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
@ -27,7 +27,7 @@
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import sgmllib
|
import sgmllib # type: ignore[import]
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'sgmllib',
|
'sgmllib',
|
||||||
|
@ -82,7 +82,7 @@ class _EndBracketRegEx:
|
||||||
match = self.endbracket.match(target, index)
|
match = self.endbracket.match(target, index)
|
||||||
if match is not None:
|
if match is not None:
|
||||||
# Returning a new object in the calling thread's context
|
# Returning a new object in the calling thread's context
|
||||||
# resolves a thread-safety.
|
# resolves a thread-safety issue.
|
||||||
return EndBracketMatch(match)
|
return EndBracketMatch(match)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
@ -28,7 +28,7 @@
|
||||||
import re
|
import re
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
|
|
||||||
from .html import _BaseHTMLProcessor
|
from .html import BaseHTMLProcessor
|
||||||
|
|
||||||
# If you want feedparser to allow all URL schemes, set this to ()
|
# If you want feedparser to allow all URL schemes, set this to ()
|
||||||
# List culled from Python's urlparse documentation at:
|
# List culled from Python's urlparse documentation at:
|
||||||
|
@ -103,7 +103,7 @@ def make_safe_absolute_uri(base, rel=None):
|
||||||
return uri
|
return uri
|
||||||
|
|
||||||
|
|
||||||
class RelativeURIResolver(_BaseHTMLProcessor):
|
class RelativeURIResolver(BaseHTMLProcessor):
|
||||||
relative_uris = {
|
relative_uris = {
|
||||||
('a', 'href'),
|
('a', 'href'),
|
||||||
('applet', 'codebase'),
|
('applet', 'codebase'),
|
||||||
|
@ -137,7 +137,7 @@ class RelativeURIResolver(_BaseHTMLProcessor):
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, baseuri, encoding, _type):
|
def __init__(self, baseuri, encoding, _type):
|
||||||
_BaseHTMLProcessor.__init__(self, encoding, _type)
|
BaseHTMLProcessor.__init__(self, encoding, _type)
|
||||||
self.baseuri = baseuri
|
self.baseuri = baseuri
|
||||||
|
|
||||||
def resolve_uri(self, uri):
|
def resolve_uri(self, uri):
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Copyright 2010-2020 Kurt McKee <contactme@kurtmckee.org>
|
# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
|
||||||
# Copyright 2002-2008 Mark Pilgrim
|
# Copyright 2002-2008 Mark Pilgrim
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
|
@ -48,7 +48,7 @@ class FeedParserDict(dict):
|
||||||
'tagline_detail': 'subtitle_detail',
|
'tagline_detail': 'subtitle_detail',
|
||||||
}
|
}
|
||||||
|
|
||||||
def __getitem__(self, key):
|
def __getitem__(self, key, _stacklevel=2):
|
||||||
"""
|
"""
|
||||||
:return: A :class:`FeedParserDict`.
|
:return: A :class:`FeedParserDict`.
|
||||||
"""
|
"""
|
||||||
|
@ -59,9 +59,8 @@ class FeedParserDict(dict):
|
||||||
except IndexError:
|
except IndexError:
|
||||||
raise KeyError("object doesn't have key 'category'")
|
raise KeyError("object doesn't have key 'category'")
|
||||||
elif key == 'enclosures':
|
elif key == 'enclosures':
|
||||||
norel = lambda link: FeedParserDict([(name, value) for (name, value) in link.items() if name != 'rel'])
|
|
||||||
return [
|
return [
|
||||||
norel(link)
|
FeedParserDict([(name, value) for (name, value) in link.items() if name != 'rel'])
|
||||||
for link in dict.__getitem__(self, 'links')
|
for link in dict.__getitem__(self, 'links')
|
||||||
if link['rel'] == 'enclosure'
|
if link['rel'] == 'enclosure'
|
||||||
]
|
]
|
||||||
|
@ -84,6 +83,7 @@ class FeedParserDict(dict):
|
||||||
"exist. This fallback will be removed in a future version "
|
"exist. This fallback will be removed in a future version "
|
||||||
"of feedparser.",
|
"of feedparser.",
|
||||||
DeprecationWarning,
|
DeprecationWarning,
|
||||||
|
stacklevel=_stacklevel,
|
||||||
)
|
)
|
||||||
return dict.__getitem__(self, 'published')
|
return dict.__getitem__(self, 'published')
|
||||||
return dict.__getitem__(self, 'updated')
|
return dict.__getitem__(self, 'updated')
|
||||||
|
@ -99,6 +99,7 @@ class FeedParserDict(dict):
|
||||||
"`updated_parsed` doesn't exist. This fallback will be "
|
"`updated_parsed` doesn't exist. This fallback will be "
|
||||||
"removed in a future version of feedparser.",
|
"removed in a future version of feedparser.",
|
||||||
DeprecationWarning,
|
DeprecationWarning,
|
||||||
|
stacklevel=_stacklevel,
|
||||||
)
|
)
|
||||||
return dict.__getitem__(self, 'published_parsed')
|
return dict.__getitem__(self, 'published_parsed')
|
||||||
return dict.__getitem__(self, 'updated_parsed')
|
return dict.__getitem__(self, 'updated_parsed')
|
||||||
|
@ -119,7 +120,7 @@ class FeedParserDict(dict):
|
||||||
# This fix was proposed in issue 328.
|
# This fix was proposed in issue 328.
|
||||||
return dict.__contains__(self, key)
|
return dict.__contains__(self, key)
|
||||||
try:
|
try:
|
||||||
self.__getitem__(key)
|
self.__getitem__(key, _stacklevel=3)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
|
@ -133,7 +134,7 @@ class FeedParserDict(dict):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return self.__getitem__(key)
|
return self.__getitem__(key, _stacklevel=3)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
return default
|
return default
|
||||||
|
|
||||||
|
@ -143,17 +144,11 @@ class FeedParserDict(dict):
|
||||||
key = key[0]
|
key = key[0]
|
||||||
return dict.__setitem__(self, key, value)
|
return dict.__setitem__(self, key, value)
|
||||||
|
|
||||||
def setdefault(self, k, default):
|
|
||||||
if k not in self:
|
|
||||||
self[k] = default
|
|
||||||
return default
|
|
||||||
return self[k]
|
|
||||||
|
|
||||||
def __getattr__(self, key):
|
def __getattr__(self, key):
|
||||||
# __getattribute__() is called first; this will be called
|
# __getattribute__() is called first; this will be called
|
||||||
# only if an attribute was not already found
|
# only if an attribute was not already found
|
||||||
try:
|
try:
|
||||||
return self.__getitem__(key)
|
return self.__getitem__(key, _stacklevel=3)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
raise AttributeError("object has no attribute '%s'" % key)
|
raise AttributeError("object has no attribute '%s'" % key)
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue