Merge branch 'feature/UpdateFeedparser' into dev

JackDandy 2023-09-06 09:18:56 +01:00
commit 918340e474
5 changed files with 14 additions and 8 deletions

@@ -2,6 +2,7 @@
 * Update Beautiful Soup 4.11.1 (r642) to 4.12.2
 * Update certifi 2023.05.07 to 2023.07.22
+* Update feedparser 6.0.10 (859ac57) to 6.0.10 (9865dec)
 * Update soupsieve 2.3.2.post1 (792d566) to 2.4.1 (2e66beb)
 * Update Tornado Web Server 6.3.2 (e3aa6c5) to 6.3.3 (e4d6984)
 * Fix regex that was not using py312 notation

@@ -335,7 +335,7 @@ def convert_to_utf8(
 # How much to read from a binary file in order to detect encoding.
-# In inital tests, 4k was enough for ~160 mostly-English feeds;
+# In initial tests, 4k was enough for ~160 mostly-English feeds;
 # 64k seems like a safe margin.
 CONVERT_FILE_PREFIX_LEN = 2**16
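
The comment above explains why only a fixed-size prefix is read when detecting a feed file's encoding. A minimal sketch of that idea, not feedparser's actual code; the chardet detector and the guess_encoding helper are assumptions for illustration:

import chardet  # assumed third-party detector, not part of feedparser

CONVERT_FILE_PREFIX_LEN = 2**16  # 64 KiB, as in the hunk above

def guess_encoding(path):
    # Read only the prefix: enough to sniff the encoding, cheap even for huge feeds.
    with open(path, "rb") as f:
        prefix = f.read(CONVERT_FILE_PREFIX_LEN)
    return chardet.detect(prefix).get("encoding") or "utf-8"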

@@ -152,7 +152,7 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
 :rtype: None
 """
-data = re.sub(r"<!((?!DOCTYPE|--|\[))", r"&lt;!\1", data, re.IGNORECASE)
+data = re.sub(r"<!((?!DOCTYPE|--|\[))", r"&lt;!\1", data, flags=re.IGNORECASE)
 data = re.sub(r"<([^<>\s]+?)\s*/>", self._shorttag_replace, data)
 data = data.replace("&#39;", "'")
 data = data.replace("&#34;", '"')
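
The re.sub change above fixes a classic pitfall: re.sub's fourth positional parameter is count, not flags, so passing re.IGNORECASE positionally (its value is 2) caps the substitution at two replacements and leaves matching case-sensitive. A small sketch of the difference; the sample data string is made up:

import re

data = "<!doctype html>"

# Positional: re.IGNORECASE (== 2) lands in re.sub's `count` parameter,
# so matching stays case-sensitive and the lowercase doctype is escaped by mistake.
print(re.sub(r"<!((?!DOCTYPE|--|\[))", r"&lt;!\1", data, re.IGNORECASE))
# -> &lt;!doctype html>

# Keyword: the flag is applied as intended and the doctype is left alone.
print(re.sub(r"<!((?!DOCTYPE|--|\[))", r"&lt;!\1", data, flags=re.IGNORECASE))
# -> <!doctype html>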

@@ -192,6 +192,7 @@ class XMLParserMixin(
 self.incontributor = 0
 self.inpublisher = 0
 self.insource = 0
+self.isentrylink = 0
 self.sourcedata = FeedParserDict()
 self.contentparams = FeedParserDict()
@@ -233,7 +234,7 @@
 if isinstance(baseuri, bytes):
     baseuri = baseuri.decode(self.encoding, "ignore")
 # ensure that self.baseuri is always an absolute URI that
-# uses a whitelisted URI scheme (e.g. not `javscript:`)
+# uses a whitelisted URI scheme (e.g. not `javascript:`)
 if self.baseuri:
     self.baseuri = make_safe_absolute_uri(self.baseuri, baseuri) or self.baseuri
 else:
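
The corrected comment above describes the invariant that self.baseuri must remain an absolute URI using an allow-listed scheme. A rough sketch of that idea, not feedparser's actual make_safe_absolute_uri, with an illustrative scheme list:

from urllib.parse import urljoin, urlsplit

SAFE_SCHEMES = {"http", "https", "ftp", "file", ""}  # illustrative only

def safe_absolute_uri(base, rel):
    # Resolve rel against base, then keep the result only if its scheme
    # is allow-listed (so e.g. javascript: URIs are rejected).
    absolute = urljoin(base, rel)
    return absolute if urlsplit(absolute).scheme.lower() in SAFE_SCHEMES else ""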
@@ -624,6 +625,7 @@
 # unhandled character references. fix this special case.
 output = output.replace("&amp;", "&")
 output = re.sub("&([A-Za-z0-9_]+);", r"&\g<1>", output)
-self.entries[-1][element] = output
+if self.isentrylink or not self.entries[-1].get(element):
+    self.entries[-1][element] = output
 if output:
     self.entries[-1]["links"][-1]["href"] = output

@@ -361,21 +361,24 @@ class Namespace:
     attrs_d = self._enforce_href(attrs_d)
     if "href" in attrs_d:
         attrs_d["href"] = self.resolve_uri(attrs_d["href"])
+    if (
+        attrs_d.get("rel") == "alternate"
+        and self.map_content_type(attrs_d.get("type")) in self.html_types
+    ):
+        self.isentrylink = 1
     expecting_text = self.infeed or self.inentry or self.insource
     context.setdefault("links", [])
     if not (self.inentry and self.inimage):
         context["links"].append(FeedParserDict(attrs_d))
     if "href" in attrs_d:
-        if (
-            attrs_d.get("rel") == "alternate"
-            and self.map_content_type(attrs_d.get("type")) in self.html_types
-        ):
+        if self.isentrylink:
             context["link"] = attrs_d["href"]
     else:
         self.push("link", expecting_text)

 def _end_link(self):
     self.pop("link")
+    self.isentrylink = 0

 def _start_guid(self, attrs_d):
     self.guidislink = attrs_d.get("ispermalink", "true") == "true"
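
Taken together, the isentrylink changes mean an entry's link field is filled from its rel="alternate" HTML link (or only when no link has been set yet), instead of being overwritten by whichever <link> element happens to be parsed last. A small usage sketch; the feed XML is made up and the expected values reflect the intent of the change rather than verified output:

import feedparser

xml = """<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <title>Example post</title>
    <link rel="alternate" type="text/html" href="http://example.com/post"/>
    <link rel="enclosure" type="audio/mpeg" href="http://example.com/post.mp3"/>
  </entry>
</feed>"""

entry = feedparser.parse(xml).entries[0]
print([link.rel for link in entry.links])  # ['alternate', 'enclosure']
print(entry.link)  # expected: http://example.com/post (the alternate link)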