diff --git a/CHANGES.md b/CHANGES.md
index e650a569..0a333cc6 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -2,6 +2,7 @@
 
 * Update Beautiful Soup 4.11.1 (r642) to 4.12.2
 * Update certifi 2023.05.07 to 2023.07.22
+* Update feedparser 6.0.10 (859ac57) to 6.0.10 (9865dec)
 * Update soupsieve 2.3.2.post1 (792d566) to 2.4.1 (2e66beb)
 * Update Tornado Web Server 6.3.2 (e3aa6c5) to 6.3.3 (e4d6984)
 * Fix regex that was not using py312 notation
diff --git a/lib/feedparser/encodings.py b/lib/feedparser/encodings.py
index a7be68ae..01f228d1 100644
--- a/lib/feedparser/encodings.py
+++ b/lib/feedparser/encodings.py
@@ -335,7 +335,7 @@ def convert_to_utf8(
 
 
 # How much to read from a binary file in order to detect encoding.
-# In inital tests, 4k was enough for ~160 mostly-English feeds;
+# In initial tests, 4k was enough for ~160 mostly-English feeds;
 # 64k seems like a safe margin.
 CONVERT_FILE_PREFIX_LEN = 2**16
 
diff --git a/lib/feedparser/html.py b/lib/feedparser/html.py
index bbb90389..826e4a11 100644
--- a/lib/feedparser/html.py
+++ b/lib/feedparser/html.py
@@ -152,7 +152,7 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
         :rtype: None
         """
 
-        data = re.sub(r"<!((?!DOCTYPE|--|\[))", r"&lt;!\1", data, re.IGNORECASE)
+        data = re.sub(r"<!((?!DOCTYPE|--|\[))", r"&lt;!\1", data, flags=re.IGNORECASE)
         data = re.sub(r"<([^<>\s]+?)\s*/>", self._shorttag_replace, data)
         data = data.replace("&#39;", "'")
         data = data.replace("&#34;", '"')
diff --git a/lib/feedparser/mixin.py b/lib/feedparser/mixin.py
index 4c6d4e9a..2a649f1b 100644
--- a/lib/feedparser/mixin.py
+++ b/lib/feedparser/mixin.py
@@ -192,6 +192,7 @@ class XMLParserMixin(
         self.incontributor = 0
         self.inpublisher = 0
         self.insource = 0
+        self.isentrylink = 0
 
         self.sourcedata = FeedParserDict()
         self.contentparams = FeedParserDict()
@@ -233,7 +234,7 @@ class XMLParserMixin(
         if isinstance(baseuri, bytes):
             baseuri = baseuri.decode(self.encoding, "ignore")
         # ensure that self.baseuri is always an absolute URI that
-        # uses a whitelisted URI scheme (e.g. not `javscript:`)
+        # uses a whitelisted URI scheme (e.g. not `javascript:`)
         if self.baseuri:
             self.baseuri = make_safe_absolute_uri(self.baseuri, baseuri) or self.baseuri
         else:
@@ -624,7 +625,8 @@ class XMLParserMixin(
             # unhandled character references. fix this special case.
             output = output.replace("&amp;", "&")
             output = re.sub("&([A-Za-z0-9_]+);", r"&amp;\g<1>", output)
-            self.entries[-1][element] = output
+            if self.isentrylink or not self.entries[-1].get(element):
+                self.entries[-1][element] = output
             if output:
                 self.entries[-1]["links"][-1]["href"] = output
             else:
diff --git a/lib/feedparser/namespaces/_base.py b/lib/feedparser/namespaces/_base.py
index 1fc3ee30..c5521f62 100644
--- a/lib/feedparser/namespaces/_base.py
+++ b/lib/feedparser/namespaces/_base.py
@@ -361,21 +361,24 @@ class Namespace:
         attrs_d = self._enforce_href(attrs_d)
         if "href" in attrs_d:
             attrs_d["href"] = self.resolve_uri(attrs_d["href"])
+        if (
+            attrs_d.get("rel") == "alternate"
+            and self.map_content_type(attrs_d.get("type")) in self.html_types
+        ):
+            self.isentrylink = 1
         expecting_text = self.infeed or self.inentry or self.insource
         context.setdefault("links", [])
         if not (self.inentry and self.inimage):
             context["links"].append(FeedParserDict(attrs_d))
         if "href" in attrs_d:
-            if (
-                attrs_d.get("rel") == "alternate"
-                and self.map_content_type(attrs_d.get("type")) in self.html_types
-            ):
+            if self.isentrylink:
                 context["link"] = attrs_d["href"]
         else:
             self.push("link", expecting_text)
 
     def _end_link(self):
         self.pop("link")
+        self.isentrylink = 0
 
     def _start_guid(self, attrs_d):
         self.guidislink = attrs_d.get("ispermalink", "true") == "true"