Merge branch 'feature/UpdateFeedparser' into dev

2024-12-19 09:13:37 +00:00 · 2023-09-06 09:18:56 +01:00 · 2023-09-06 09:18:56 +01:00 · 918340e474
commit 918340e474
parent 4f63362995 dbdcc3c52a
5 changed files with 14 additions and 8 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -2,6 +2,7 @@
 * Update Beautiful Soup 4.11.1 (r642) to 4.12.2
 * Update certifi 2023.05.07 to 2023.07.22
+* Update feedparser 6.0.10 (859ac57) to 6.0.10 (9865dec)
 * Update soupsieve 2.3.2.post1 (792d566) to 2.4.1 (2e66beb)
 * Update Tornado Web Server 6.3.2 (e3aa6c5) to 6.3.3 (e4d6984)
 * Fix regex that was not using py312 notation
--- a/lib/feedparser/encodings.py
+++ b/lib/feedparser/encodings.py
@@ -335,7 +335,7 @@ def convert_to_utf8(
 # How much to read from a binary file in order to detect encoding.
-# In inital tests, 4k was enough for ~160 mostly-English feeds;
+# In initial tests, 4k was enough for ~160 mostly-English feeds;
 # 64k seems like a safe margin.
 CONVERT_FILE_PREFIX_LEN = 2**16
--- a/lib/feedparser/html.py
+++ b/lib/feedparser/html.py
@@ -152,7 +152,7 @@ class BaseHTMLProcessor(sgmllib.SGMLParser):
        :rtype: None
        """
-        data = re.sub(r"<!((?!DOCTYPE|--|\[))", r"&lt;!\1", data, re.IGNORECASE)
+        data = re.sub(r"<!((?!DOCTYPE|--|\[))", r"&lt;!\1", data, flags=re.IGNORECASE)
        data = re.sub(r"<([^<>\s]+?)\s*/>", self._shorttag_replace, data)
        data = data.replace("&#39;", "'")
        data = data.replace("&#34;", '"')
--- a/lib/feedparser/mixin.py
+++ b/lib/feedparser/mixin.py
@@ -192,6 +192,7 @@ class XMLParserMixin(
        self.incontributor = 0
        self.inpublisher = 0
        self.insource = 0
+        self.isentrylink = 0
        self.sourcedata = FeedParserDict()
        self.contentparams = FeedParserDict()
@@ -233,7 +234,7 @@ class XMLParserMixin(
        if isinstance(baseuri, bytes):
            baseuri = baseuri.decode(self.encoding, "ignore")
        # ensure that self.baseuri is always an absolute URI that
-        # uses a whitelisted URI scheme (e.g. not `javscript:`)
+        # uses a whitelisted URI scheme (e.g. not `javascript:`)
        if self.baseuri:
            self.baseuri = make_safe_absolute_uri(self.baseuri, baseuri) or self.baseuri
        else:
@@ -624,7 +625,8 @@ class XMLParserMixin(
                    # unhandled character references. fix this special case.
                    output = output.replace("&amp;", "&")
                    output = re.sub("&([A-Za-z0-9_]+);", r"&\g<1>", output)
-                    self.entries[-1][element] = output
+                    if self.isentrylink or not self.entries[-1].get(element):
+                        self.entries[-1][element] = output
                    if output:
                        self.entries[-1]["links"][-1]["href"] = output
            else:
--- a/lib/feedparser/namespaces/_base.py
+++ b/lib/feedparser/namespaces/_base.py
@@ -361,21 +361,24 @@ class Namespace:
        attrs_d = self._enforce_href(attrs_d)
        if "href" in attrs_d:
            attrs_d["href"] = self.resolve_uri(attrs_d["href"])
+        if (
+            attrs_d.get("rel") == "alternate"
+            and self.map_content_type(attrs_d.get("type")) in self.html_types
+        ):
+            self.isentrylink = 1
        expecting_text = self.infeed or self.inentry or self.insource
        context.setdefault("links", [])
        if not (self.inentry and self.inimage):
            context["links"].append(FeedParserDict(attrs_d))
        if "href" in attrs_d:
-            if (
+            if self.isentrylink:
-                attrs_d.get("rel") == "alternate"
-                and self.map_content_type(attrs_d.get("type")) in self.html_types
-            ):
                context["link"] = attrs_d["href"]
        else:
            self.push("link", expecting_text)
    def _end_link(self):
        self.pop("link")
+        self.isentrylink = 0
    def _start_guid(self, attrs_d):
        self.guidislink = attrs_d.get("ispermalink", "true") == "true"