SickGear/lib/feedparser/urls.py

# Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import re
import urllib.parse

from .html import BaseHTMLProcessor

# If you want feedparser to allow all URL schemes, set this to ()
# List culled from Python's urlparse documentation at:
#   http://docs.python.org/library/urlparse.html
# as well as from "URI scheme" at Wikipedia:
#   https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
# Many more will likely need to be added!
#
# Every scheme is spelled in lower case and appears exactly once. This module
# only tests the tuple for emptiness and for membership, so the order of the
# entries carries no meaning.
ACCEPTABLE_URI_SCHEMES = (
    "file",
    "ftp",
    "gopher",
    "h323",
    "hdl",
    "http",
    "https",
    "imap",
    "magnet",
    "mailto",
    "mms",
    "news",
    "nntp",
    "prospero",
    "rsync",
    "rtsp",
    "rtspu",
    "sftp",
    "shttp",
    "sip",
    "sips",
    "snews",
    "svn",
    "svn+ssh",
    "telnet",
    "wais",
    # Additional common-but-unofficial schemes
    "aim",
    "callto",
    "cvs",
    "facetime",
    "feed",
    "git",
    "gtalk",
    "irc",
    "ircs",
    "irc6",
    "itms",
    "msnim",
    "skype",
    "ssh",
    "smb",
    "ymsg",
)

# Matches a leading "scheme://" (group 1) plus any surplus slashes that follow
# it (group 2). The trailing lazy group 3 always matches the empty string, so
# the remainder of the URI is left untouched by a substitution.
# The hyphen is placed last in the character class so that it is a literal "-"
# instead of forming the range "+" through ".", which would also admit ",".
_urifixer = re.compile(r"^([A-Za-z][A-Za-z0-9+.-]*://)(/*)(.*?)")


def _urljoin(base, uri):
    """Join *uri* onto *base*, tolerating malformed input.

    Extra slashes directly after ``scheme://`` in *uri* are collapsed first
    (``http:////host`` becomes ``http://host``), then the result is resolved
    against *base* with :func:`urllib.parse.urljoin`.

    Returns the joined URI, or an empty string when ``urljoin`` raises
    ``ValueError`` while parsing its arguments.
    """
    uri = _urifixer.sub(r"\1\3", uri)
    try:
        return urllib.parse.urljoin(base, uri)
    except ValueError:
        return ""


def make_safe_absolute_uri(base, rel=None):
    """Resolve *rel* against *base*, returning it only if its scheme is allowed.

    Behaviour, in order of precedence:

    * If ``ACCEPTABLE_URI_SCHEMES`` is empty, no filtering is done and the
      plain join of *base* and *rel* is returned.
    * If *base* is empty, *rel* is returned unchanged (or ``""`` when *rel*
      is empty too).
    * If *rel* is empty, *base* is returned when it has no scheme or an
      acceptable one; otherwise ``""``. ``""`` is also returned when *base*
      cannot be parsed.
    * Otherwise the two are joined and the result is returned when the text
      before its first ``":"`` is an acceptable scheme; otherwise ``""``.

    Scheme comparison is case-insensitive, as URI schemes are defined to be
    (RFC 3986, section 3.1).
    """
    # bail if ACCEPTABLE_URI_SCHEMES is empty
    if not ACCEPTABLE_URI_SCHEMES:
        return _urljoin(base, rel or "")
    if not base:
        return rel or ""
    if not rel:
        try:
            # NOTE: urlparse() reports the scheme in lower case, so this
            # branch already accepts e.g. "HTTP://..." as a base.
            scheme = urllib.parse.urlparse(base)[0]
        except ValueError:
            return ""
        if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
            return base
        return ""
    uri = _urljoin(base, rel)
    # Lower-case the scheme before the allow-list lookup so that "HTTP://..."
    # is treated the same as "http://...", consistent with the branch above.
    scheme = uri.strip().split(":", 1)[0].lower()
    if scheme not in ACCEPTABLE_URI_SCHEMES:
        return ""
    return uri


class RelativeURIResolver(BaseHTMLProcessor):
    """HTML processor that resolves URI-valued attributes against a base URI.

    Start tags are intercepted in :meth:`unknown_starttag`; every attribute
    whose ``(tag, attribute)`` pair is listed in :attr:`relative_uris` has its
    value passed through :meth:`resolve_uri` before the tag is handed on to
    the parent class.
    """

    # (tag name, attribute name) pairs whose values are treated as URIs.
    relative_uris = {
        ("a", "href"),
        ("applet", "codebase"),
        ("area", "href"),
        ("audio", "src"),
        ("blockquote", "cite"),
        ("body", "background"),
        ("del", "cite"),
        ("form", "action"),
        ("frame", "longdesc"),
        ("frame", "src"),
        ("iframe", "longdesc"),
        ("iframe", "src"),
        ("head", "profile"),
        ("img", "longdesc"),
        ("img", "src"),
        ("img", "usemap"),
        ("input", "src"),
        ("input", "usemap"),
        ("ins", "cite"),
        ("link", "href"),
        ("object", "classid"),
        ("object", "codebase"),
        ("object", "data"),
        ("object", "usemap"),
        ("q", "cite"),
        ("script", "src"),
        ("source", "src"),
        ("video", "poster"),
        ("video", "src"),
    }

    def __init__(self, baseuri, encoding, _type):
        """Store *baseuri* and forward *encoding* and *_type* to the base class."""
        BaseHTMLProcessor.__init__(self, encoding, _type)
        self.baseuri = baseuri

    def resolve_uri(self, uri):
        """Return the whitespace-stripped *uri* made absolute against the base URI.

        The work is delegated to ``make_safe_absolute_uri``.
        """
        stripped = uri.strip()
        return make_safe_absolute_uri(self.baseuri, stripped)

    def unknown_starttag(self, tag, attrs):
        """Rewrite URI attributes of *tag*, then delegate to the parent class."""
        rewritten = []
        for key, value in self.normalize_attrs(attrs):
            if (tag, key) in self.relative_uris:
                # When resolution yields an empty (falsy) result the original
                # value is deliberately kept rather than blanked out.
                value = self.resolve_uri(value) or value
            rewritten.append((key, value))
        super().unknown_starttag(tag, rewritten)


def resolve_relative_uris(html_source, base_uri, encoding, type_):
    """Run *html_source* through a ``RelativeURIResolver`` and return its output.

    *base_uri*, *encoding* and *type_* are passed straight to the resolver's
    constructor.
    """
    resolver = RelativeURIResolver(base_uri, encoding, type_)
    resolver.feed(html_source)
    return resolver.output()
Update feedparser 6.0.10 (6d032b8) → 6.0.10 (859ac57). 2023-04-13 07:04:58 +00:00			`# Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`# Copyright 2002-2008 Mark Pilgrim`
			`# All rights reserved.`
			`#`
			`# This file is a part of feedparser.`
			`#`
			`# Redistribution and use in source and binary forms, with or without`
			`# modification, are permitted provided that the following conditions are met:`
			`#`
			`# * Redistributions of source code must retain the above copyright notice,`
			`# this list of conditions and the following disclaimer.`
			`# * Redistributions in binary form must reproduce the above copyright notice,`
			`# this list of conditions and the following disclaimer in the documentation`
			`# and/or other materials provided with the distribution.`
			`#`
			`# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'`
			`# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE`
			`# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE`
			`# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR`
			`# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF`
			`# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS`
			`# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN`
			`# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)`
			`# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE`
			`# POSSIBILITY OF SUCH DAMAGE.`

			`import re`
			`import urllib.parse`

Update feedparser 6.0.1 (98d189fa) → 6.0.10 (5fcb3ae). 2023-01-13 20:16:45 +00:00			`from .html import BaseHTMLProcessor`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00
			`# If you want feedparser to allow all URL schemes, set this to ()`
			`# List culled from Python's urlparse documentation at:`
			`# http://docs.python.org/library/urlparse.html`
			`# as well as from "URI scheme" at Wikipedia:`
			`# https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme`
			`# Many more will likely need to be added!`
			`ACCEPTABLE_URI_SCHEMES = (`
Update feedparser 6.0.10 (6d032b8) → 6.0.10 (859ac57). 2023-04-13 07:04:58 +00:00			`"file",`
			`"ftp",`
			`"gopher",`
			`"h323",`
			`"hdl",`
			`"http",`
			`"https",`
			`"imap",`
			`"magnet",`
			`"mailto",`
			`"mms",`
			`"news",`
			`"nntp",`
			`"prospero",`
			`"rsync",`
			`"rtsp",`
			`"rtspu",`
			`"sftp",`
			`"shttp",`
			`"sip",`
			`"sips",`
			`"snews",`
			`"svn",`
			`"svn+ssh",`
			`"telnet",`
			`"wais",`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`# Additional common-but-unofficial schemes`
Update feedparser 6.0.10 (6d032b8) → 6.0.10 (859ac57). 2023-04-13 07:04:58 +00:00			`"aim",`
			`"callto",`
			`"cvs",`
			`"facetime",`
			`"feed",`
			`"git",`
			`"gtalk",`
			`"irc",`
			`"ircs",`
			`"irc6",`
			`"itms",`
			`"mms",`
			`"msnim",`
			`"skype",`
			`"ssh",`
			`"smb",`
			`"svn",`
			`"ymsg",`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`)`

Update feedparser 6.0.10 (6d032b8) → 6.0.10 (859ac57). 2023-04-13 07:04:58 +00:00			`_urifixer = re.compile("^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)")`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00

			`def _urljoin(base, uri):`
Update feedparser 6.0.10 (6d032b8) → 6.0.10 (859ac57). 2023-04-13 07:04:58 +00:00			`uri = _urifixer.sub(r"\1\3", uri)`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`try:`
			`uri = urllib.parse.urljoin(base, uri)`
			`except ValueError:`
Update feedparser 6.0.10 (6d032b8) → 6.0.10 (859ac57). 2023-04-13 07:04:58 +00:00			`uri = ""`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`return uri`


			`def make_safe_absolute_uri(base, rel=None):`
			`# bail if ACCEPTABLE_URI_SCHEMES is empty`
			`if not ACCEPTABLE_URI_SCHEMES:`
Update feedparser 6.0.10 (6d032b8) → 6.0.10 (859ac57). 2023-04-13 07:04:58 +00:00			`return _urljoin(base, rel or "")`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`if not base:`
Update feedparser 6.0.10 (6d032b8) → 6.0.10 (859ac57). 2023-04-13 07:04:58 +00:00			`return rel or ""`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`if not rel:`
			`try:`
			`scheme = urllib.parse.urlparse(base)[0]`
			`except ValueError:`
Update feedparser 6.0.10 (6d032b8) → 6.0.10 (859ac57). 2023-04-13 07:04:58 +00:00			`return ""`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:`
			`return base`
Update feedparser 6.0.10 (6d032b8) → 6.0.10 (859ac57). 2023-04-13 07:04:58 +00:00			`return ""`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`uri = _urljoin(base, rel)`
Update feedparser 6.0.10 (6d032b8) → 6.0.10 (859ac57). 2023-04-13 07:04:58 +00:00			`if uri.strip().split(":", 1)[0] not in ACCEPTABLE_URI_SCHEMES:`
			`return ""`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`return uri`


Update feedparser 6.0.1 (98d189fa) → 6.0.10 (5fcb3ae). 2023-01-13 20:16:45 +00:00			`class RelativeURIResolver(BaseHTMLProcessor):`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`relative_uris = {`
Update feedparser 6.0.10 (6d032b8) → 6.0.10 (859ac57). 2023-04-13 07:04:58 +00:00			`("a", "href"),`
			`("applet", "codebase"),`
			`("area", "href"),`
			`("audio", "src"),`
			`("blockquote", "cite"),`
			`("body", "background"),`
			`("del", "cite"),`
			`("form", "action"),`
			`("frame", "longdesc"),`
			`("frame", "src"),`
			`("iframe", "longdesc"),`
			`("iframe", "src"),`
			`("head", "profile"),`
			`("img", "longdesc"),`
			`("img", "src"),`
			`("img", "usemap"),`
			`("input", "src"),`
			`("input", "usemap"),`
			`("ins", "cite"),`
			`("link", "href"),`
			`("object", "classid"),`
			`("object", "codebase"),`
			`("object", "data"),`
			`("object", "usemap"),`
			`("q", "cite"),`
			`("script", "src"),`
			`("source", "src"),`
			`("video", "poster"),`
			`("video", "src"),`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`}`

			`def __init__(self, baseuri, encoding, _type):`
Update feedparser 6.0.1 (98d189fa) → 6.0.10 (5fcb3ae). 2023-01-13 20:16:45 +00:00			`BaseHTMLProcessor.__init__(self, encoding, _type)`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`self.baseuri = baseuri`

			`def resolve_uri(self, uri):`
			`return make_safe_absolute_uri(self.baseuri, uri.strip())`

			`def unknown_starttag(self, tag, attrs):`
			`attrs = self.normalize_attrs(attrs)`
Update feedparser 6.0.10 (6d032b8) → 6.0.10 (859ac57). 2023-04-13 07:04:58 +00:00			`attrs = [`
			`(`
			`key,`
			`((tag, key) in self.relative_uris) and self.resolve_uri(value) or value,`
			`)`
			`for key, value in attrs`
			`]`
			`super().unknown_starttag(tag, attrs)`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00

			`def resolve_relative_uris(html_source, base_uri, encoding, type_):`
			`p = RelativeURIResolver(base_uri, encoding, type_)`
			`p.feed(html_source)`
			`return p.output()`