# SickGear/lib/feedparser/urls.py

# Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import re
import urllib.parse

from .html import BaseHTMLProcessor

# If you want feedparser to allow all URL schemes, set this to ()
# List culled from Python's urllib.parse documentation at:
#   https://docs.python.org/3/library/urllib.parse.html
# as well as from "URI scheme" at Wikipedia:
#   https://en.wikipedia.org/wiki/URI_scheme
# Many more will likely need to be added!
# Allow-list of URI schemes. Each scheme appears exactly once; "mms" and
# "svn" are listed in the first group only.
ACCEPTABLE_URI_SCHEMES = (
    "file",
    "ftp",
    "gopher",
    "h323",
    "hdl",
    "http",
    "https",
    "imap",
    "magnet",
    "mailto",
    "mms",
    "news",
    "nntp",
    "prospero",
    "rsync",
    "rtsp",
    "rtspu",
    "sftp",
    "shttp",
    "sip",
    "sips",
    "snews",
    "svn",
    "svn+ssh",
    "telnet",
    "wais",
    # Additional common-but-unofficial schemes
    "aim",
    "callto",
    "cvs",
    "facetime",
    "feed",
    "git",
    "gtalk",
    "irc",
    "ircs",
    "irc6",
    "itms",
    "msnim",
    "skype",
    "ssh",
    "smb",
    "ymsg",
)

# Matches a leading "scheme://" (group 1) followed by any surplus slashes
# (group 2), so that substituting r"\1\3" drops the surplus slashes.
# The "-" is placed last in the character class so it is a literal hyphen;
# written as "+-." it formed a range that also matched ",".
# Group 3 is lazy with nothing after it, so it always matches the empty
# string; it is kept so existing r"\1\3" substitutions stay valid.
_urifixer = re.compile(r"^([A-Za-z][A-Za-z0-9+.-]*://)(/*)(.*?)")


def _urljoin(base, uri):
    """Join *uri* onto *base*, returning "" if the join raises ValueError.

    Before joining, *uri* is passed through ``_urifixer`` with the
    replacement ``r"\\1\\3"``.
    """
    cleaned = _urifixer.sub(r"\1\3", uri)
    try:
        return urllib.parse.urljoin(base, cleaned)
    except ValueError:
        # urljoin could not parse one of its inputs; report "no URI".
        return ""


def make_safe_absolute_uri(base, rel=None):
    """Resolve *rel* against *base* and filter the result by URI scheme.

    :param base: The base URI. May be empty.
    :param rel: The URI to resolve against *base*. May be empty or None.
    :returns: One of the following, checked in this order:

        * if ``ACCEPTABLE_URI_SCHEMES`` is empty, the plain join of *base*
          and *rel* with no scheme filtering;
        * if *base* is empty, *rel* (or "") unchanged;
        * if *rel* is empty, *base* when it has no scheme or an acceptable
          scheme, otherwise "";
        * otherwise the joined URI when its scheme is acceptable, else "".

        "" is also returned when *base* cannot be parsed.
    """
    # bail if ACCEPTABLE_URI_SCHEMES is empty
    if not ACCEPTABLE_URI_SCHEMES:
        return _urljoin(base, rel or "")
    # NOTE(review): with no base, rel is handed back without any scheme
    # check -- confirm callers do not rely on this path for filtering.
    if not base:
        return rel or ""
    if not rel:
        try:
            scheme = urllib.parse.urlparse(base)[0]
        except ValueError:
            return ""
        if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
            return base
        return ""
    uri = _urljoin(base, rel)
    # URI schemes are case-insensitive (RFC 3986, section 3.1) and the
    # allow-list is lower case, so compare the lower-cased scheme. This
    # matches the rel-less branch above, where urlparse() already reports
    # the scheme in lower case. Without it, e.g. "HTTPS://host/" joined
    # onto an "http" base was rejected.
    scheme = uri.strip().split(":", 1)[0].lower()
    if scheme not in ACCEPTABLE_URI_SCHEMES:
        return ""
    return uri


class RelativeURIResolver(BaseHTMLProcessor):
    """HTML processor that rewrites URI-valued attributes against a base URI."""

    # (tag, attribute) pairs whose attribute value is passed through
    # resolve_uri() when a start tag is processed.
    relative_uris = {
        ("a", "href"),
        ("applet", "codebase"),
        ("area", "href"),
        ("audio", "src"),
        ("blockquote", "cite"),
        ("body", "background"),
        ("del", "cite"),
        ("form", "action"),
        ("frame", "longdesc"),
        ("frame", "src"),
        ("iframe", "longdesc"),
        ("iframe", "src"),
        ("head", "profile"),
        ("img", "longdesc"),
        ("img", "src"),
        ("img", "usemap"),
        ("input", "src"),
        ("input", "usemap"),
        ("ins", "cite"),
        ("link", "href"),
        ("object", "classid"),
        ("object", "codebase"),
        ("object", "data"),
        ("object", "usemap"),
        ("q", "cite"),
        ("script", "src"),
        ("source", "src"),
        ("video", "poster"),
        ("video", "src"),
    }

    def __init__(self, baseuri, encoding, _type):
        """Initialise the base processor and remember *baseuri*."""
        BaseHTMLProcessor.__init__(self, encoding, _type)
        self.baseuri = baseuri

    def resolve_uri(self, uri):
        """Return the stripped *uri* resolved against ``self.baseuri``."""
        stripped = uri.strip()
        return make_safe_absolute_uri(self.baseuri, stripped)

    def unknown_starttag(self, tag, attrs):
        """Resolve URI attributes of *tag*, then defer to the parent class."""
        rewritten = []
        for key, value in self.normalize_attrs(attrs):
            if (tag, key) in self.relative_uris:
                # An empty resolution result leaves the original value in
                # place rather than blanking the attribute.
                value = self.resolve_uri(value) or value
            rewritten.append((key, value))
        super().unknown_starttag(tag, rewritten)


def resolve_relative_uris(html_source, base_uri, encoding, type_):
    """Run *html_source* through a ``RelativeURIResolver`` and return its output.

    The resolver is constructed from *base_uri*, *encoding* and *type_*.
    """
    resolver = RelativeURIResolver(base_uri, encoding, type_)
    resolver.feed(html_source)
    return resolver.output()