SickGear/lib/feedparser/urls.py

174 lines
4.8 KiB
Python

# Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import re
import urllib.parse
from .html import BaseHTMLProcessor
# If you want feedparser to allow all URL schemes, set this to ()
# List culled from Python's urllib.parse documentation at:
# https://docs.python.org/3/library/urllib.parse.html
# as well as from "List of URI schemes" at Wikipedia:
# https://en.wikipedia.org/wiki/List_of_URI_schemes
# Many more will likely need to be added!
# Allow-list of URI schemes, consulted by membership test only.
# Each scheme appears exactly once; entries are lowercase.
ACCEPTABLE_URI_SCHEMES = (
    "file",
    "ftp",
    "gopher",
    "h323",
    "hdl",
    "http",
    "https",
    "imap",
    "magnet",
    "mailto",
    "mms",
    "news",
    "nntp",
    "prospero",
    "rsync",
    "rtsp",
    "rtspu",
    "sftp",
    "shttp",
    "sip",
    "sips",
    "snews",
    "svn",
    "svn+ssh",
    "telnet",
    "wais",
    # Additional common-but-unofficial schemes
    "aim",
    "callto",
    "cvs",
    "facetime",
    "feed",
    "git",
    "gtalk",
    "irc",
    "ircs",
    "irc6",
    "itms",
    "msnim",
    "skype",
    "ssh",
    "smb",
    "ymsg",
)
# Matches a leading "scheme://" followed by any surplus slashes so that a
# malformed URI such as "http:////example.com" can be collapsed to
# "http://example.com".  The hyphen is placed last in the character class so
# it is a literal "-"; in the middle ("+-.") it would form a range that also
# admits ",", which is not a legal scheme character.
_urifixer = re.compile(r"^([A-Za-z][A-Za-z0-9+.-]*://)(/*)(.*?)")


def _urljoin(base, uri):
    """Join *uri* onto *base*, tolerating malformed input.

    Extra slashes directly after "scheme://" in *uri* are dropped before the
    join.  If ``urllib.parse.urljoin`` raises ``ValueError``, an empty string
    is returned instead of propagating the error.
    """
    uri = _urifixer.sub(r"\1\3", uri)
    try:
        return urllib.parse.urljoin(base, uri)
    except ValueError:
        return ""
def make_safe_absolute_uri(base, rel=None):
    """Resolve *rel* against *base*, rejecting schemes outside the allow-list.

    Behaviour, in order:

    * If ``ACCEPTABLE_URI_SCHEMES`` is empty, no filtering is done and the
      plain join of *base* and *rel* is returned.
    * If *base* is empty, *rel* (or ``""``) is returned as-is, unfiltered.
    * If *rel* is empty, *base* is returned when it has no scheme or an
      acceptable one; otherwise (or if *base* cannot be parsed) ``""``.
    * Otherwise the joined URI is returned when its scheme is acceptable,
      else ``""``.
    """
    # bail if ACCEPTABLE_URI_SCHEMES is empty
    if not ACCEPTABLE_URI_SCHEMES:
        return _urljoin(base, rel or "")
    if not base:
        return rel or ""
    if not rel:
        try:
            scheme = urllib.parse.urlparse(base)[0]
        except ValueError:
            return ""
        if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
            return base
        return ""
    uri = _urljoin(base, rel)
    # URI schemes are case-insensitive; urlparse() above already yields a
    # lowercase scheme, so lowercase here too to keep both branches
    # consistent (e.g. "HTTPS://host/" must not be rejected).
    scheme = uri.strip().split(":", 1)[0].lower()
    if scheme not in ACCEPTABLE_URI_SCHEMES:
        return ""
    return uri
class RelativeURIResolver(BaseHTMLProcessor):
    """HTML processor that resolves URI-bearing attributes against a base URI.

    Start tags are intercepted; any attribute listed in ``relative_uris`` for
    that tag is passed through ``make_safe_absolute_uri`` before the tag is
    handed on to the parent class.
    """

    # (tag, attribute) pairs whose values are treated as URIs.
    relative_uris = {
        ("a", "href"),
        ("applet", "codebase"),
        ("area", "href"),
        ("audio", "src"),
        ("blockquote", "cite"),
        ("body", "background"),
        ("del", "cite"),
        ("form", "action"),
        ("frame", "longdesc"),
        ("frame", "src"),
        ("iframe", "longdesc"),
        ("iframe", "src"),
        ("head", "profile"),
        ("img", "longdesc"),
        ("img", "src"),
        ("img", "usemap"),
        ("input", "src"),
        ("input", "usemap"),
        ("ins", "cite"),
        ("link", "href"),
        ("object", "classid"),
        ("object", "codebase"),
        ("object", "data"),
        ("object", "usemap"),
        ("q", "cite"),
        ("script", "src"),
        ("source", "src"),
        ("video", "poster"),
        ("video", "src"),
    }

    def __init__(self, baseuri, encoding, _type):
        """Initialise the parent processor and remember *baseuri*."""
        BaseHTMLProcessor.__init__(self, encoding, _type)
        self.baseuri = baseuri

    def resolve_uri(self, uri):
        """Return the whitespace-stripped *uri* made absolute against the base URI."""
        stripped = uri.strip()
        return make_safe_absolute_uri(self.baseuri, stripped)

    def unknown_starttag(self, tag, attrs):
        """Resolve URI attributes of *tag*, then delegate to the parent class."""
        resolved_attrs = []
        for key, value in self.normalize_attrs(attrs):
            if (tag, key) in self.relative_uris:
                # An empty resolution result (e.g. a rejected scheme) falls
                # back to the attribute's original value.
                value = self.resolve_uri(value) or value
            resolved_attrs.append((key, value))
        super().unknown_starttag(tag, resolved_attrs)
def resolve_relative_uris(html_source, base_uri, encoding, type_):
    """Run *html_source* through a ``RelativeURIResolver`` and return its output.

    *base_uri*, *encoding* and *type_* are forwarded unchanged to the
    resolver's constructor.
    """
    resolver = RelativeURIResolver(base_uri, encoding, type_)
    resolver.feed(html_source)
    return resolver.output()