mirror of
https://github.com/SickGear/SickGear.git
synced 2024-12-01 00:43:37 +00:00
132 lines
4.5 KiB
Python
132 lines
4.5 KiB
Python
|
from __future__ import absolute_import, unicode_literals
|
||
|
|
||
|
import re
|
||
|
|
||
|
try:
|
||
|
import urllib.parse as urlparse
|
||
|
except ImportError:
|
||
|
import urlparse as urlparse
|
||
|
|
||
|
from .html import _BaseHTMLProcessor
|
||
|
|
||
|
# URI schemes that are accepted when resolving links.
# If you want feedparser to allow all URL schemes, set this to an empty
# tuple, i.e. ACCEPTABLE_URI_SCHEMES = ()
# List culled from Python's urlparse documentation at:
#   http://docs.python.org/library/urlparse.html
# as well as from "URI scheme" at Wikipedia:
#   https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
# Many more will likely need to be added!
# Each scheme is listed exactly once and in lowercase.
ACCEPTABLE_URI_SCHEMES = (
    'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet',
    'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu',
    'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet',
    'wais',
    # Additional common-but-unofficial schemes
    'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
    'irc6', 'itms', 'msnim', 'skype', 'ssh', 'smb', 'ymsg',
)
|
||
|
|
||
|
# Matches a leading "scheme://" plus any surplus slashes that follow it.
# Group 1 is the "scheme://" prefix, group 2 the surplus slashes, and group 3
# is a lazy match at the end of the pattern (so it always matches empty).
# The "-" is placed last in the character class so that it is a literal
# hyphen; written as "+-." it would form a range that also admits ",".
_urifixer = re.compile(r'^([A-Za-z][A-Za-z0-9+.-]*://)(/*)(.*?)')


def _urljoin(base, uri):
    """Resolve *uri* against *base* and return the joined URI.

    Surplus slashes directly after a leading ``scheme://`` in *uri* are
    removed before joining (e.g. ``http:////host/x`` becomes
    ``http://host/x``).

    :param base: the base URI to resolve against
    :param uri: the (possibly relative) URI to resolve
    :returns: the joined URI, or an empty string if ``urljoin`` raises
        ``ValueError`` while parsing its arguments
    """
    uri = _urifixer.sub(r'\1\3', uri)
    try:
        return urlparse.urljoin(base, uri)
    except ValueError:
        # Unparseable input is reported to the caller as an empty result.
        return ''
|
||
|
|
||
|
def _convert_to_idn(url):
    """Convert a URL to IDN notation.

    This function should only be called with a unicode string.

    Strategy: if the network location cannot be encoded in ASCII, each
    dot-separated label of the hostname is encoded with the ``idna`` codec.
    Any ``userinfo@`` prefix and ``:port`` suffix are split off first and
    re-attached unchanged, so they are never mistaken for part of the host.

    :param url: the URL to convert
    :returns: *url* unchanged when its network location is pure ASCII,
        otherwise the URL rebuilt with an IDNA-encoded hostname
    :raises UnicodeError: if a hostname label cannot be IDNA-encoded
    """
    parts = list(urlparse.urlsplit(url))
    netloc = parts[1]
    try:
        netloc.encode('ascii')
    except UnicodeEncodeError:
        pass
    else:
        # Nothing to convert.
        return url

    # Separate "user:password@" first; otherwise the colon inside the
    # credentials would be taken for the port separator.
    userinfo, at, hostport = netloc.rpartition('@')

    hostname, colon, port = hostport.rpartition(':')
    if not colon:
        # rpartition() puts the whole string in the last slot when the
        # separator is absent.
        hostname, port = hostport, ''

    labels = [
        label.encode('idna').decode('ascii')
        for label in hostname.split('.')
    ]
    parts[1] = userinfo + at + '.'.join(labels)
    if port:
        parts[1] += ':' + port
    return urlparse.urlunsplit(parts)
|
||
|
|
||
|
def _makeSafeAbsoluteURI(base, rel=None):
    """Resolve *rel* against *base*, rejecting unacceptable URI schemes.

    :param base: the base URI
    :param rel: the URI to resolve against *base*; may be empty or ``None``
    :returns:
        * the plain joined URI when ``ACCEPTABLE_URI_SCHEMES`` is empty
          (no scheme filtering is applied);
        * ``rel`` (or ``''``) as-is when *base* is empty -- note that no
          scheme filtering happens in this case;
        * *base* when *rel* is empty and *base* either has no scheme or has
          an acceptable one;
        * the joined URI when its scheme is acceptable;
        * ``''`` in every other case, including an unparseable *base*.
    """
    # bail if ACCEPTABLE_URI_SCHEMES is empty
    if not ACCEPTABLE_URI_SCHEMES:
        return _urljoin(base, rel or '')

    if not base:
        return rel or ''

    if not rel:
        try:
            scheme = urlparse.urlparse(base)[0]
        except ValueError:
            return ''
        if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
            return base
        return ''

    uri = _urljoin(base, rel)
    # URI schemes are case-insensitive, and urlparse (used in the branch
    # above) already lowercases them; normalise here too so "HTTP://..." is
    # treated the same as "http://...".
    scheme = uri.strip().split(':', 1)[0].lower()
    if scheme not in ACCEPTABLE_URI_SCHEMES:
        return ''
    return uri
|
||
|
|
||
|
class _RelativeURIResolver(_BaseHTMLProcessor):
    """Processor that resolves URI-valued attributes against a base URI."""

    # (tag, attribute) pairs whose values are passed through resolveURI().
    relative_uris = {
        ('a', 'href'),
        ('applet', 'codebase'),
        ('area', 'href'),
        ('audio', 'src'),
        ('blockquote', 'cite'),
        ('body', 'background'),
        ('del', 'cite'),
        ('form', 'action'),
        ('frame', 'longdesc'),
        ('frame', 'src'),
        ('iframe', 'longdesc'),
        ('iframe', 'src'),
        ('head', 'profile'),
        ('img', 'longdesc'),
        ('img', 'src'),
        ('img', 'usemap'),
        ('input', 'src'),
        ('input', 'usemap'),
        ('ins', 'cite'),
        ('link', 'href'),
        ('object', 'classid'),
        ('object', 'codebase'),
        ('object', 'data'),
        ('object', 'usemap'),
        ('q', 'cite'),
        ('script', 'src'),
        ('source', 'src'),
        ('video', 'poster'),
        ('video', 'src'),
    }

    def __init__(self, baseuri, encoding, _type):
        """Initialise the base processor and remember *baseuri*."""
        _BaseHTMLProcessor.__init__(self, encoding, _type)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        """Return *uri*, stripped of surrounding whitespace, made absolute
        and scheme-checked against ``self.baseuri``."""
        stripped = uri.strip()
        return _makeSafeAbsoluteURI(self.baseuri, stripped)

    def unknown_starttag(self, tag, attrs):
        """Resolve the URI attributes of *tag*, then delegate to the base
        class handler with the updated attribute list."""
        resolved = []
        for key, value in self.normalize_attrs(attrs):
            if (tag, key) in self.relative_uris:
                # When resolution yields an empty result, the original
                # attribute value is kept.
                value = self.resolveURI(value) or value
            resolved.append((key, value))
        _BaseHTMLProcessor.unknown_starttag(self, tag, resolved)
|
||
|
|
||
|
def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
    """Feed *htmlSource* through a ``_RelativeURIResolver`` built from
    *baseURI*, *encoding* and *_type*, and return the resolver's output."""
    resolver = _RelativeURIResolver(baseURI, encoding, _type)
    resolver.feed(htmlSource)
    return resolver.output()
|
||
|
|