from __future__ import absolute_import, unicode_literals import re try: import urllib.parse as urlparse except ImportError: import urlparse as urlparse from .html import _BaseHTMLProcessor # If you want feedparser to allow all URL schemes, set this to () # List culled from Python's urlparse documentation at: # http://docs.python.org/library/urlparse.html # as well as from "URI scheme" at Wikipedia: # https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme # Many more will likely need to be added! ACCEPTABLE_URI_SCHEMES = ( 'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet', 'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', 'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', 'wais', # Additional common-but-unofficial schemes 'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs', 'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg', ) #ACCEPTABLE_URI_SCHEMES = () _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)') def _urljoin(base, uri): uri = _urifixer.sub(r'\1\3', uri) try: uri = urlparse.urljoin(base, uri) except ValueError: uri = '' return uri def _convert_to_idn(url): """Convert a URL to IDN notation""" # this function should only be called with a unicode string # strategy: if the host cannot be encoded in ascii, then # it'll be necessary to encode it in idn form parts = list(urlparse.urlsplit(url)) try: parts[1].encode('ascii') except UnicodeEncodeError: # the url needs to be converted to idn notation host = parts[1].rsplit(':', 1) newhost = [] port = '' if len(host) == 2: port = host.pop() for h in host[0].split('.'): newhost.append(h.encode('idna').decode('utf-8')) parts[1] = '.'.join(newhost) if port: parts[1] += ':' + port return urlparse.urlunsplit(parts) else: return url def _makeSafeAbsoluteURI(base, rel=None): # bail if ACCEPTABLE_URI_SCHEMES is empty if not ACCEPTABLE_URI_SCHEMES: return _urljoin(base, rel or '') if not base: return rel or '' if not rel: try: scheme = urlparse.urlparse(base)[0] except ValueError: return '' if not scheme or scheme in ACCEPTABLE_URI_SCHEMES: return base return '' uri = _urljoin(base, rel) if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES: return '' return uri class _RelativeURIResolver(_BaseHTMLProcessor): relative_uris = set([('a', 'href'), ('applet', 'codebase'), ('area', 'href'), ('audio', 'src'), ('blockquote', 'cite'), ('body', 'background'), ('del', 'cite'), ('form', 'action'), ('frame', 'longdesc'), ('frame', 'src'), ('iframe', 'longdesc'), ('iframe', 'src'), ('head', 'profile'), ('img', 'longdesc'), ('img', 'src'), ('img', 'usemap'), ('input', 'src'), ('input', 'usemap'), ('ins', 'cite'), ('link', 'href'), ('object', 'classid'), ('object', 'codebase'), ('object', 'data'), ('object', 'usemap'), ('q', 'cite'), ('script', 'src'), ('source', 'src'), ('video', 'poster'), ('video', 'src')]) def __init__(self, baseuri, encoding, _type): _BaseHTMLProcessor.__init__(self, encoding, _type) self.baseuri = baseuri def resolveURI(self, uri): return _makeSafeAbsoluteURI(self.baseuri, uri.strip()) def unknown_starttag(self, tag, attrs): attrs = self.normalize_attrs(attrs) attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs] _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type): # if not _SGML_AVAILABLE: # return htmlSource p = _RelativeURIResolver(baseURI, encoding, _type) p.feed(htmlSource) return p.output()