from __future__ import absolute_import, unicode_literals import re try: from html.entities import name2codepoint except ImportError: from htmlentitydefs import name2codepoint from .sgml import * _cp1252 = { 128: '\u20ac', # euro sign 130: '\u201a', # single low-9 quotation mark 131: '\u0192', # latin small letter f with hook 132: '\u201e', # double low-9 quotation mark 133: '\u2026', # horizontal ellipsis 134: '\u2020', # dagger 135: '\u2021', # double dagger 136: '\u02c6', # modifier letter circumflex accent 137: '\u2030', # per mille sign 138: '\u0160', # latin capital letter s with caron 139: '\u2039', # single left-pointing angle quotation mark 140: '\u0152', # latin capital ligature oe 142: '\u017d', # latin capital letter z with caron 145: '\u2018', # left single quotation mark 146: '\u2019', # right single quotation mark 147: '\u201c', # left double quotation mark 148: '\u201d', # right double quotation mark 149: '\u2022', # bullet 150: '\u2013', # en dash 151: '\u2014', # em dash 152: '\u02dc', # small tilde 153: '\u2122', # trade mark sign 154: '\u0161', # latin small letter s with caron 155: '\u203a', # single right-pointing angle quotation mark 156: '\u0153', # latin small ligature oe 158: '\u017e', # latin small letter z with caron 159: '\u0178', # latin capital letter y with diaeresis } class _BaseHTMLProcessor(sgmllib.SGMLParser, object): special = re.compile('''[<>'"]''') bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)") elements_no_end_tag = set([ 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame', 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr' ]) def __init__(self, encoding=None, _type='application/xhtml+xml'): if encoding: self.encoding = encoding self._type = _type super(_BaseHTMLProcessor, self).__init__() def reset(self): self.pieces = [] sgmllib.SGMLParser.reset(self) def _shorttag_replace(self, match): tag = match.group(1) if tag in self.elements_no_end_tag: return '<' + tag + ' />' else: return '<' + tag + '>' + tag + '>' # By declaring these methods and overriding their compiled code # with the code from sgmllib, the original code will execute in # feedparser's scope instead of sgmllib's. This means that the # `tagfind` and `charref` regular expressions will be found as # they're declared above, not as they're declared in sgmllib. def goahead(self, i): pass try: goahead.__code__ = sgmllib.SGMLParser.goahead.__code__ except AttributeError: goahead.func_code = sgmllib.SGMLParser.goahead.func_code def __parse_starttag(self, i): pass try: __parse_starttag.__code__ = sgmllib.SGMLParser.parse_starttag.__code__ except AttributeError: __parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code def parse_starttag(self,i): j = self.__parse_starttag(i) if self._type == 'application/xhtml+xml': if j>2 and self.rawdata[j-2:j]=='/>': self.unknown_endtag(self.lasttag) return j def feed(self, data): data = re.compile(r'\s]+?)\s*/>', self._shorttag_replace, data) data = data.replace(''', "'") data = data.replace('"', '"') sgmllib.SGMLParser.feed(self, data) sgmllib.SGMLParser.close(self) def normalize_attrs(self, attrs): if not attrs: return attrs # utility method to be called by descendants attrs = dict([(k.lower(), v) for k, v in attrs]).items() attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] attrs.sort() return attrs def unknown_starttag(self, tag, attrs): # called for each start tag # attrs is a list of (attr, value) tuples # e.g. for
, tag='pre', attrs=[('class', 'screen')] uattrs = [] strattrs='' if attrs: for key, value in attrs: value=value.replace('>','>').replace('<','<').replace('"','"') value = self.bare_ampersand.sub("&", value) uattrs.append((key, value)) strattrs = ''.join([' %s="%s"' % (key, value) for key, value in uattrs]) if tag in self.elements_no_end_tag: self.pieces.append('<%s%s />' % (tag, strattrs)) else: self.pieces.append('<%s%s>' % (tag, strattrs)) def unknown_endtag(self, tag): # called for each end tag, e.g. for, tag will be 'pre' # Reconstruct the original end tag. if tag not in self.elements_no_end_tag: self.pieces.append("%s>" % tag) def handle_charref(self, ref): # called for each character reference, e.g. for ' ', ref will be '160' # Reconstruct the original character reference. ref = ref.lower() if ref.startswith('x'): value = int(ref[1:], 16) else: value = int(ref) if value in _cp1252: self.pieces.append('%s;' % hex(ord(_cp1252[value]))[1:]) else: self.pieces.append('%s;' % ref) def handle_entityref(self, ref): # called for each entity reference, e.g. for '©', ref will be 'copy' # Reconstruct the original entity reference. if ref in name2codepoint or ref == 'apos': self.pieces.append('&%s;' % ref) else: self.pieces.append('&%s' % ref) def handle_data(self, text): # called for each block of plain text, i.e. outside of any tag and # not containing any character or entity references # Store the original text verbatim. self.pieces.append(text) def handle_comment(self, text): # called for each HTML comment, e.g. # Reconstruct the original comment. self.pieces.append('' % text) def handle_pi(self, text): # called for each processing instruction, e.g. # Reconstruct original processing instruction. self.pieces.append('%s>' % text) def handle_decl(self, text): # called for the DOCTYPE, if present, e.g. # # Reconstruct original DOCTYPE self.pieces.append('' % text) _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match def _scan_name(self, i, declstartpos): rawdata = self.rawdata n = len(rawdata) if i == n: return None, -1 m = self._new_declname_match(rawdata, i) if m: s = m.group() name = s.strip() if (i + len(s)) == n: return None, -1 # end of buffer return name.lower(), m.end() else: self.handle_data(rawdata) # self.updatepos(declstartpos, i) return None, -1 def convert_charref(self, name): return '%s;' % name def convert_entityref(self, name): return '&%s;' % name def output(self): '''Return processed HTML as a single string''' return ''.join(self.pieces) def parse_declaration(self, i): try: return sgmllib.SGMLParser.parse_declaration(self, i) except sgmllib.SGMLParseError: # escape the doctype declaration and continue parsing self.handle_data('<') return i+1