SickGear/lib/feedparser/sgml.py

76 lines
2.6 KiB
Python

from __future__ import absolute_import
import re
__all__ = [
'_SGML_AVAILABLE',
'sgmllib',
'charref',
'tagfind',
'attrfind',
'entityref',
'incomplete',
'interesting',
'shorttag',
'shorttagopen',
'starttagopen',
'endbracket',
]
# sgmllib is not available by default in Python 3; if the end user doesn't have
# it available then we'll lose illformed XML parsing and content sanitizing
try:
import sgmllib
except ImportError:
# This is probably Python 3, which doesn't include sgmllib anymore
_SGML_AVAILABLE = 0
# Mock sgmllib enough to allow subclassing later on
class sgmllib(object):
class SGMLParser(object):
def goahead(self, i):
pass
def parse_starttag(self, i):
pass
else:
_SGML_AVAILABLE = 1
# sgmllib defines a number of module-level regular expressions that are
# insufficient for the XML parsing feedparser needs. Rather than modify
# the variables directly in sgmllib, they're defined here using the same
# names, and the compiled code objects of several sgmllib.SGMLParser
# methods are copied into _BaseHTMLProcessor so that they execute in
# feedparser's scope instead of sgmllib's scope.
charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);')
tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
attrfind = re.compile(
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?'
)
# Unfortunately, these must be copied over to prevent NameError exceptions
entityref = sgmllib.entityref
incomplete = sgmllib.incomplete
interesting = sgmllib.interesting
shorttag = sgmllib.shorttag
shorttagopen = sgmllib.shorttagopen
starttagopen = sgmllib.starttagopen
class _EndBracketRegEx:
def __init__(self):
# Overriding the built-in sgmllib.endbracket regex allows the
# parser to find angle brackets embedded in element attributes.
self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
def search(self, target, index=0):
match = self.endbracket.match(target, index)
if match is not None:
# Returning a new object in the calling thread's context
# resolves a thread-safety.
return EndBracketMatch(match)
return None
class EndBracketMatch:
def __init__(self, match):
self.match = match
def start(self, n):
return self.match.end(n)
endbracket = _EndBracketRegEx()