2023-04-13 07:04:58 +00:00
|
|
|
# Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
|
2023-01-12 01:04:47 +00:00
|
|
|
# Copyright 2002-2008 Mark Pilgrim
|
|
|
|
# All rights reserved.
|
|
|
|
#
|
|
|
|
# This file is a part of feedparser.
|
|
|
|
#
|
|
|
|
# Redistribution and use in source and binary forms, with or without
|
|
|
|
# modification, are permitted provided that the following conditions are met:
|
|
|
|
#
|
|
|
|
# * Redistributions of source code must retain the above copyright notice,
|
|
|
|
# this list of conditions and the following disclaimer.
|
|
|
|
# * Redistributions in binary form must reproduce the above copyright notice,
|
|
|
|
# this list of conditions and the following disclaimer in the documentation
|
|
|
|
# and/or other materials provided with the distribution.
|
|
|
|
#
|
|
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
|
|
|
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
|
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
|
|
# POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
2023-04-13 07:04:58 +00:00
|
|
|
import sgmllib3k as sgmllib
|
2023-01-12 01:04:47 +00:00
|
|
|
|
|
|
|
# Names exported by ``from <this module> import *``: the sgmllib module
# itself plus the regular-expression objects defined below under the same
# names sgmllib uses.
__all__ = [
    "sgmllib",
    "charref",
    "tagfind",
    "attrfind",
    "entityref",
    "incomplete",
    "interesting",
    "shorttag",
    "shorttagopen",
    "starttagopen",
    "endbracket",
]
|
|
|
|
|
|
|
|
# sgmllib defines a number of module-level regular expressions that are
# insufficient for the XML parsing feedparser needs. Rather than modify
# the variables directly in sgmllib, they're defined here using the same
# names, and the compiled code objects of several sgmllib.SGMLParser
# methods are copied into _BaseHTMLProcessor so that they execute in
# feedparser's scope instead of sgmllib's scope.

# Numeric character reference: "&#" + decimal digits, or "x"/"X" + hex
# digits, terminated by ";". Group 1 is the text between "&#" and ";".
charref = re.compile(r"&#(\d+|[xX][0-9a-fA-F]+);")

# Tag name: an ASCII letter followed by letters, digits, "-", "_", "." or ":".
tagfind = re.compile(r"[a-zA-Z][-_.:a-zA-Z0-9]*")

# One attribute: optional leading whitespace, the attribute name (group 1),
# an optional literal "$", then an optional "= value" part (group 2) whose
# value (group 3) is single-quoted, double-quoted, or an unquoted run of the
# characters listed in the final character class.
attrfind = re.compile(
    r"""\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*"""
    r"""('[^']*'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$()_#=~'"@]*))?"""
)
|
|
|
|
|
|
|
|
# Unfortunately, these must be copied over to prevent NameError exceptions
# NOTE(review): the stock sgmllib module defines these regexes as
# module-level globals rather than SGMLParser attributes -- confirm that the
# imported sgmllib3k exposes them on the class as accessed here.
entityref = sgmllib.SGMLParser.entityref
incomplete = sgmllib.SGMLParser.incomplete
interesting = sgmllib.SGMLParser.interesting
shorttag = sgmllib.SGMLParser.shorttag
shorttagopen = sgmllib.SGMLParser.shorttagopen
starttagopen = sgmllib.SGMLParser.starttagopen
|
2023-01-12 01:04:47 +00:00
|
|
|
|
|
|
|
|
|
|
|
class _EndBracketRegEx:
|
|
|
|
def __init__(self):
|
|
|
|
# Overriding the built-in sgmllib.endbracket regex allows the
|
|
|
|
# parser to find angle brackets embedded in element attributes.
|
|
|
|
self.endbracket = re.compile(
|
2023-04-13 07:04:58 +00:00
|
|
|
r"("
|
2023-01-12 01:04:47 +00:00
|
|
|
r"""[^'"<>]"""
|
|
|
|
r"""|"[^"]*"(?=>|/|\s|\w+=)"""
|
|
|
|
r"""|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])"""
|
|
|
|
r"""|.*?(?=[<>]"""
|
2023-04-13 07:04:58 +00:00
|
|
|
r")"
|
2023-01-12 01:04:47 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
def search(self, target, index=0):
|
|
|
|
match = self.endbracket.match(target, index)
|
|
|
|
if match is not None:
|
|
|
|
# Returning a new object in the calling thread's context
|
2023-01-13 20:16:45 +00:00
|
|
|
# resolves a thread-safety issue.
|
2023-01-12 01:04:47 +00:00
|
|
|
return EndBracketMatch(match)
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
class EndBracketMatch:
    """Wrap a regex match so that ``start()`` reports where the match ends.

    ``_EndBracketRegEx.search()`` anchors its pattern at the search index
    and the pattern stops just before a ``<`` or ``>``, so the bracket's
    offset is the *end* of the match. Presumably callers written against
    sgmllib's own ``endbracket`` ask for ``start()`` to locate the bracket;
    this wrapper answers that call with the end offset instead.
    """

    def __init__(self, match):
        """Store the underlying regex match object.

        :param match: The match object to wrap; it must provide ``end()``.
        """

        self.match = match

    def start(self, n=0):
        """Return the end offset of group *n* of the wrapped match.

        :param n: Group number or name; defaults to ``0`` (the whole
            match), mirroring ``re.Match.start``.
        :returns: ``self.match.end(n)``.
        """

        return self.match.end(n)
|
|
|
|
|
|
|
|
|
|
|
|
# Module-level instance exported as "endbracket" (see __all__); it offers a
# search() method in place of a plain compiled pattern.
endbracket = _EndBracketRegEx()
|