2023-01-12 01:04:47 +00:00
|
|
|
# The loose feed parser that interfaces with an SGML parsing library
|
2023-04-13 07:04:58 +00:00
|
|
|
# Copyright 2010-2023 Kurt McKee <contactme@kurtmckee.org>
|
2023-01-12 01:04:47 +00:00
|
|
|
# Copyright 2002-2008 Mark Pilgrim
|
|
|
|
# All rights reserved.
|
|
|
|
#
|
|
|
|
# This file is a part of feedparser.
|
|
|
|
#
|
|
|
|
# Redistribution and use in source and binary forms, with or without modification,
|
|
|
|
# are permitted provided that the following conditions are met:
|
|
|
|
#
|
|
|
|
# * Redistributions of source code must retain the above copyright notice,
|
|
|
|
# this list of conditions and the following disclaimer.
|
|
|
|
# * Redistributions in binary form must reproduce the above copyright notice,
|
|
|
|
# this list of conditions and the following disclaimer in the documentation
|
|
|
|
# and/or other materials provided with the distribution.
|
|
|
|
#
|
|
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
|
|
|
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
|
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
|
|
# POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
|
2023-04-13 07:04:58 +00:00
|
|
|
|
2023-01-13 20:16:45 +00:00
|
|
|
class LooseXMLParser:
    """Loose feed parser helpers for use with an SGML parsing library.

    The class stores the base URI, language, encoding and entity table it is
    constructed with, and offers helpers that normalize attributes and
    character references.
    """

    # Defaults to None. decode_entities() calls ``.get()`` on this attribute,
    # so it has to be rebound to a mapping before that method is used.
    contentparams = None

    # Numeric character references (decimal, lower-case hex, upper-case hex)
    # and the named entity each one is rewritten to.
    _NUMERIC_TO_NAMED = (
        ("&#60;", "&lt;"),
        ("&#x3c;", "&lt;"),
        ("&#x3C;", "&lt;"),
        ("&#62;", "&gt;"),
        ("&#x3e;", "&gt;"),
        ("&#x3E;", "&gt;"),
        ("&#38;", "&amp;"),
        ("&#x26;", "&amp;"),
        ("&#34;", "&quot;"),
        ("&#x22;", "&quot;"),
        ("&#39;", "&apos;"),
        ("&#x27;", "&apos;"),
    )

    # Entities expanded to literal characters for non-XML content. The order
    # is significant: "&amp;" is expanded after "&lt;"/"&gt;" so that a
    # doubly-escaped "&amp;lt;" ends up as "&lt;" rather than "<".
    _NAMED_TO_LITERAL = (
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&amp;", "&"),
        ("&quot;", '"'),
        ("&apos;", "'"),
        ("&#x2f;", "/"),
        ("&#x2F;", "/"),
    )

    def __init__(self, baseuri=None, baselang=None, encoding=None, entities=None):
        """Store the parsing context and continue the cooperative ``__init__`` chain.

        :param baseuri: base URI of the document; stored as ``""`` when falsy.
        :param baselang: document language; stored as ``None`` when falsy.
        :param encoding: character encoding; ``"utf-8"`` when falsy.
        :param entities: entity table; a fresh empty dict when falsy.
        """
        self.baseuri = baseuri or ""
        self.lang = baselang or None
        self.encoding = encoding or "utf-8"  # character encoding
        self.entities = entities or {}
        super().__init__()

    @staticmethod
    def _normalize_attributes(kv):
        """Return a normalized ``(name, value)`` pair for one attribute.

        The name is always lower-cased. The value is lower-cased only for the
        ``rel`` and ``type`` attributes.

        :param kv: a ``(name, value)`` pair of strings.
        :returns: the normalized ``(name, value)`` tuple.
        """
        k = kv[0].lower()
        v = kv[1].lower() if k in ("rel", "type") else kv[1]
        # the sgml parser doesn't handle entities in attributes, nor
        # does it pass the attribute values through as unicode, while
        # strict xml parsers do -- account for this difference
        v = v.replace("&amp;", "&")
        return k, v

    def decode_entities(self, element, data):
        """Canonicalize, and for non-XML content expand, character references.

        Numeric references for ``<``, ``>``, ``&``, ``"`` and ``'`` are first
        rewritten to their named entities. If the ``"type"`` entry of
        ``self.contentparams`` (default ``"xml"``) does not end with ``"xml"``,
        those named entities and the hex references for ``/`` are then
        replaced by the literal characters.

        :param element: accepted for interface compatibility; not used here.
        :param data: the text to process.
        :returns: the processed text.
        """
        for numeric, named in self._NUMERIC_TO_NAMED:
            data = data.replace(numeric, named)
        if not self.contentparams.get("type", "xml").endswith("xml"):
            for entity, literal in self._NAMED_TO_LITERAL:
                data = data.replace(entity, literal)
        return data

    @staticmethod
    def strattrs(attrs):
        """Serialize attribute pairs as `` name="value"`` fragments.

        Double quotes inside a value are escaped as ``&quot;`` so the result
        can be embedded in a double-quoted attribute.

        :param attrs: iterable of ``(name, value)`` string pairs.
        :returns: the concatenated fragments; ``""`` for no attributes.
        """
        return "".join(
            ' {}="{}"'.format(n, v.replace('"', "&quot;")) for n, v in attrs
        )