Merge branch 'feature/UpdateSoupsieve' into dev

This commit is contained in:
JackDandy 2023-02-09 14:42:00 +00:00
commit 062ccbeacc
8 changed files with 894 additions and 399 deletions

View file

@ -19,6 +19,7 @@
* Update UnRar x64 for Windows 6.11 to 6.20 * Update UnRar x64 for Windows 6.11 to 6.20
* Update Send2Trash 1.5.0 (66afce7) to 1.8.1b0 (0ef9b32) * Update Send2Trash 1.5.0 (66afce7) to 1.8.1b0 (0ef9b32)
* Update SimpleJSON 3.16.1 (ce75e60) to 3.18.1 (c891b95) * Update SimpleJSON 3.16.1 (ce75e60) to 3.18.1 (c891b95)
* Update soupsieve 2.0.2.dev (05086ef) to 2.3.2.post1 (792d566)
* Update tmdbsimple 2.6.6 (679e343) to 2.9.1 (9da400a) * Update tmdbsimple 2.6.6 (679e343) to 2.9.1 (9da400a)
* Update torrent_parser 0.3.0 (2a4eecb) to 0.4.0 (23b9e11) * Update torrent_parser 0.3.0 (2a4eecb) to 0.4.0 (23b9e11)
* Update unidecode module 1.1.1 (632af82) to 1.3.6 (4141992) * Update unidecode module 1.1.1 (632af82) to 1.3.6 (4141992)

View file

@ -25,11 +25,14 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE. SOFTWARE.
""" """
from __future__ import annotations
from .__meta__ import __version__, __version_info__ # noqa: F401 from .__meta__ import __version__, __version_info__ # noqa: F401
from . import css_parser as cp from . import css_parser as cp
from . import css_match as cm from . import css_match as cm
from . import css_types as ct from . import css_types as ct
from .util import DEBUG, SelectorSyntaxError # noqa: F401 from .util import DEBUG, SelectorSyntaxError # noqa: F401
import bs4 # type: ignore[import]
from typing import Optional, Any, Iterator, Iterable
__all__ = ( __all__ = (
'DEBUG', 'SelectorSyntaxError', 'SoupSieve', 'DEBUG', 'SelectorSyntaxError', 'SoupSieve',
@ -40,15 +43,18 @@ __all__ = (
SoupSieve = cm.SoupSieve SoupSieve = cm.SoupSieve
def compile(pattern, namespaces=None, flags=0, **kwargs): # noqa: A001 def compile( # noqa: A001
pattern: str,
namespaces: Optional[dict[str, str]] = None,
flags: int = 0,
*,
custom: Optional[dict[str, str]] = None,
**kwargs: Any
) -> cm.SoupSieve:
"""Compile CSS pattern.""" """Compile CSS pattern."""
if namespaces is not None: ns = ct.Namespaces(namespaces) if namespaces is not None else namespaces # type: Optional[ct.Namespaces]
namespaces = ct.Namespaces(**namespaces) cs = ct.CustomSelectors(custom) if custom is not None else custom # type: Optional[ct.CustomSelectors]
custom = kwargs.get('custom')
if custom is not None:
custom = ct.CustomSelectors(**custom)
if isinstance(pattern, SoupSieve): if isinstance(pattern, SoupSieve):
if flags: if flags:
@ -59,53 +65,103 @@ def compile(pattern, namespaces=None, flags=0, **kwargs): # noqa: A001
raise ValueError("Cannot process 'custom' argument on a compiled selector list") raise ValueError("Cannot process 'custom' argument on a compiled selector list")
return pattern return pattern
return cp._cached_css_compile(pattern, namespaces, custom, flags) return cp._cached_css_compile(pattern, ns, cs, flags)
def purge(): def purge() -> None:
"""Purge cached patterns.""" """Purge cached patterns."""
cp._purge_cache() cp._purge_cache()
def closest(select, tag, namespaces=None, flags=0, **kwargs): def closest(
select: str,
tag: 'bs4.Tag',
namespaces: Optional[dict[str, str]] = None,
flags: int = 0,
*,
custom: Optional[dict[str, str]] = None,
**kwargs: Any
) -> 'bs4.Tag':
"""Match closest ancestor.""" """Match closest ancestor."""
return compile(select, namespaces, flags, **kwargs).closest(tag) return compile(select, namespaces, flags, **kwargs).closest(tag)
def match(select, tag, namespaces=None, flags=0, **kwargs): def match(
select: str,
tag: 'bs4.Tag',
namespaces: Optional[dict[str, str]] = None,
flags: int = 0,
*,
custom: Optional[dict[str, str]] = None,
**kwargs: Any
) -> bool:
"""Match node.""" """Match node."""
return compile(select, namespaces, flags, **kwargs).match(tag) return compile(select, namespaces, flags, **kwargs).match(tag)
def filter(select, iterable, namespaces=None, flags=0, **kwargs): # noqa: A001 def filter( # noqa: A001
select: str,
iterable: Iterable['bs4.Tag'],
namespaces: Optional[dict[str, str]] = None,
flags: int = 0,
*,
custom: Optional[dict[str, str]] = None,
**kwargs: Any
) -> list['bs4.Tag']:
"""Filter list of nodes.""" """Filter list of nodes."""
return compile(select, namespaces, flags, **kwargs).filter(iterable) return compile(select, namespaces, flags, **kwargs).filter(iterable)
def select_one(select, tag, namespaces=None, flags=0, **kwargs): def select_one(
select: str,
tag: 'bs4.Tag',
namespaces: Optional[dict[str, str]] = None,
flags: int = 0,
*,
custom: Optional[dict[str, str]] = None,
**kwargs: Any
) -> 'bs4.Tag':
"""Select a single tag.""" """Select a single tag."""
return compile(select, namespaces, flags, **kwargs).select_one(tag) return compile(select, namespaces, flags, **kwargs).select_one(tag)
def select(select, tag, namespaces=None, limit=0, flags=0, **kwargs): def select(
select: str,
tag: 'bs4.Tag',
namespaces: Optional[dict[str, str]] = None,
limit: int = 0,
flags: int = 0,
*,
custom: Optional[dict[str, str]] = None,
**kwargs: Any
) -> list['bs4.Tag']:
"""Select the specified tags.""" """Select the specified tags."""
return compile(select, namespaces, flags, **kwargs).select(tag, limit) return compile(select, namespaces, flags, **kwargs).select(tag, limit)
def iselect(select, tag, namespaces=None, limit=0, flags=0, **kwargs): def iselect(
select: str,
tag: 'bs4.Tag',
namespaces: Optional[dict[str, str]] = None,
limit: int = 0,
flags: int = 0,
*,
custom: Optional[dict[str, str]] = None,
**kwargs: Any
) -> Iterator['bs4.Tag']:
"""Iterate the specified tags.""" """Iterate the specified tags."""
for el in compile(select, namespaces, flags, **kwargs).iselect(tag, limit): for el in compile(select, namespaces, flags, **kwargs).iselect(tag, limit):
yield el yield el
def escape(ident): def escape(ident: str) -> str:
"""Escape identifier.""" """Escape identifier."""
return cp.escape(ident) return cp.escape(ident)

View file

@ -1,4 +1,5 @@
"""Meta related things.""" """Meta related things."""
from __future__ import annotations
from collections import namedtuple from collections import namedtuple
import re import re
@ -79,7 +80,11 @@ class Version(namedtuple("Version", ["major", "minor", "micro", "release", "pre"
""" """
def __new__(cls, major, minor, micro, release="final", pre=0, post=0, dev=0): def __new__(
cls,
major: int, minor: int, micro: int, release: str = "final",
pre: int = 0, post: int = 0, dev: int = 0
) -> Version:
"""Validate version info.""" """Validate version info."""
# Ensure all parts are positive integers. # Ensure all parts are positive integers.
@ -115,27 +120,27 @@ class Version(namedtuple("Version", ["major", "minor", "micro", "release", "pre"
return super(Version, cls).__new__(cls, major, minor, micro, release, pre, post, dev) return super(Version, cls).__new__(cls, major, minor, micro, release, pre, post, dev)
def _is_pre(self): def _is_pre(self) -> bool:
"""Is prerelease.""" """Is prerelease."""
return self.pre > 0 return bool(self.pre > 0)
def _is_dev(self): def _is_dev(self) -> bool:
"""Is development.""" """Is development."""
return bool(self.release < "alpha") return bool(self.release < "alpha")
def _is_post(self): def _is_post(self) -> bool:
"""Is post.""" """Is post."""
return self.post > 0 return bool(self.post > 0)
def _get_dev_status(self): # pragma: no cover def _get_dev_status(self) -> str: # pragma: no cover
"""Get development status string.""" """Get development status string."""
return DEV_STATUS[self.release] return DEV_STATUS[self.release]
def _get_canonical(self): def _get_canonical(self) -> str:
"""Get the canonical output string.""" """Get the canonical output string."""
# Assemble major, minor, micro version and append `pre`, `post`, or `dev` if needed.. # Assemble major, minor, micro version and append `pre`, `post`, or `dev` if needed..
@ -153,11 +158,14 @@ class Version(namedtuple("Version", ["major", "minor", "micro", "release", "pre"
return ver return ver
def parse_version(ver, pre=False): def parse_version(ver: str) -> Version:
"""Parse version into a comparable Version tuple.""" """Parse version into a comparable Version tuple."""
m = RE_VER.match(ver) m = RE_VER.match(ver)
if m is None:
raise ValueError("'{}' is not a valid version".format(ver))
# Handle major, minor, micro # Handle major, minor, micro
major = int(m.group('major')) major = int(m.group('major'))
minor = int(m.group('minor')) if m.group('minor') else 0 minor = int(m.group('minor')) if m.group('minor') else 0
@ -185,5 +193,5 @@ def parse_version(ver, pre=False):
return Version(major, minor, micro, release, pre, post, dev) return Version(major, minor, micro, release, pre, post, dev)
__version_info__ = Version(2, 0, 2, ".dev") __version_info__ = Version(2, 5, 0, "final", post=1)
__version__ = __version_info__._get_canonical() __version__ = __version_info__._get_canonical()

File diff suppressed because it is too large Load diff

View file

@ -1,10 +1,13 @@
"""CSS selector parser.""" """CSS selector parser."""
from __future__ import annotations
import re import re
from functools import lru_cache from functools import lru_cache
from . import util from . import util
from . import css_match as cm from . import css_match as cm
from . import css_types as ct from . import css_types as ct
from .util import SelectorSyntaxError from .util import SelectorSyntaxError
import warnings
from typing import Optional, Match, Any, Iterator, cast
UNICODE_REPLACEMENT_CHAR = 0xFFFD UNICODE_REPLACEMENT_CHAR = 0xFFFD
@ -59,6 +62,8 @@ PSEUDO_SIMPLE_NO_MATCH = {
# Complex pseudo classes that take selector lists # Complex pseudo classes that take selector lists
PSEUDO_COMPLEX = { PSEUDO_COMPLEX = {
':contains', ':contains',
':-soup-contains',
':-soup-contains-own',
':has', ':has',
':is', ':is',
':matches', ':matches',
@ -193,32 +198,42 @@ FLG_OPEN = 0x40
FLG_IN_RANGE = 0x80 FLG_IN_RANGE = 0x80
FLG_OUT_OF_RANGE = 0x100 FLG_OUT_OF_RANGE = 0x100
FLG_PLACEHOLDER_SHOWN = 0x200 FLG_PLACEHOLDER_SHOWN = 0x200
FLG_FORGIVE = 0x400
# Maximum cached patterns to store # Maximum cached patterns to store
_MAXCACHE = 500 _MAXCACHE = 500
@lru_cache(maxsize=_MAXCACHE) @lru_cache(maxsize=_MAXCACHE)
def _cached_css_compile(pattern, namespaces, custom, flags): def _cached_css_compile(
pattern: str,
namespaces: Optional[ct.Namespaces],
custom: Optional[ct.CustomSelectors],
flags: int
) -> cm.SoupSieve:
"""Cached CSS compile.""" """Cached CSS compile."""
custom_selectors = process_custom(custom) custom_selectors = process_custom(custom)
return cm.SoupSieve( return cm.SoupSieve(
pattern, pattern,
CSSParser(pattern, custom=custom_selectors, flags=flags).process_selectors(), CSSParser(
pattern,
custom=custom_selectors,
flags=flags
).process_selectors(),
namespaces, namespaces,
custom, custom,
flags flags
) )
def _purge_cache(): def _purge_cache() -> None:
"""Purge the cache.""" """Purge the cache."""
_cached_css_compile.cache_clear() _cached_css_compile.cache_clear()
def process_custom(custom): def process_custom(custom: Optional[ct.CustomSelectors]) -> dict[str, str | ct.SelectorList]:
"""Process custom.""" """Process custom."""
custom_selectors = {} custom_selectors = {}
@ -233,14 +248,14 @@ def process_custom(custom):
return custom_selectors return custom_selectors
def css_unescape(content, string=False): def css_unescape(content: str, string: bool = False) -> str:
""" """
Unescape CSS value. Unescape CSS value.
Strings allow for spanning the value on multiple strings by escaping a new line. Strings allow for spanning the value on multiple strings by escaping a new line.
""" """
def replace(m): def replace(m: Match[str]) -> str:
"""Replace with the appropriate substitute.""" """Replace with the appropriate substitute."""
if m.group(1): if m.group(1):
@ -260,7 +275,7 @@ def css_unescape(content, string=False):
return (RE_CSS_ESC if not string else RE_CSS_STR_ESC).sub(replace, content) return (RE_CSS_ESC if not string else RE_CSS_STR_ESC).sub(replace, content)
def escape(ident): def escape(ident: str) -> str:
"""Escape identifier.""" """Escape identifier."""
string = [] string = []
@ -288,21 +303,21 @@ def escape(ident):
return ''.join(string) return ''.join(string)
class SelectorPattern(object): class SelectorPattern:
"""Selector pattern.""" """Selector pattern."""
def __init__(self, name, pattern): def __init__(self, name: str, pattern: str) -> None:
"""Initialize.""" """Initialize."""
self.name = name self.name = name
self.re_pattern = re.compile(pattern, re.I | re.X | re.U) self.re_pattern = re.compile(pattern, re.I | re.X | re.U)
def get_name(self): def get_name(self) -> str:
"""Get name.""" """Get name."""
return self.name return self.name
def match(self, selector, index, flags): def match(self, selector: str, index: int, flags: int) -> Optional[Match[str]]:
"""Match the selector.""" """Match the selector."""
return self.re_pattern.match(selector, index) return self.re_pattern.match(selector, index)
@ -311,7 +326,7 @@ class SelectorPattern(object):
class SpecialPseudoPattern(SelectorPattern): class SpecialPseudoPattern(SelectorPattern):
"""Selector pattern.""" """Selector pattern."""
def __init__(self, patterns): def __init__(self, patterns: tuple[tuple[str, tuple[str, ...], str, type[SelectorPattern]], ...]) -> None:
"""Initialize.""" """Initialize."""
self.patterns = {} self.patterns = {}
@ -321,15 +336,15 @@ class SpecialPseudoPattern(SelectorPattern):
for pseudo in p[1]: for pseudo in p[1]:
self.patterns[pseudo] = pattern self.patterns[pseudo] = pattern
self.matched_name = None self.matched_name = None # type: Optional[SelectorPattern]
self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U) self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U)
def get_name(self): def get_name(self) -> str:
"""Get name.""" """Get name."""
return self.matched_name.get_name() return '' if self.matched_name is None else self.matched_name.get_name()
def match(self, selector, index, flags): def match(self, selector: str, index: int, flags: int) -> Optional[Match[str]]:
"""Match the selector.""" """Match the selector."""
pseudo = None pseudo = None
@ -345,7 +360,7 @@ class SpecialPseudoPattern(SelectorPattern):
return pseudo return pseudo
class _Selector(object): class _Selector:
""" """
Intermediate selector class. Intermediate selector class.
@ -354,23 +369,23 @@ class _Selector(object):
the data in an object that can be pickled and hashed. the data in an object that can be pickled and hashed.
""" """
def __init__(self, **kwargs): def __init__(self, **kwargs: Any) -> None:
"""Initialize.""" """Initialize."""
self.tag = kwargs.get('tag', None) self.tag = kwargs.get('tag', None) # type: Optional[ct.SelectorTag]
self.ids = kwargs.get('ids', []) self.ids = kwargs.get('ids', []) # type: list[str]
self.classes = kwargs.get('classes', []) self.classes = kwargs.get('classes', []) # type: list[str]
self.attributes = kwargs.get('attributes', []) self.attributes = kwargs.get('attributes', []) # type: list[ct.SelectorAttribute]
self.nth = kwargs.get('nth', []) self.nth = kwargs.get('nth', []) # type: list[ct.SelectorNth]
self.selectors = kwargs.get('selectors', []) self.selectors = kwargs.get('selectors', []) # type: list[ct.SelectorList]
self.relations = kwargs.get('relations', []) self.relations = kwargs.get('relations', []) # type: list[_Selector]
self.rel_type = kwargs.get('rel_type', None) self.rel_type = kwargs.get('rel_type', None) # type: Optional[str]
self.contains = kwargs.get('contains', []) self.contains = kwargs.get('contains', []) # type: list[ct.SelectorContains]
self.lang = kwargs.get('lang', []) self.lang = kwargs.get('lang', []) # type: list[ct.SelectorLang]
self.flags = kwargs.get('flags', 0) self.flags = kwargs.get('flags', 0) # type: int
self.no_match = kwargs.get('no_match', False) self.no_match = kwargs.get('no_match', False) # type: bool
def _freeze_relations(self, relations): def _freeze_relations(self, relations: list[_Selector]) -> ct.SelectorList:
"""Freeze relation.""" """Freeze relation."""
if relations: if relations:
@ -380,7 +395,7 @@ class _Selector(object):
else: else:
return ct.SelectorList() return ct.SelectorList()
def freeze(self): def freeze(self) -> ct.Selector | ct.SelectorNull:
"""Freeze self.""" """Freeze self."""
if self.no_match: if self.no_match:
@ -400,7 +415,7 @@ class _Selector(object):
self.flags self.flags
) )
def __str__(self): # pragma: no cover def __str__(self) -> str: # pragma: no cover
"""String representation.""" """String representation."""
return ( return (
@ -414,14 +429,19 @@ class _Selector(object):
__repr__ = __str__ __repr__ = __str__
class CSSParser(object): class CSSParser:
"""Parse CSS selectors.""" """Parse CSS selectors."""
css_tokens = ( css_tokens = (
SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE), SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE),
SpecialPseudoPattern( SpecialPseudoPattern(
( (
("pseudo_contains", (':contains',), PAT_PSEUDO_CONTAINS, SelectorPattern), (
"pseudo_contains",
(':contains', ':-soup-contains', ':-soup-contains-own'),
PAT_PSEUDO_CONTAINS,
SelectorPattern
),
("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD, SelectorPattern), ("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD, SelectorPattern),
("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE, SelectorPattern), ("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE, SelectorPattern),
("pseudo_lang", (':lang',), PAT_PSEUDO_LANG, SelectorPattern), ("pseudo_lang", (':lang',), PAT_PSEUDO_LANG, SelectorPattern),
@ -439,7 +459,12 @@ class CSSParser(object):
SelectorPattern("combine", PAT_COMBINE) SelectorPattern("combine", PAT_COMBINE)
) )
def __init__(self, selector, custom=None, flags=0): def __init__(
self,
selector: str,
custom: Optional[dict[str, str | ct.SelectorList]] = None,
flags: int = 0
) -> None:
"""Initialize.""" """Initialize."""
self.pattern = selector.replace('\x00', '\ufffd') self.pattern = selector.replace('\x00', '\ufffd')
@ -447,7 +472,7 @@ class CSSParser(object):
self.debug = self.flags & util.DEBUG self.debug = self.flags & util.DEBUG
self.custom = {} if custom is None else custom self.custom = {} if custom is None else custom
def parse_attribute_selector(self, sel, m, has_selector): def parse_attribute_selector(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
"""Create attribute selector from the returned regex match.""" """Create attribute selector from the returned regex match."""
inverse = False inverse = False
@ -457,22 +482,22 @@ class CSSParser(object):
attr = css_unescape(m.group('attr_name')) attr = css_unescape(m.group('attr_name'))
is_type = False is_type = False
pattern2 = None pattern2 = None
value = ''
if case: if case:
flags = re.I if case == 'i' else 0 flags = (re.I if case == 'i' else 0) | re.DOTALL
elif util.lower(attr) == 'type': elif util.lower(attr) == 'type':
flags = re.I flags = re.I | re.DOTALL
is_type = True is_type = True
else: else:
flags = 0 flags = re.DOTALL
if op: if op:
if m.group('value').startswith(('"', "'")): if m.group('value').startswith(('"', "'")):
value = css_unescape(m.group('value')[1:-1], True) value = css_unescape(m.group('value')[1:-1], True)
else: else:
value = css_unescape(m.group('value')) value = css_unescape(m.group('value'))
else:
value = None
if not op: if not op:
# Attribute name # Attribute name
pattern = None pattern = None
@ -517,7 +542,7 @@ class CSSParser(object):
has_selector = True has_selector = True
return has_selector return has_selector
def parse_tag_pattern(self, sel, m, has_selector): def parse_tag_pattern(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
"""Parse tag pattern from regex match.""" """Parse tag pattern from regex match."""
prefix = css_unescape(m.group('tag_ns')[:-1]) if m.group('tag_ns') else None prefix = css_unescape(m.group('tag_ns')[:-1]) if m.group('tag_ns') else None
@ -526,7 +551,7 @@ class CSSParser(object):
has_selector = True has_selector = True
return has_selector return has_selector
def parse_pseudo_class_custom(self, sel, m, has_selector): def parse_pseudo_class_custom(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
""" """
Parse custom pseudo class alias. Parse custom pseudo class alias.
@ -538,13 +563,13 @@ class CSSParser(object):
selector = self.custom.get(pseudo) selector = self.custom.get(pseudo)
if selector is None: if selector is None:
raise SelectorSyntaxError( raise SelectorSyntaxError(
"Undefined custom selector '{}' found at postion {}".format(pseudo, m.end(0)), "Undefined custom selector '{}' found at position {}".format(pseudo, m.end(0)),
self.pattern, self.pattern,
m.end(0) m.end(0)
) )
if not isinstance(selector, ct.SelectorList): if not isinstance(selector, ct.SelectorList):
self.custom[pseudo] = None del self.custom[pseudo]
selector = CSSParser( selector = CSSParser(
selector, custom=self.custom, flags=self.flags selector, custom=self.custom, flags=self.flags
).process_selectors(flags=FLG_PSEUDO) ).process_selectors(flags=FLG_PSEUDO)
@ -554,7 +579,14 @@ class CSSParser(object):
has_selector = True has_selector = True
return has_selector return has_selector
def parse_pseudo_class(self, sel, m, has_selector, iselector, is_html): def parse_pseudo_class(
self,
sel: _Selector,
m: Match[str],
has_selector: bool,
iselector: Iterator[tuple[str, Match[str]]],
is_html: bool
) -> tuple[bool, bool]:
"""Parse pseudo class.""" """Parse pseudo class."""
complex_pseudo = False complex_pseudo = False
@ -642,7 +674,13 @@ class CSSParser(object):
return has_selector, is_html return has_selector, is_html
def parse_pseudo_nth(self, sel, m, has_selector, iselector): def parse_pseudo_nth(
self,
sel: _Selector,
m: Match[str],
has_selector: bool,
iselector: Iterator[tuple[str, Match[str]]]
) -> bool:
"""Parse `nth` pseudo.""" """Parse `nth` pseudo."""
mdict = m.groupdict() mdict = m.groupdict()
@ -663,29 +701,29 @@ class CSSParser(object):
s2 = 1 s2 = 1
var = True var = True
else: else:
nth_parts = RE_NTH.match(content) nth_parts = cast(Match[str], RE_NTH.match(content))
s1 = '-' if nth_parts.group('s1') and nth_parts.group('s1') == '-' else '' _s1 = '-' if nth_parts.group('s1') and nth_parts.group('s1') == '-' else ''
a = nth_parts.group('a') a = nth_parts.group('a')
var = a.endswith('n') var = a.endswith('n')
if a.startswith('n'): if a.startswith('n'):
s1 += '1' _s1 += '1'
elif var: elif var:
s1 += a[:-1] _s1 += a[:-1]
else: else:
s1 += a _s1 += a
s2 = '-' if nth_parts.group('s2') and nth_parts.group('s2') == '-' else '' _s2 = '-' if nth_parts.group('s2') and nth_parts.group('s2') == '-' else ''
if nth_parts.group('b'): if nth_parts.group('b'):
s2 += nth_parts.group('b') _s2 += nth_parts.group('b')
else: else:
s2 = '0' _s2 = '0'
s1 = int(s1, 10) s1 = int(_s1, 10)
s2 = int(s2, 10) s2 = int(_s2, 10)
pseudo_sel = mdict['name'] pseudo_sel = mdict['name']
if postfix == '_child': if postfix == '_child':
if m.group('of'): if m.group('of'):
# Parse the rest of `of S`. # Parse the rest of `of S`.
nth_sel = self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN) nth_sel = self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN | FLG_FORGIVE)
else: else:
# Use default `*|*` for `of S`. # Use default `*|*` for `of S`.
nth_sel = CSS_NTH_OF_S_DEFAULT nth_sel = CSS_NTH_OF_S_DEFAULT
@ -701,20 +739,38 @@ class CSSParser(object):
has_selector = True has_selector = True
return has_selector return has_selector
def parse_pseudo_open(self, sel, name, has_selector, iselector, index): def parse_pseudo_open(
self,
sel: _Selector,
name: str,
has_selector: bool,
iselector: Iterator[tuple[str, Match[str]]],
index: int
) -> bool:
"""Parse pseudo with opening bracket.""" """Parse pseudo with opening bracket."""
flags = FLG_PSEUDO | FLG_OPEN flags = FLG_PSEUDO | FLG_OPEN
if name == ':not': if name == ':not':
flags |= FLG_NOT flags |= FLG_NOT
if name == ':has': elif name == ':has':
flags |= FLG_RELATIVE flags |= FLG_RELATIVE | FLG_FORGIVE
elif name in (':where', ':is'):
flags |= FLG_FORGIVE
sel.selectors.append(self.parse_selectors(iselector, index, flags)) sel.selectors.append(self.parse_selectors(iselector, index, flags))
has_selector = True has_selector = True
return has_selector return has_selector
def parse_has_combinator(self, sel, m, has_selector, selectors, rel_type, index): def parse_has_combinator(
self,
sel: _Selector,
m: Match[str],
has_selector: bool,
selectors: list[_Selector],
rel_type: str,
index: int
) -> tuple[bool, _Selector, str]:
"""Parse combinator tokens.""" """Parse combinator tokens."""
combinator = m.group('relation').strip() combinator = m.group('relation').strip()
@ -723,12 +779,9 @@ class CSSParser(object):
if combinator == COMMA_COMBINATOR: if combinator == COMMA_COMBINATOR:
if not has_selector: if not has_selector:
# If we've not captured any selector parts, the comma is either at the beginning of the pattern # If we've not captured any selector parts, the comma is either at the beginning of the pattern
# or following another comma, both of which are unexpected. Commas must split selectors. # or following another comma, both of which are unexpected. But shouldn't fail the pseudo-class.
raise SelectorSyntaxError( sel.no_match = True
"The combinator '{}' at postion {}, must have a selector before it".format(combinator, index),
self.pattern,
index
)
sel.rel_type = rel_type sel.rel_type = rel_type
selectors[-1].relations.append(sel) selectors[-1].relations.append(sel)
rel_type = ":" + WS_COMBINATOR rel_type = ":" + WS_COMBINATOR
@ -749,44 +802,63 @@ class CSSParser(object):
self.pattern, self.pattern,
index index
) )
# Set the leading combinator for the next selector. # Set the leading combinator for the next selector.
rel_type = ':' + combinator rel_type = ':' + combinator
sel = _Selector()
sel = _Selector()
has_selector = False has_selector = False
return has_selector, sel, rel_type return has_selector, sel, rel_type
def parse_combinator(self, sel, m, has_selector, selectors, relations, is_pseudo, index): def parse_combinator(
self,
sel: _Selector,
m: Match[str],
has_selector: bool,
selectors: list[_Selector],
relations: list[_Selector],
is_pseudo: bool,
is_forgive: bool,
index: int
) -> tuple[bool, _Selector]:
"""Parse combinator tokens.""" """Parse combinator tokens."""
combinator = m.group('relation').strip() combinator = m.group('relation').strip()
if not combinator: if not combinator:
combinator = WS_COMBINATOR combinator = WS_COMBINATOR
if not has_selector: if not has_selector:
raise SelectorSyntaxError( if not is_forgive or combinator != COMMA_COMBINATOR:
"The combinator '{}' at postion {}, must have a selector before it".format(combinator, index), raise SelectorSyntaxError(
self.pattern, "The combinator '{}' at position {}, must have a selector before it".format(combinator, index),
index self.pattern,
) index
)
if combinator == COMMA_COMBINATOR: # If we are in a forgiving pseudo class, just make the selector a "no match"
if not sel.tag and not is_pseudo: if combinator == COMMA_COMBINATOR:
# Implied `*` sel.no_match = True
sel.tag = ct.SelectorTag('*', None) del relations[:]
sel.relations.extend(relations) selectors.append(sel)
selectors.append(sel)
del relations[:]
else: else:
sel.relations.extend(relations) if combinator == COMMA_COMBINATOR:
sel.rel_type = combinator if not sel.tag and not is_pseudo:
del relations[:] # Implied `*`
relations.append(sel) sel.tag = ct.SelectorTag('*', None)
sel = _Selector() sel.relations.extend(relations)
selectors.append(sel)
del relations[:]
else:
sel.relations.extend(relations)
sel.rel_type = combinator
del relations[:]
relations.append(sel)
sel = _Selector()
has_selector = False has_selector = False
return has_selector, sel return has_selector, sel
def parse_class_id(self, sel, m, has_selector): def parse_class_id(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
"""Parse HTML classes and ids.""" """Parse HTML classes and ids."""
selector = m.group(0) selector = m.group(0)
@ -797,10 +869,17 @@ class CSSParser(object):
has_selector = True has_selector = True
return has_selector return has_selector
def parse_pseudo_contains(self, sel, m, has_selector): def parse_pseudo_contains(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
"""Parse contains.""" """Parse contains."""
values = m.group('values') pseudo = util.lower(css_unescape(m.group('name')))
if pseudo == ":contains":
warnings.warn(
"The pseudo class ':contains' is deprecated, ':-soup-contains' should be used moving forward.",
FutureWarning
)
contains_own = pseudo == ":-soup-contains-own"
values = css_unescape(m.group('values'))
patterns = [] patterns = []
for token in RE_VALUES.finditer(values): for token in RE_VALUES.finditer(values):
if token.group('split'): if token.group('split'):
@ -811,11 +890,11 @@ class CSSParser(object):
else: else:
value = css_unescape(value) value = css_unescape(value)
patterns.append(value) patterns.append(value)
sel.contains.append(ct.SelectorContains(tuple(patterns))) sel.contains.append(ct.SelectorContains(patterns, contains_own))
has_selector = True has_selector = True
return has_selector return has_selector
def parse_pseudo_lang(self, sel, m, has_selector): def parse_pseudo_lang(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
"""Parse pseudo language.""" """Parse pseudo language."""
values = m.group('values') values = m.group('values')
@ -836,7 +915,7 @@ class CSSParser(object):
return has_selector return has_selector
def parse_pseudo_dir(self, sel, m, has_selector): def parse_pseudo_dir(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
"""Parse pseudo direction.""" """Parse pseudo direction."""
value = ct.SEL_DIR_LTR if util.lower(m.group('dir')) == 'ltr' else ct.SEL_DIR_RTL value = ct.SEL_DIR_LTR if util.lower(m.group('dir')) == 'ltr' else ct.SEL_DIR_RTL
@ -844,15 +923,23 @@ class CSSParser(object):
has_selector = True has_selector = True
return has_selector return has_selector
def parse_selectors(self, iselector, index=0, flags=0): def parse_selectors(
self,
iselector: Iterator[tuple[str, Match[str]]],
index: int = 0,
flags: int = 0
) -> ct.SelectorList:
"""Parse selectors.""" """Parse selectors."""
# Initialize important variables
sel = _Selector() sel = _Selector()
selectors = [] selectors = []
has_selector = False has_selector = False
closed = False closed = False
relations = [] relations = [] # type: list[_Selector]
rel_type = ":" + WS_COMBINATOR rel_type = ":" + WS_COMBINATOR
# Setup various flags
is_open = bool(flags & FLG_OPEN) is_open = bool(flags & FLG_OPEN)
is_pseudo = bool(flags & FLG_PSEUDO) is_pseudo = bool(flags & FLG_PSEUDO)
is_relative = bool(flags & FLG_RELATIVE) is_relative = bool(flags & FLG_RELATIVE)
@ -863,7 +950,9 @@ class CSSParser(object):
is_in_range = bool(flags & FLG_IN_RANGE) is_in_range = bool(flags & FLG_IN_RANGE)
is_out_of_range = bool(flags & FLG_OUT_OF_RANGE) is_out_of_range = bool(flags & FLG_OUT_OF_RANGE)
is_placeholder_shown = bool(flags & FLG_PLACEHOLDER_SHOWN) is_placeholder_shown = bool(flags & FLG_PLACEHOLDER_SHOWN)
is_forgive = bool(flags & FLG_FORGIVE)
# Print out useful debug stuff
if self.debug: # pragma: no cover if self.debug: # pragma: no cover
if is_pseudo: if is_pseudo:
print(' is_pseudo: True') print(' is_pseudo: True')
@ -885,7 +974,10 @@ class CSSParser(object):
print(' is_out_of_range: True') print(' is_out_of_range: True')
if is_placeholder_shown: if is_placeholder_shown:
print(' is_placeholder_shown: True') print(' is_placeholder_shown: True')
if is_forgive:
print(' is_forgive: True')
# The algorithm for relative selectors require an initial selector in the selector list
if is_relative: if is_relative:
selectors.append(_Selector()) selectors.append(_Selector())
@ -914,17 +1006,19 @@ class CSSParser(object):
is_html = True is_html = True
elif key == 'pseudo_close': elif key == 'pseudo_close':
if not has_selector: if not has_selector:
raise SelectorSyntaxError( if not is_forgive:
"Expected a selector at postion {}".format(m.start(0)), raise SelectorSyntaxError(
self.pattern, "Expected a selector at position {}".format(m.start(0)),
m.start(0) self.pattern,
) m.start(0)
)
sel.no_match = True
if is_open: if is_open:
closed = True closed = True
break break
else: else:
raise SelectorSyntaxError( raise SelectorSyntaxError(
"Unmatched pseudo-class close at postion {}".format(m.start(0)), "Unmatched pseudo-class close at position {}".format(m.start(0)),
self.pattern, self.pattern,
m.start(0) m.start(0)
) )
@ -935,7 +1029,7 @@ class CSSParser(object):
) )
else: else:
has_selector, sel = self.parse_combinator( has_selector, sel = self.parse_combinator(
sel, m, has_selector, selectors, relations, is_pseudo, index sel, m, has_selector, selectors, relations, is_pseudo, is_forgive, index
) )
elif key == 'attribute': elif key == 'attribute':
has_selector = self.parse_attribute_selector(sel, m, has_selector) has_selector = self.parse_attribute_selector(sel, m, has_selector)
@ -954,6 +1048,7 @@ class CSSParser(object):
except StopIteration: except StopIteration:
pass pass
# Handle selectors that are not closed
if is_open and not closed: if is_open and not closed:
raise SelectorSyntaxError( raise SelectorSyntaxError(
"Unclosed pseudo-class at position {}".format(index), "Unclosed pseudo-class at position {}".format(index),
@ -961,6 +1056,7 @@ class CSSParser(object):
index index
) )
# Cleanup completed selector piece
if has_selector: if has_selector:
if not sel.tag and not is_pseudo: if not sel.tag and not is_pseudo:
# Implied `*` # Implied `*`
@ -972,8 +1068,28 @@ class CSSParser(object):
sel.relations.extend(relations) sel.relations.extend(relations)
del relations[:] del relations[:]
selectors.append(sel) selectors.append(sel)
else:
# Forgive empty slots in pseudo-classes that have lists (and are forgiving)
elif is_forgive:
if is_relative:
# Handle relative selectors pseudo-classes with empty slots like `:has()`
if selectors and selectors[-1].rel_type is None and rel_type == ': ':
sel.rel_type = rel_type
sel.no_match = True
selectors[-1].relations.append(sel)
has_selector = True
else:
# Handle normal pseudo-classes with empty slots
if not selectors or not relations:
# Others like `:is()` etc.
sel.no_match = True
del relations[:]
selectors.append(sel)
has_selector = True
if not has_selector:
# We will always need to finish a selector when `:has()` is used as it leads with combining. # We will always need to finish a selector when `:has()` is used as it leads with combining.
# May apply to others as well.
raise SelectorSyntaxError( raise SelectorSyntaxError(
'Expected a selector at position {}'.format(index), 'Expected a selector at position {}'.format(index),
self.pattern, self.pattern,
@ -994,9 +1110,10 @@ class CSSParser(object):
if is_placeholder_shown: if is_placeholder_shown:
selectors[-1].flags = ct.SEL_PLACEHOLDER_SHOWN selectors[-1].flags = ct.SEL_PLACEHOLDER_SHOWN
# Return selector list
return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html) return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html)
def selector_iter(self, pattern): def selector_iter(self, pattern: str) -> Iterator[tuple[str, Match[str]]]:
"""Iterate selector tokens.""" """Iterate selector tokens."""
# Ignore whitespace and comments at start and end of pattern # Ignore whitespace and comments at start and end of pattern
@ -1037,7 +1154,7 @@ class CSSParser(object):
if self.debug: # pragma: no cover if self.debug: # pragma: no cover
print('## END PARSING') print('## END PARSING')
def process_selectors(self, index=0, flags=0): def process_selectors(self, index: int = 0, flags: int = 0) -> ct.SelectorList:
"""Process selectors.""" """Process selectors."""
return self.parse_selectors(self.selector_iter(self.pattern), index, flags) return self.parse_selectors(self.selector_iter(self.pattern), index, flags)
@ -1048,7 +1165,7 @@ class CSSParser(object):
# CSS pattern for `:link` and `:any-link` # CSS pattern for `:link` and `:any-link`
CSS_LINK = CSSParser( CSS_LINK = CSSParser(
'html|*:is(a, area, link)[href]' 'html|*:is(a, area)[href]'
).process_selectors(flags=FLG_PSEUDO | FLG_HTML) ).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:checked` # CSS pattern for `:checked`
CSS_CHECKED = CSSParser( CSS_CHECKED = CSSParser(
@ -1079,23 +1196,23 @@ CSS_INDETERMINATE = CSSParser(
This pattern must be at the end. This pattern must be at the end.
Special logic is applied to the last selector. Special logic is applied to the last selector.
*/ */
html|input[type="radio"][name][name!='']:not([checked]) html|input[type="radio"][name]:not([name='']):not([checked])
''' '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE) ).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE)
# CSS pattern for `:disabled` # CSS pattern for `:disabled`
CSS_DISABLED = CSSParser( CSS_DISABLED = CSSParser(
''' '''
html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option, fieldset)[disabled], html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset)[disabled],
html|optgroup[disabled] > html|option, html|optgroup[disabled] > html|option,
html|fieldset[disabled] > html|*:is(input[type!=hidden], button, select, textarea, fieldset), html|fieldset[disabled] > html|*:is(input:not([type=hidden]), button, select, textarea, fieldset),
html|fieldset[disabled] > html|fieldset[disabled] >
html|*:not(legend:nth-of-type(1)) html|*:is(input[type!=hidden], button, select, textarea, fieldset) html|*:not(legend:nth-of-type(1)) html|*:is(input:not([type=hidden]), button, select, textarea, fieldset)
''' '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML) ).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:enabled` # CSS pattern for `:enabled`
CSS_ENABLED = CSSParser( CSS_ENABLED = CSSParser(
''' '''
html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option, fieldset):not(:disabled) html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset):not(:disabled)
''' '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML) ).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:required` # CSS pattern for `:required`
@ -1119,8 +1236,8 @@ CSS_PLACEHOLDER_SHOWN = CSSParser(
[type=email], [type=email],
[type=password], [type=password],
[type=number] [type=number]
)[placeholder][placeholder!='']:is(:not([value]), [value=""]), )[placeholder]:not([placeholder='']):is(:not([value]), [value=""]),
html|textarea[placeholder][placeholder!=''] html|textarea[placeholder]:not([placeholder=''])
''' '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_PLACEHOLDER_SHOWN) ).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_PLACEHOLDER_SHOWN)
# CSS pattern default for `:nth-child` "of S" feature # CSS pattern default for `:nth-child` "of S" feature

View file

@ -1,6 +1,8 @@
"""CSS selector structure items.""" """CSS selector structure items."""
from __future__ import annotations
import copyreg import copyreg
from collections.abc import Hashable, Mapping from .pretty import pretty
from typing import Any, Iterator, Hashable, Optional, Pattern, Iterable, Mapping
__all__ = ( __all__ = (
'Selector', 'Selector',
@ -29,12 +31,14 @@ SEL_DEFINED = 0x200
SEL_PLACEHOLDER_SHOWN = 0x400 SEL_PLACEHOLDER_SHOWN = 0x400
class Immutable(object): class Immutable:
"""Immutable.""" """Immutable."""
__slots__ = ('_hash',) __slots__: tuple[str, ...] = ('_hash',)
def __init__(self, **kwargs): _hash: int
def __init__(self, **kwargs: Any) -> None:
"""Initialize.""" """Initialize."""
temp = [] temp = []
@ -45,12 +49,12 @@ class Immutable(object):
super(Immutable, self).__setattr__('_hash', hash(tuple(temp))) super(Immutable, self).__setattr__('_hash', hash(tuple(temp)))
@classmethod @classmethod
def __base__(cls): def __base__(cls) -> "type[Immutable]":
"""Get base class.""" """Get base class."""
return cls return cls
def __eq__(self, other): def __eq__(self, other: Any) -> bool:
"""Equal.""" """Equal."""
return ( return (
@ -58,7 +62,7 @@ class Immutable(object):
all([getattr(other, key) == getattr(self, key) for key in self.__slots__ if key != '_hash']) all([getattr(other, key) == getattr(self, key) for key in self.__slots__ if key != '_hash'])
) )
def __ne__(self, other): def __ne__(self, other: Any) -> bool:
"""Equal.""" """Equal."""
return ( return (
@ -66,63 +70,74 @@ class Immutable(object):
any([getattr(other, key) != getattr(self, key) for key in self.__slots__ if key != '_hash']) any([getattr(other, key) != getattr(self, key) for key in self.__slots__ if key != '_hash'])
) )
def __hash__(self): def __hash__(self) -> int:
"""Hash.""" """Hash."""
return self._hash return self._hash
def __setattr__(self, name, value): def __setattr__(self, name: str, value: Any) -> None:
"""Prevent mutability.""" """Prevent mutability."""
raise AttributeError("'{}' is immutable".format(self.__class__.__name__)) raise AttributeError("'{}' is immutable".format(self.__class__.__name__))
def __repr__(self): # pragma: no cover def __repr__(self) -> str: # pragma: no cover
"""Representation.""" """Representation."""
return "{}({})".format( return "{}({})".format(
self.__base__(), ', '.join(["{}={!r}".format(k, getattr(self, k)) for k in self.__slots__[:-1]]) self.__class__.__name__, ', '.join(["{}={!r}".format(k, getattr(self, k)) for k in self.__slots__[:-1]])
) )
__str__ = __repr__ __str__ = __repr__
def pretty(self) -> None: # pragma: no cover
"""Pretty print."""
class ImmutableDict(Mapping): print(pretty(self))
class ImmutableDict(Mapping[Any, Any]):
"""Hashable, immutable dictionary.""" """Hashable, immutable dictionary."""
def __init__(self, *args, **kwargs): def __init__(
self,
arg: dict[Any, Any] | Iterable[tuple[Any, Any]]
) -> None:
"""Initialize.""" """Initialize."""
arg = args[0] if args else kwargs self._validate(arg)
is_dict = isinstance(arg, dict) self._d = dict(arg)
if (
is_dict and not all([isinstance(v, Hashable) for v in arg.values()]) or
not is_dict and not all([isinstance(k, Hashable) and isinstance(v, Hashable) for k, v in arg])
):
raise TypeError('All values must be hashable')
self._d = dict(*args, **kwargs)
self._hash = hash(tuple([(type(x), x, type(y), y) for x, y in sorted(self._d.items())])) self._hash = hash(tuple([(type(x), x, type(y), y) for x, y in sorted(self._d.items())]))
def __iter__(self): def _validate(self, arg: dict[Any, Any] | Iterable[tuple[Any, Any]]) -> None:
"""Validate arguments."""
if isinstance(arg, dict):
if not all([isinstance(v, Hashable) for v in arg.values()]):
raise TypeError('{} values must be hashable'.format(self.__class__.__name__))
elif not all([isinstance(k, Hashable) and isinstance(v, Hashable) for k, v in arg]):
raise TypeError('{} values must be hashable'.format(self.__class__.__name__))
def __iter__(self) -> Iterator[Any]:
"""Iterator.""" """Iterator."""
return iter(self._d) return iter(self._d)
def __len__(self): def __len__(self) -> int:
"""Length.""" """Length."""
return len(self._d) return len(self._d)
def __getitem__(self, key): def __getitem__(self, key: Any) -> Any:
"""Get item: `namespace['key']`.""" """Get item: `namespace['key']`."""
return self._d[key] return self._d[key]
def __hash__(self): def __hash__(self) -> int:
"""Hash.""" """Hash."""
return self._hash return self._hash
def __repr__(self): # pragma: no cover def __repr__(self) -> str: # pragma: no cover
"""Representation.""" """Representation."""
return "{!r}".format(self._d) return "{!r}".format(self._d)
@ -133,39 +148,37 @@ class ImmutableDict(Mapping):
class Namespaces(ImmutableDict): class Namespaces(ImmutableDict):
"""Namespaces.""" """Namespaces."""
def __init__(self, *args, **kwargs): def __init__(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
"""Initialize.""" """Initialize."""
# If there are arguments, check the first index. super().__init__(arg)
# `super` should fail if the user gave multiple arguments,
# so don't bother checking that.
arg = args[0] if args else kwargs
is_dict = isinstance(arg, dict)
if is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg.items()]):
raise TypeError('Namespace keys and values must be Unicode strings')
elif not is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg]):
raise TypeError('Namespace keys and values must be Unicode strings')
super(Namespaces, self).__init__(*args, **kwargs) def _validate(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
"""Validate arguments."""
if isinstance(arg, dict):
if not all([isinstance(v, str) for v in arg.values()]):
raise TypeError('{} values must be hashable'.format(self.__class__.__name__))
elif not all([isinstance(k, str) and isinstance(v, str) for k, v in arg]):
raise TypeError('{} keys and values must be Unicode strings'.format(self.__class__.__name__))
class CustomSelectors(ImmutableDict): class CustomSelectors(ImmutableDict):
"""Custom selectors.""" """Custom selectors."""
def __init__(self, *args, **kwargs): def __init__(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
"""Initialize.""" """Initialize."""
# If there are arguments, check the first index. super().__init__(arg)
# `super` should fail if the user gave multiple arguments,
# so don't bother checking that.
arg = args[0] if args else kwargs
is_dict = isinstance(arg, dict)
if is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg.items()]):
raise TypeError('CustomSelectors keys and values must be Unicode strings')
elif not is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg]):
raise TypeError('CustomSelectors keys and values must be Unicode strings')
super(CustomSelectors, self).__init__(*args, **kwargs) def _validate(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None:
"""Validate arguments."""
if isinstance(arg, dict):
if not all([isinstance(v, str) for v in arg.values()]):
raise TypeError('{} values must be hashable'.format(self.__class__.__name__))
elif not all([isinstance(k, str) and isinstance(v, str) for k, v in arg]):
raise TypeError('{} keys and values must be Unicode strings'.format(self.__class__.__name__))
class Selector(Immutable): class Selector(Immutable):
@ -176,13 +189,35 @@ class Selector(Immutable):
'relation', 'rel_type', 'contains', 'lang', 'flags', '_hash' 'relation', 'rel_type', 'contains', 'lang', 'flags', '_hash'
) )
tag: Optional[SelectorTag]
ids: tuple[str, ...]
classes: tuple[str, ...]
attributes: tuple[SelectorAttribute, ...]
nth: tuple[SelectorNth, ...]
selectors: tuple[SelectorList, ...]
relation: SelectorList
rel_type: Optional[str]
contains: tuple[SelectorContains, ...]
lang: tuple[SelectorLang, ...]
flags: int
def __init__( def __init__(
self, tag, ids, classes, attributes, nth, selectors, self,
relation, rel_type, contains, lang, flags tag: Optional[SelectorTag],
ids: tuple[str, ...],
classes: tuple[str, ...],
attributes: tuple[SelectorAttribute, ...],
nth: tuple[SelectorNth, ...],
selectors: tuple[SelectorList, ...],
relation: SelectorList,
rel_type: Optional[str],
contains: tuple[SelectorContains, ...],
lang: tuple[SelectorLang, ...],
flags: int
): ):
"""Initialize.""" """Initialize."""
super(Selector, self).__init__( super().__init__(
tag=tag, tag=tag,
ids=ids, ids=ids,
classes=classes, classes=classes,
@ -200,10 +235,10 @@ class Selector(Immutable):
class SelectorNull(Immutable): class SelectorNull(Immutable):
"""Null Selector.""" """Null Selector."""
def __init__(self): def __init__(self) -> None:
"""Initialize.""" """Initialize."""
super(SelectorNull, self).__init__() super().__init__()
class SelectorTag(Immutable): class SelectorTag(Immutable):
@ -211,13 +246,13 @@ class SelectorTag(Immutable):
__slots__ = ("name", "prefix", "_hash") __slots__ = ("name", "prefix", "_hash")
def __init__(self, name, prefix): name: str
prefix: Optional[str]
def __init__(self, name: str, prefix: Optional[str]) -> None:
"""Initialize.""" """Initialize."""
super(SelectorTag, self).__init__( super().__init__(name=name, prefix=prefix)
name=name,
prefix=prefix
)
class SelectorAttribute(Immutable): class SelectorAttribute(Immutable):
@ -225,10 +260,21 @@ class SelectorAttribute(Immutable):
__slots__ = ("attribute", "prefix", "pattern", "xml_type_pattern", "_hash") __slots__ = ("attribute", "prefix", "pattern", "xml_type_pattern", "_hash")
def __init__(self, attribute, prefix, pattern, xml_type_pattern): attribute: str
prefix: str
pattern: Optional[Pattern[str]]
xml_type_pattern: Optional[Pattern[str]]
def __init__(
self,
attribute: str,
prefix: str,
pattern: Optional[Pattern[str]],
xml_type_pattern: Optional[Pattern[str]]
) -> None:
"""Initialize.""" """Initialize."""
super(SelectorAttribute, self).__init__( super().__init__(
attribute=attribute, attribute=attribute,
prefix=prefix, prefix=prefix,
pattern=pattern, pattern=pattern,
@ -239,14 +285,15 @@ class SelectorAttribute(Immutable):
class SelectorContains(Immutable): class SelectorContains(Immutable):
"""Selector contains rule.""" """Selector contains rule."""
__slots__ = ("text", "_hash") __slots__ = ("text", "own", "_hash")
def __init__(self, text): text: tuple[str, ...]
own: bool
def __init__(self, text: Iterable[str], own: bool) -> None:
"""Initialize.""" """Initialize."""
super(SelectorContains, self).__init__( super().__init__(text=tuple(text), own=own)
text=text
)
class SelectorNth(Immutable): class SelectorNth(Immutable):
@ -254,10 +301,17 @@ class SelectorNth(Immutable):
__slots__ = ("a", "n", "b", "of_type", "last", "selectors", "_hash") __slots__ = ("a", "n", "b", "of_type", "last", "selectors", "_hash")
def __init__(self, a, n, b, of_type, last, selectors): a: int
n: bool
b: int
of_type: bool
last: bool
selectors: SelectorList
def __init__(self, a: int, n: bool, b: int, of_type: bool, last: bool, selectors: SelectorList) -> None:
"""Initialize.""" """Initialize."""
super(SelectorNth, self).__init__( super().__init__(
a=a, a=a,
n=n, n=n,
b=b, b=b,
@ -272,24 +326,24 @@ class SelectorLang(Immutable):
__slots__ = ("languages", "_hash",) __slots__ = ("languages", "_hash",)
def __init__(self, languages): languages: tuple[str, ...]
def __init__(self, languages: Iterable[str]):
"""Initialize.""" """Initialize."""
super(SelectorLang, self).__init__( super().__init__(languages=tuple(languages))
languages=tuple(languages)
)
def __iter__(self): def __iter__(self) -> Iterator[str]:
"""Iterator.""" """Iterator."""
return iter(self.languages) return iter(self.languages)
def __len__(self): # pragma: no cover def __len__(self) -> int: # pragma: no cover
"""Length.""" """Length."""
return len(self.languages) return len(self.languages)
def __getitem__(self, index): # pragma: no cover def __getitem__(self, index: int) -> str: # pragma: no cover
"""Get item.""" """Get item."""
return self.languages[index] return self.languages[index]
@ -300,36 +354,45 @@ class SelectorList(Immutable):
__slots__ = ("selectors", "is_not", "is_html", "_hash") __slots__ = ("selectors", "is_not", "is_html", "_hash")
def __init__(self, selectors=tuple(), is_not=False, is_html=False): selectors: tuple[Selector | SelectorNull, ...]
is_not: bool
is_html: bool
def __init__(
self,
selectors: Optional[Iterable[Selector | SelectorNull]] = None,
is_not: bool = False,
is_html: bool = False
) -> None:
"""Initialize.""" """Initialize."""
super(SelectorList, self).__init__( super().__init__(
selectors=tuple(selectors), selectors=tuple(selectors) if selectors is not None else tuple(),
is_not=is_not, is_not=is_not,
is_html=is_html is_html=is_html
) )
def __iter__(self): def __iter__(self) -> Iterator[Selector | SelectorNull]:
"""Iterator.""" """Iterator."""
return iter(self.selectors) return iter(self.selectors)
def __len__(self): def __len__(self) -> int:
"""Length.""" """Length."""
return len(self.selectors) return len(self.selectors)
def __getitem__(self, index): def __getitem__(self, index: int) -> Selector | SelectorNull:
"""Get item.""" """Get item."""
return self.selectors[index] return self.selectors[index]
def _pickle(p): def _pickle(p: Any) -> Any:
return p.__base__(), tuple([getattr(p, s) for s in p.__slots__[:-1]]) return p.__base__(), tuple([getattr(p, s) for s in p.__slots__[:-1]])
def pickle_register(obj): def pickle_register(obj: Any) -> None:
"""Allow object to be pickled.""" """Allow object to be pickled."""
copyreg.pickle(obj, _pickle) copyreg.pickle(obj, _pickle)

138
lib/soupsieve/pretty.py Normal file
View file

@ -0,0 +1,138 @@
"""
Format a pretty string of a `SoupSieve` object for easy debugging.
This won't necessarily support all types and such, and it definitely
won't support custom outputs.
It is mainly geared towards our types, as the `SelectorList`
object is a beast to look at without some indentation and newlines.
The format and the various output types are fairly well known (though this
hasn't been tested extensively to make sure we aren't missing corner cases).
Example:
```
>>> import soupsieve as sv
>>> sv.compile('this > that.class[name=value]').selectors.pretty()
SelectorList(
selectors=(
Selector(
tag=SelectorTag(
name='that',
prefix=None),
ids=(),
classes=(
'class',
),
attributes=(
SelectorAttribute(
attribute='name',
prefix='',
pattern=re.compile(
'^value$'),
xml_type_pattern=None),
),
nth=(),
selectors=(),
relation=SelectorList(
selectors=(
Selector(
tag=SelectorTag(
name='this',
prefix=None),
ids=(),
classes=(),
attributes=(),
nth=(),
selectors=(),
relation=SelectorList(
selectors=(),
is_not=False,
is_html=False),
rel_type='>',
contains=(),
lang=(),
flags=0),
),
is_not=False,
is_html=False),
rel_type=None,
contains=(),
lang=(),
flags=0),
),
is_not=False,
is_html=False)
```
"""
from __future__ import annotations
import re
from typing import Any
# Token patterns used by `pretty`. Each one is anchored at the current scan
# position via `Pattern.match(string, pos)`.
RE_CLASS = re.compile(r'(?i)[a-z_][_a-z\d\.]+\(')
RE_PARAM = re.compile(r'(?i)[_a-z][_a-z\d]+=')
RE_EMPTY = re.compile(r'\(\)|\[\]|\{\}')
RE_LSTRT = re.compile(r'\[')
RE_DSTRT = re.compile(r'\{')
RE_TSTRT = re.compile(r'\(')
RE_LEND = re.compile(r'\]')
RE_DEND = re.compile(r'\}')
RE_TEND = re.compile(r'\)')
RE_INT = re.compile(r'\d+')
RE_KWORD = re.compile(r'(?i)[_a-z][_a-z\d]+')
RE_DQSTR = re.compile(r'"(?:\\.|[^"\\])*"')
RE_SQSTR = re.compile(r"'(?:\\.|[^'\\])*'")
RE_SEP = re.compile(r'\s*(,)\s*')
RE_DSEP = re.compile(r'\s*(:)\s*')

# Patterns are tried in this dictionary's insertion order and the first match
# wins, so the order is significant (e.g. `class` and `param` must be tried
# before the shorter `kword`, and `empty` before the individual brackets).
TOKENS = {
    'class': RE_CLASS,
    'param': RE_PARAM,
    'empty': RE_EMPTY,
    'lstrt': RE_LSTRT,
    'dstrt': RE_DSTRT,
    'tstrt': RE_TSTRT,
    'lend': RE_LEND,
    'dend': RE_DEND,
    'tend': RE_TEND,
    'sqstr': RE_SQSTR,
    'sep': RE_SEP,
    'dsep': RE_DSEP,
    'int': RE_INT,
    'kword': RE_KWORD,
    'dqstr': RE_DQSTR
}


def pretty(obj: Any) -> str:  # pragma: no cover
    """
    Make the object output string pretty.

    The object is converted with `str()` and the resulting text is scanned
    left to right with the patterns in `TOKENS`:

    - opening tokens (`Name(`, `[`, `{`, `(`) are followed by a newline and
      the indentation is increased by four spaces,
    - closing tokens (`]`, `}`, `)`) decrease the indentation by four spaces,
    - `,` separators (and the whitespace around them) become `,` plus a
      newline at the current indentation,
    - `:` separators (and the whitespace around them) become `: `,
    - parameters, integers, keywords, strings and empty brackets are copied
      as they are.

    Any character that no token pattern recognizes is copied to the output
    unchanged, so the scan always advances and terminates.

    Returns the reformatted string.
    """

    sel = str(obj)
    length = len(sel)
    index = 0
    indent = 0
    output = []

    while index < length:
        for name, pattern in TOKENS.items():
            m = pattern.match(sel, index)
            if not m:
                continue

            index = m.end(0)
            if name in ('class', 'lstrt', 'dstrt', 'tstrt'):
                indent += 4
                output.append('{}\n{}'.format(m.group(0), " " * indent))
            elif name in ('param', 'int', 'kword', 'sqstr', 'dqstr', 'empty'):
                output.append(m.group(0))
            elif name in ('lend', 'dend', 'tend'):
                indent -= 4
                output.append(m.group(0))
            elif name in ('sep',):
                # Group 1 is the bare comma; the surrounding whitespace is dropped.
                output.append('{}\n{}'.format(m.group(1), " " * indent))
            elif name in ('dsep',):
                # Group 1 is the bare colon; the surrounding whitespace is dropped.
                output.append('{} '.format(m.group(1)))
            break
        else:
            # No pattern matched here (e.g. a one-letter name such as `a`, a
            # `-` sign or a `.`). Without this fallback `index` would never
            # move and the loop would spin forever, so emit the character
            # verbatim and step past it.
            output.append(sel[index])
            index += 1

    return ''.join(output)

View file

@ -1,7 +1,9 @@
"""Utility.""" """Utility."""
from __future__ import annotations
from functools import wraps, lru_cache from functools import wraps, lru_cache
import warnings import warnings
import re import re
from typing import Callable, Any, Optional
DEBUG = 0x00001 DEBUG = 0x00001
@ -12,7 +14,7 @@ UC_Z = ord('Z')
@lru_cache(maxsize=512) @lru_cache(maxsize=512)
def lower(string): def lower(string: str) -> str:
"""Lower.""" """Lower."""
new_string = [] new_string = []
@ -25,7 +27,7 @@ def lower(string):
class SelectorSyntaxError(Exception): class SelectorSyntaxError(Exception):
"""Syntax error in a CSS selector.""" """Syntax error in a CSS selector."""
def __init__(self, msg, pattern=None, index=None): def __init__(self, msg: str, pattern: Optional[str] = None, index: Optional[int] = None) -> None:
"""Initialize.""" """Initialize."""
self.line = None self.line = None
@ -37,30 +39,34 @@ class SelectorSyntaxError(Exception):
self.context, self.line, self.col = get_pattern_context(pattern, index) self.context, self.line, self.col = get_pattern_context(pattern, index)
msg = '{}\n line {}:\n{}'.format(msg, self.line, self.context) msg = '{}\n line {}:\n{}'.format(msg, self.line, self.context)
super(SelectorSyntaxError, self).__init__(msg) super().__init__(msg)
def deprecated(message, stacklevel=2): # pragma: no cover def deprecated(message: str, stacklevel: int = 2) -> Callable[..., Any]: # pragma: no cover
""" """
Raise a `DeprecationWarning` when wrapped function/method is called. Raise a `DeprecationWarning` when wrapped function/method is called.
Borrowed from https://stackoverflow.com/a/48632082/866026 Usage:
@deprecated("This method will be removed in version X; use Y instead.")
def some_method()"
pass
""" """
def _decorator(func): def _wrapper(func: Callable[..., Any]) -> Callable[..., Any]:
@wraps(func) @wraps(func)
def _func(*args, **kwargs): def _deprecated_func(*args: Any, **kwargs: Any) -> Any:
warnings.warn( warnings.warn(
"'{}' is deprecated. {}".format(func.__name__, message), f"'{func.__name__}' is deprecated. {message}",
category=DeprecationWarning, category=DeprecationWarning,
stacklevel=stacklevel stacklevel=stacklevel
) )
return func(*args, **kwargs) return func(*args, **kwargs)
return _func return _deprecated_func
return _decorator return _wrapper
def warn_deprecated(message, stacklevel=2): # pragma: no cover def warn_deprecated(message: str, stacklevel: int = 2) -> None: # pragma: no cover
"""Warn deprecated.""" """Warn deprecated."""
warnings.warn( warnings.warn(
@ -70,14 +76,15 @@ def warn_deprecated(message, stacklevel=2): # pragma: no cover
) )
def get_pattern_context(pattern, index): def get_pattern_context(pattern: str, index: int) -> tuple[str, int, int]:
"""Get the pattern context.""" """Get the pattern context."""
last = 0 last = 0
current_line = 1 current_line = 1
col = 1 col = 1
text = [] text = [] # type: list[str]
line = 1 line = 1
offset = None # type: Optional[int]
# Split pattern by newline and handle the text before the newline # Split pattern by newline and handle the text before the newline
for m in RE_PATTERN_LINE_SPLIT.finditer(pattern): for m in RE_PATTERN_LINE_SPLIT.finditer(pattern):