"""CSS matcher.""" from __future__ import annotations from datetime import datetime from . import util import re from . import css_types as ct import unicodedata import bs4 # type: ignore[import] from typing import Iterator, Iterable, Any, Optional, Callable, Sequence, cast # noqa: F401 # Empty tag pattern (whitespace okay) RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]') RE_NOT_WS = re.compile('[^ \t\r\n\f]+') # Relationships REL_PARENT = ' ' REL_CLOSE_PARENT = '>' REL_SIBLING = '~' REL_CLOSE_SIBLING = '+' # Relationships for :has() (forward looking) REL_HAS_PARENT = ': ' REL_HAS_CLOSE_PARENT = ':>' REL_HAS_SIBLING = ':~' REL_HAS_CLOSE_SIBLING = ':+' NS_XHTML = 'http://www.w3.org/1999/xhtml' NS_XML = 'http://www.w3.org/XML/1998/namespace' DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE DIR_MAP = { 'ltr': ct.SEL_DIR_LTR, 'rtl': ct.SEL_DIR_RTL, 'auto': 0 } RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$") RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$') RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$') RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$') RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$') RE_DATETIME = re.compile( r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$' ) RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)') MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November FEB = 2 SHORT_MONTH = 30 LONG_MONTH = 31 FEB_MONTH = 28 FEB_LEAP_MONTH = 29 DAYS_IN_WEEK = 7 class _FakeParent: """ Fake parent class. When we have a fragment with no `BeautifulSoup` document object, we can't evaluate `nth` selectors properly. Create a temporary fake parent so we can traverse the root element as a child. """ def __init__(self, element: bs4.Tag) -> None: """Initialize.""" self.contents = [element] def __len__(self) -> bs4.PageElement: """Length.""" return len(self.contents) class _DocumentNav: """Navigate a Beautiful Soup document.""" @classmethod def assert_valid_input(cls, tag: Any) -> None: """Check if valid input tag or document.""" # Fail on unexpected types. if not cls.is_tag(tag): raise TypeError("Expected a BeautifulSoup 'Tag', but instead received type {}".format(type(tag))) @staticmethod def is_doc(obj: bs4.Tag) -> bool: """Is `BeautifulSoup` object.""" return isinstance(obj, bs4.BeautifulSoup) @staticmethod def is_tag(obj: bs4.PageElement) -> bool: """Is tag.""" return isinstance(obj, bs4.Tag) @staticmethod def is_declaration(obj: bs4.PageElement) -> bool: # pragma: no cover """Is declaration.""" return isinstance(obj, bs4.Declaration) @staticmethod def is_cdata(obj: bs4.PageElement) -> bool: """Is CDATA.""" return isinstance(obj, bs4.CData) @staticmethod def is_processing_instruction(obj: bs4.PageElement) -> bool: # pragma: no cover """Is processing instruction.""" return isinstance(obj, bs4.ProcessingInstruction) @staticmethod def is_navigable_string(obj: bs4.PageElement) -> bool: """Is navigable string.""" return isinstance(obj, bs4.NavigableString) @staticmethod def is_special_string(obj: bs4.PageElement) -> bool: """Is special string.""" return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype)) @classmethod def is_content_string(cls, obj: bs4.PageElement) -> bool: """Check if node is content string.""" return cls.is_navigable_string(obj) and not cls.is_special_string(obj) @staticmethod def create_fake_parent(el: bs4.Tag) -> _FakeParent: """Create fake parent for a given element.""" return _FakeParent(el) @staticmethod def is_xml_tree(el: bs4.Tag) -> bool: """Check if element (or document) is from a XML tree.""" return bool(el._is_xml) def is_iframe(self, el: bs4.Tag) -> bool: """Check if element is an `iframe`.""" return bool( ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and self.is_html_tag(el) # type: ignore[attr-defined] ) def is_root(self, el: bs4.Tag) -> bool: """ Return whether element is a root element. We check that the element is the root of the tree (which we have already pre-calculated), and we check if it is the root element under an `iframe`. """ root = self.root and self.root is el # type: ignore[attr-defined] if not root: parent = self.get_parent(el) root = parent is not None and self.is_html and self.is_iframe(parent) # type: ignore[attr-defined] return root def get_contents(self, el: bs4.Tag, no_iframe: bool = False) -> Iterator[bs4.PageElement]: """Get contents or contents in reverse.""" if not no_iframe or not self.is_iframe(el): for content in el.contents: yield content def get_children( self, el: bs4.Tag, start: Optional[int] = None, reverse: bool = False, tags: bool = True, no_iframe: bool = False ) -> Iterator[bs4.PageElement]: """Get children.""" if not no_iframe or not self.is_iframe(el): last = len(el.contents) - 1 if start is None: index = last if reverse else 0 else: index = start end = -1 if reverse else last + 1 incr = -1 if reverse else 1 if 0 <= index <= last: while index != end: node = el.contents[index] index += incr if not tags or self.is_tag(node): yield node def get_descendants( self, el: bs4.Tag, tags: bool = True, no_iframe: bool = False ) -> Iterator[bs4.PageElement]: """Get descendants.""" if not no_iframe or not self.is_iframe(el): next_good = None for child in el.descendants: if next_good is not None: if child is not next_good: continue next_good = None is_tag = self.is_tag(child) if no_iframe and is_tag and self.is_iframe(child): if child.next_sibling is not None: next_good = child.next_sibling else: last_child = child while self.is_tag(last_child) and last_child.contents: last_child = last_child.contents[-1] next_good = last_child.next_element yield child if next_good is None: break # Coverage isn't seeing this even though it's executed continue # pragma: no cover if not tags or is_tag: yield child def get_parent(self, el: bs4.Tag, no_iframe: bool = False) -> bs4.Tag: """Get parent.""" parent = el.parent if no_iframe and parent is not None and self.is_iframe(parent): parent = None return parent @staticmethod def get_tag_name(el: bs4.Tag) -> Optional[str]: """Get tag.""" return cast(Optional[str], el.name) @staticmethod def get_prefix_name(el: bs4.Tag) -> Optional[str]: """Get prefix.""" return cast(Optional[str], el.prefix) @staticmethod def get_uri(el: bs4.Tag) -> Optional[str]: """Get namespace `URI`.""" return cast(Optional[str], el.namespace) @classmethod def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement: """Get next sibling tag.""" sibling = el.next_sibling while tags and not cls.is_tag(sibling) and sibling is not None: sibling = sibling.next_sibling return sibling @classmethod def get_previous(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement: """Get previous sibling tag.""" sibling = el.previous_sibling while tags and not cls.is_tag(sibling) and sibling is not None: sibling = sibling.previous_sibling return sibling @staticmethod def has_html_ns(el: bs4.Tag) -> bool: """ Check if element has an HTML namespace. This is a bit different than whether a element is treated as having an HTML namespace, like we do in the case of `is_html_tag`. """ ns = getattr(el, 'namespace') if el else None return bool(ns and ns == NS_XHTML) @staticmethod def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[Optional[str], Optional[str]]: """Return namespace and attribute name without the prefix.""" return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None) @classmethod def normalize_value(cls, value: Any) -> str | Sequence[str]: """Normalize the value to be a string or list of strings.""" # Treat `None` as empty string. if value is None: return '' # Pass through strings if (isinstance(value, str)): return value # If it's a byte string, convert it to Unicode, treating it as UTF-8. if isinstance(value, bytes): return value.decode("utf8") # BeautifulSoup supports sequences of attribute values, so make sure the children are strings. if isinstance(value, Sequence): new_value = [] for v in value: if not isinstance(v, (str, bytes)) and isinstance(v, Sequence): # This is most certainly a user error and will crash and burn later. # To keep things working, we'll do what we do with all objects, # And convert them to strings. new_value.append(str(v)) else: # Convert the child to a string new_value.append(cast(str, cls.normalize_value(v))) return new_value # Try and make anything else a string return str(value) @classmethod def get_attribute_by_name( cls, el: bs4.Tag, name: str, default: Optional[str | Sequence[str]] = None ) -> Optional[str | Sequence[str]]: """Get attribute by name.""" value = default if el._is_xml: try: value = cls.normalize_value(el.attrs[name]) except KeyError: pass else: for k, v in el.attrs.items(): if util.lower(k) == name: value = cls.normalize_value(v) break return value @classmethod def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, Optional[str | Sequence[str]]]]: """Iterate attributes.""" for k, v in el.attrs.items(): yield k, cls.normalize_value(v) @classmethod def get_classes(cls, el: bs4.Tag) -> Sequence[str]: """Get classes.""" classes = cls.get_attribute_by_name(el, 'class', []) if isinstance(classes, str): classes = RE_NOT_WS.findall(classes) return cast(Sequence[str], classes) def get_text(self, el: bs4.Tag, no_iframe: bool = False) -> str: """Get text.""" return ''.join( [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)] ) def get_own_text(self, el: bs4.Tag, no_iframe: bool = False) -> list[str]: """Get Own Text.""" return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)] class Inputs: """Class for parsing and validating input items.""" @staticmethod def validate_day(year: int, month: int, day: int) -> bool: """Validate day.""" max_days = LONG_MONTH if month == FEB: max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH elif month in MONTHS_30: max_days = SHORT_MONTH return 1 <= day <= max_days @staticmethod def validate_week(year: int, week: int) -> bool: """Validate week.""" max_week = datetime.strptime("{}-{}-{}".format(12, 31, year), "%m-%d-%Y").isocalendar()[1] if max_week == 1: max_week = 53 return 1 <= week <= max_week @staticmethod def validate_month(month: int) -> bool: """Validate month.""" return 1 <= month <= 12 @staticmethod def validate_year(year: int) -> bool: """Validate year.""" return 1 <= year @staticmethod def validate_hour(hour: int) -> bool: """Validate hour.""" return 0 <= hour <= 23 @staticmethod def validate_minutes(minutes: int) -> bool: """Validate minutes.""" return 0 <= minutes <= 59 @classmethod def parse_value(cls, itype: str, value: Optional[str]) -> Optional[tuple[float, ...]]: """Parse the input value.""" parsed = None # type: Optional[tuple[float, ...]] if value is None: return value if itype == "date": m = RE_DATE.match(value) if m: year = int(m.group('year'), 10) month = int(m.group('month'), 10) day = int(m.group('day'), 10) if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day): parsed = (year, month, day) elif itype == "month": m = RE_MONTH.match(value) if m: year = int(m.group('year'), 10) month = int(m.group('month'), 10) if cls.validate_year(year) and cls.validate_month(month): parsed = (year, month) elif itype == "week": m = RE_WEEK.match(value) if m: year = int(m.group('year'), 10) week = int(m.group('week'), 10) if cls.validate_year(year) and cls.validate_week(year, week): parsed = (year, week) elif itype == "time": m = RE_TIME.match(value) if m: hour = int(m.group('hour'), 10) minutes = int(m.group('minutes'), 10) if cls.validate_hour(hour) and cls.validate_minutes(minutes): parsed = (hour, minutes) elif itype == "datetime-local": m = RE_DATETIME.match(value) if m: year = int(m.group('year'), 10) month = int(m.group('month'), 10) day = int(m.group('day'), 10) hour = int(m.group('hour'), 10) minutes = int(m.group('minutes'), 10) if ( cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and cls.validate_hour(hour) and cls.validate_minutes(minutes) ): parsed = (year, month, day, hour, minutes) elif itype in ("number", "range"): m = RE_NUM.match(value) if m: parsed = (float(m.group('value')),) return parsed class CSSMatch(_DocumentNav): """Perform CSS matching.""" def __init__( self, selectors: ct.SelectorList, scope: bs4.Tag, namespaces: Optional[ct.Namespaces], flags: int ) -> None: """Initialize.""" self.assert_valid_input(scope) self.tag = scope self.cached_meta_lang = [] # type: list[tuple[str, str]] self.cached_default_forms = [] # type: list[tuple[bs4.Tag, bs4.Tag]] self.cached_indeterminate_forms = [] # type: list[tuple[bs4.Tag, str, bool]] self.selectors = selectors self.namespaces = {} if namespaces is None else namespaces # type: ct.Namespaces | dict[str, str] self.flags = flags self.iframe_restrict = False # Find the root element for the whole tree doc = scope parent = self.get_parent(doc) while parent: doc = parent parent = self.get_parent(doc) root = None if not self.is_doc(doc): root = doc else: for child in self.get_children(doc): root = child break self.root = root self.scope = scope if scope is not doc else root self.has_html_namespace = self.has_html_ns(root) # A document can be both XML and HTML (XHTML) self.is_xml = self.is_xml_tree(doc) self.is_html = not self.is_xml or self.has_html_namespace def supports_namespaces(self) -> bool: """Check if namespaces are supported in the HTML type.""" return self.is_xml or self.has_html_namespace def get_tag_ns(self, el: bs4.Tag) -> str: """Get tag namespace.""" if self.supports_namespaces(): namespace = '' ns = self.get_uri(el) if ns: namespace = ns else: namespace = NS_XHTML return namespace def is_html_tag(self, el: bs4.Tag) -> bool: """Check if tag is in HTML namespace.""" return self.get_tag_ns(el) == NS_XHTML def get_tag(self, el: bs4.Tag) -> Optional[str]: """Get tag.""" name = self.get_tag_name(el) return util.lower(name) if name is not None and not self.is_xml else name def get_prefix(self, el: bs4.Tag) -> Optional[str]: """Get prefix.""" prefix = self.get_prefix_name(el) return util.lower(prefix) if prefix is not None and not self.is_xml else prefix def find_bidi(self, el: bs4.Tag) -> Optional[int]: """Get directionality from element text.""" for node in self.get_children(el, tags=False): # Analyze child text nodes if self.is_tag(node): # Avoid analyzing certain elements specified in the specification. direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None) if ( self.get_tag(node) in ('bdi', 'script', 'style', 'textarea', 'iframe') or not self.is_html_tag(node) or direction is not None ): continue # pragma: no cover # Check directionality of this node's text value = self.find_bidi(node) if value is not None: return value # Direction could not be determined continue # pragma: no cover # Skip `doctype` comments, etc. if self.is_special_string(node): continue # Analyze text nodes for directionality. for c in node: bidi = unicodedata.bidirectional(c) if bidi in ('AL', 'R', 'L'): return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL return None def extended_language_filter(self, lang_range: str, lang_tag: str) -> bool: """Filter the language tags.""" match = True lang_range = RE_WILD_STRIP.sub('-', lang_range).lower() ranges = lang_range.split('-') subtags = lang_tag.lower().split('-') length = len(ranges) slength = len(subtags) rindex = 0 sindex = 0 r = ranges[rindex] s = subtags[sindex] # Empty specified language should match unspecified language attributes if length == 1 and slength == 1 and not r and r == s: return True # Primary tag needs to match if (r != '*' and r != s) or (r == '*' and slength == 1 and not s): match = False rindex += 1 sindex += 1 # Match until we run out of ranges while match and rindex < length: r = ranges[rindex] try: s = subtags[sindex] except IndexError: # Ran out of subtags, # but we still have ranges match = False continue # Empty range if not r: match = False continue # Matched range elif s == r: rindex += 1 # Implicit wildcard cannot match # singletons elif len(s) == 1: match = False continue # Implicitly matched, so grab next subtag sindex += 1 return match def match_attribute_name( self, el: bs4.Tag, attr: str, prefix: Optional[str] ) -> Optional[str | Sequence[str]]: """Match attribute name and return value if it exists.""" value = None if self.supports_namespaces(): value = None # If we have not defined namespaces, we can't very well find them, so don't bother trying. if prefix: ns = self.namespaces.get(prefix) if ns is None and prefix != '*': return None else: ns = None for k, v in self.iter_attributes(el): # Get attribute parts namespace, name = self.split_namespace(el, k) # Can't match a prefix attribute as we haven't specified one to match # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`. if ns is None: if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)): value = v break # Coverage is not finding this even though it is executed. # Adding a print statement before this (and erasing coverage) causes coverage to find the line. # Ignore the false positive message. continue # pragma: no cover # We can't match our desired prefix attribute as the attribute doesn't have a prefix if namespace is None or ns != namespace and prefix != '*': continue # The attribute doesn't match. if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name): continue value = v break else: for k, v in self.iter_attributes(el): if util.lower(attr) != util.lower(k): continue value = v break return value def match_namespace(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool: """Match the namespace of the element.""" match = True namespace = self.get_tag_ns(el) default_namespace = self.namespaces.get('') tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix) # We must match the default namespace if one is not provided if tag.prefix is None and (default_namespace is not None and namespace != default_namespace): match = False # If we specified `|tag`, we must not have a namespace. elif (tag.prefix is not None and tag.prefix == '' and namespace): match = False # Verify prefix matches elif ( tag.prefix and tag.prefix != '*' and (tag_ns is None or namespace != tag_ns) ): match = False return match def match_attributes(self, el: bs4.Tag, attributes: tuple[ct.SelectorAttribute, ...]) -> bool: """Match attributes.""" match = True if attributes: for a in attributes: temp = self.match_attribute_name(el, a.attribute, a.prefix) pattern = a.xml_type_pattern if self.is_xml and a.xml_type_pattern else a.pattern if temp is None: match = False break value = temp if isinstance(temp, str) else ' '.join(temp) if pattern is None: continue elif pattern.match(value) is None: match = False break return match def match_tagname(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool: """Match tag name.""" name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name) return not ( name is not None and name not in (self.get_tag(el), '*') ) def match_tag(self, el: bs4.Tag, tag: Optional[ct.SelectorTag]) -> bool: """Match the tag.""" match = True if tag is not None: # Verify namespace if not self.match_namespace(el, tag): match = False if not self.match_tagname(el, tag): match = False return match def match_past_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool: """Match past relationship.""" found = False # I don't think this can ever happen, but it makes `mypy` happy if isinstance(relation[0], ct.SelectorNull): # pragma: no cover return found if relation[0].rel_type == REL_PARENT: parent = self.get_parent(el, no_iframe=self.iframe_restrict) while not found and parent: found = self.match_selectors(parent, relation) parent = self.get_parent(parent, no_iframe=self.iframe_restrict) elif relation[0].rel_type == REL_CLOSE_PARENT: parent = self.get_parent(el, no_iframe=self.iframe_restrict) if parent: found = self.match_selectors(parent, relation) elif relation[0].rel_type == REL_SIBLING: sibling = self.get_previous(el) while not found and sibling: found = self.match_selectors(sibling, relation) sibling = self.get_previous(sibling) elif relation[0].rel_type == REL_CLOSE_SIBLING: sibling = self.get_previous(el) if sibling and self.is_tag(sibling): found = self.match_selectors(sibling, relation) return found def match_future_child(self, parent: bs4.Tag, relation: ct.SelectorList, recursive: bool = False) -> bool: """Match future child.""" match = False if recursive: children = self.get_descendants # type: Callable[..., Iterator[bs4.Tag]] else: children = self.get_children for child in children(parent, no_iframe=self.iframe_restrict): match = self.match_selectors(child, relation) if match: break return match def match_future_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool: """Match future relationship.""" found = False # I don't think this can ever happen, but it makes `mypy` happy if isinstance(relation[0], ct.SelectorNull): # pragma: no cover return found if relation[0].rel_type == REL_HAS_PARENT: found = self.match_future_child(el, relation, True) elif relation[0].rel_type == REL_HAS_CLOSE_PARENT: found = self.match_future_child(el, relation) elif relation[0].rel_type == REL_HAS_SIBLING: sibling = self.get_next(el) while not found and sibling: found = self.match_selectors(sibling, relation) sibling = self.get_next(sibling) elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING: sibling = self.get_next(el) if sibling and self.is_tag(sibling): found = self.match_selectors(sibling, relation) return found def match_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool: """Match relationship to other elements.""" found = False if isinstance(relation[0], ct.SelectorNull) or relation[0].rel_type is None: return found if relation[0].rel_type.startswith(':'): found = self.match_future_relations(el, relation) else: found = self.match_past_relations(el, relation) return found def match_id(self, el: bs4.Tag, ids: tuple[str, ...]) -> bool: """Match element's ID.""" found = True for i in ids: if i != self.get_attribute_by_name(el, 'id', ''): found = False break return found def match_classes(self, el: bs4.Tag, classes: tuple[str, ...]) -> bool: """Match element's classes.""" current_classes = self.get_classes(el) found = True for c in classes: if c not in current_classes: found = False break return found def match_root(self, el: bs4.Tag) -> bool: """Match element as root.""" is_root = self.is_root(el) if is_root: sibling = self.get_previous(el, tags=False) while is_root and sibling is not None: if ( self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or self.is_cdata(sibling) ): is_root = False else: sibling = self.get_previous(sibling, tags=False) if is_root: sibling = self.get_next(el, tags=False) while is_root and sibling is not None: if ( self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or self.is_cdata(sibling) ): is_root = False else: sibling = self.get_next(sibling, tags=False) return is_root def match_scope(self, el: bs4.Tag) -> bool: """Match element as scope.""" return self.scope is el def match_nth_tag_type(self, el: bs4.Tag, child: bs4.Tag) -> bool: """Match tag type for `nth` matches.""" return ( (self.get_tag(child) == self.get_tag(el)) and (self.get_tag_ns(child) == self.get_tag_ns(el)) ) def match_nth(self, el: bs4.Tag, nth: bs4.Tag) -> bool: """Match `nth` elements.""" matched = True for n in nth: matched = False if n.selectors and not self.match_selectors(el, n.selectors): break parent = self.get_parent(el) if parent is None: parent = self.create_fake_parent(el) last = n.last last_index = len(parent) - 1 index = last_index if last else 0 relative_index = 0 a = n.a b = n.b var = n.n count = 0 count_incr = 1 factor = -1 if last else 1 idx = last_idx = a * count + b if var else a # We can only adjust bounds within a variable index if var: # Abort if our nth index is out of bounds and only getting further out of bounds as we increment. # Otherwise, increment to try to get in bounds. adjust = None while idx < 1 or idx > last_index: if idx < 0: diff_low = 0 - idx if adjust is not None and adjust == 1: break adjust = -1 count += count_incr idx = last_idx = a * count + b if var else a diff = 0 - idx if diff >= diff_low: break else: diff_high = idx - last_index if adjust is not None and adjust == -1: break adjust = 1 count += count_incr idx = last_idx = a * count + b if var else a diff = idx - last_index if diff >= diff_high: break diff_high = diff # If a < 0, our count is working backwards, so floor the index by increasing the count. # Find the count that yields the lowest, in bound value and use that. # Lastly reverse count increment so that we'll increase our index. lowest = count if a < 0: while idx >= 1: lowest = count count += count_incr idx = last_idx = a * count + b if var else a count_incr = -1 count = lowest idx = last_idx = a * count + b if var else a # Evaluate elements while our calculated nth index is still in range while 1 <= idx <= last_index + 1: child = None # Evaluate while our child index is still in range. for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False): index += factor if not self.is_tag(child): continue # Handle `of S` in `nth-child` if n.selectors and not self.match_selectors(child, n.selectors): continue # Handle `of-type` if n.of_type and not self.match_nth_tag_type(el, child): continue relative_index += 1 if relative_index == idx: if child is el: matched = True else: break if child is el: break if child is el: break last_idx = idx count += count_incr if count < 0: # Count is counting down and has now ventured into invalid territory. break idx = a * count + b if var else a if last_idx == idx: break if not matched: break return matched def match_empty(self, el: bs4.Tag) -> bool: """Check if element is empty (if requested).""" is_empty = True for child in self.get_children(el, tags=False): if self.is_tag(child): is_empty = False break elif self.is_content_string(child) and RE_NOT_EMPTY.search(child): is_empty = False break return is_empty def match_subselectors(self, el: bs4.Tag, selectors: tuple[ct.SelectorList, ...]) -> bool: """Match selectors.""" match = True for sel in selectors: if not self.match_selectors(el, sel): match = False return match def match_contains(self, el: bs4.Tag, contains: tuple[ct.SelectorContains, ...]) -> bool: """Match element if it contains text.""" match = True content = None # type: Optional[str | Sequence[str]] for contain_list in contains: if content is None: if contain_list.own: content = self.get_own_text(el, no_iframe=self.is_html) else: content = self.get_text(el, no_iframe=self.is_html) found = False for text in contain_list.text: if contain_list.own: for c in content: if text in c: found = True break if found: break else: if text in content: found = True break if not found: match = False return match def match_default(self, el: bs4.Tag) -> bool: """Match default.""" match = False # Find this input's form form = None parent = self.get_parent(el, no_iframe=True) while parent and form is None: if self.get_tag(parent) == 'form' and self.is_html_tag(parent): form = parent else: parent = self.get_parent(parent, no_iframe=True) # Look in form cache to see if we've already located its default button found_form = False for f, t in self.cached_default_forms: if f is form: found_form = True if t is el: match = True break # We didn't have the form cached, so look for its default button if not found_form: for child in self.get_descendants(form, no_iframe=True): name = self.get_tag(child) # Can't do nested forms (haven't figured out why we never hit this) if name == 'form': # pragma: no cover break if name in ('input', 'button'): v = self.get_attribute_by_name(child, 'type', '') if v and util.lower(v) == 'submit': self.cached_default_forms.append((form, child)) if el is child: match = True break return match def match_indeterminate(self, el: bs4.Tag) -> bool: """Match default.""" match = False name = cast(str, self.get_attribute_by_name(el, 'name')) def get_parent_form(el: bs4.Tag) -> Optional[bs4.Tag]: """Find this input's form.""" form = None parent = self.get_parent(el, no_iframe=True) while form is None: if self.get_tag(parent) == 'form' and self.is_html_tag(parent): form = parent break last_parent = parent parent = self.get_parent(parent, no_iframe=True) if parent is None: form = last_parent break return form form = get_parent_form(el) # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate found_form = False for f, n, i in self.cached_indeterminate_forms: if f is form and n == name: found_form = True if i is True: match = True break # We didn't have the form cached, so validate that the radio button is indeterminate if not found_form: checked = False for child in self.get_descendants(form, no_iframe=True): if child is el: continue tag_name = self.get_tag(child) if tag_name == 'input': is_radio = False check = False has_name = False for k, v in self.iter_attributes(child): if util.lower(k) == 'type' and util.lower(v) == 'radio': is_radio = True elif util.lower(k) == 'name' and v == name: has_name = True elif util.lower(k) == 'checked': check = True if is_radio and check and has_name and get_parent_form(child) is form: checked = True break if checked: break if not checked: match = True self.cached_indeterminate_forms.append((form, name, match)) return match def match_lang(self, el: bs4.Tag, langs: tuple[ct.SelectorLang, ...]) -> bool: """Match languages.""" match = False has_ns = self.supports_namespaces() root = self.root has_html_namespace = self.has_html_namespace # Walk parents looking for `lang` (HTML) or `xml:lang` XML property. parent = el found_lang = None last = None while not found_lang: has_html_ns = self.has_html_ns(parent) for k, v in self.iter_attributes(parent): attr_ns, attr = self.split_namespace(parent, k) if ( ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or ( has_ns and not has_html_ns and attr_ns == NS_XML and (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang' ) ): found_lang = v break last = parent parent = self.get_parent(parent, no_iframe=self.is_html) if parent is None: root = last has_html_namespace = self.has_html_ns(root) parent = last break # Use cached meta language. if found_lang is None and self.cached_meta_lang: for cache in self.cached_meta_lang: if root is cache[0]: found_lang = cache[1] # If we couldn't find a language, and the document is HTML, look to meta to determine language. if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')): # Find head found = False for tag in ('html', 'head'): found = False for child in self.get_children(parent, no_iframe=self.is_html): if self.get_tag(child) == tag and self.is_html_tag(child): found = True parent = child break if not found: # pragma: no cover break # Search meta tags if found: for child in parent: if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent): c_lang = False content = None for k, v in self.iter_attributes(child): if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language': c_lang = True if util.lower(k) == 'content': content = v if c_lang and content: found_lang = content self.cached_meta_lang.append((cast(str, root), cast(str, found_lang))) break if found_lang is not None: break if found_lang is None: self.cached_meta_lang.append((cast(str, root), '')) # If we determined a language, compare. if found_lang is not None: for patterns in langs: match = False for pattern in patterns: if self.extended_language_filter(pattern, cast(str, found_lang)): match = True if not match: break return match def match_dir(self, el: bs4.Tag, directionality: int) -> bool: """Check directionality.""" # If we have to match both left and right, we can't match either. if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL: return False if el is None or not self.is_html_tag(el): return False # Element has defined direction of left to right or right to left direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None) if direction not in (None, 0): return direction == directionality # Element is the document element (the root) and no direction assigned, assume left to right. is_root = self.is_root(el) if is_root and direction is None: return ct.SEL_DIR_LTR == directionality # If `input[type=telephone]` and no direction is assigned, assume left to right. name = self.get_tag(el) is_input = name == 'input' is_textarea = name == 'textarea' is_bdi = name == 'bdi' itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else '' if is_input and itype == 'tel' and direction is None: return ct.SEL_DIR_LTR == directionality # Auto handling for text inputs if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0: if is_textarea: temp = [] for node in self.get_contents(el, no_iframe=True): if self.is_content_string(node): temp.append(node) value = ''.join(temp) else: value = cast(str, self.get_attribute_by_name(el, 'value', '')) if value: for c in value: bidi = unicodedata.bidirectional(c) if bidi in ('AL', 'R', 'L'): direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL return direction == directionality # Assume left to right return ct.SEL_DIR_LTR == directionality elif is_root: return ct.SEL_DIR_LTR == directionality return self.match_dir(self.get_parent(el, no_iframe=True), directionality) # Auto handling for `bdi` and other non text inputs. if (is_bdi and direction is None) or direction == 0: direction = self.find_bidi(el) if direction is not None: return direction == directionality elif is_root: return ct.SEL_DIR_LTR == directionality return self.match_dir(self.get_parent(el, no_iframe=True), directionality) # Match parents direction return self.match_dir(self.get_parent(el, no_iframe=True), directionality) def match_range(self, el: bs4.Tag, condition: int) -> bool: """ Match range. Behavior is modeled after what we see in browsers. Browsers seem to evaluate if the value is out of range, and if not, it is in range. So a missing value will not evaluate out of range; therefore, value is in range. Personally, I feel like this should evaluate as neither in or out of range. """ out_of_range = False itype = util.lower(self.get_attribute_by_name(el, 'type')) mn = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'min', None))) mx = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'max', None))) # There is no valid min or max, so we cannot evaluate a range if mn is None and mx is None: return False value = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'value', None))) if value is not None: if itype in ("date", "datetime-local", "month", "week", "number", "range"): if mn is not None and value < mn: out_of_range = True if not out_of_range and mx is not None and value > mx: out_of_range = True elif itype == "time": if mn is not None and mx is not None and mn > mx: # Time is periodic, so this is a reversed/discontinuous range if value < mn and value > mx: out_of_range = True else: if mn is not None and value < mn: out_of_range = True if not out_of_range and mx is not None and value > mx: out_of_range = True return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range def match_defined(self, el: bs4.Tag) -> bool: """ Match defined. `:defined` is related to custom elements in a browser. - If the document is XML (not XHTML), all tags will match. - Tags that are not custom (don't have a hyphen) are marked defined. - If the tag has a prefix (without or without a namespace), it will not match. This is of course requires the parser to provide us with the proper prefix and namespace info, if it doesn't, there is nothing we can do. """ name = self.get_tag(el) return ( name is not None and ( name.find('-') == -1 or name.find(':') != -1 or self.get_prefix(el) is not None ) ) def match_placeholder_shown(self, el: bs4.Tag) -> bool: """ Match placeholder shown according to HTML spec. - text area should be checked if they have content. A single newline does not count as content. """ match = False content = self.get_text(el) if content in ('', '\n'): match = True return match def match_selectors(self, el: bs4.Tag, selectors: ct.SelectorList) -> bool: """Check if element matches one of the selectors.""" match = False is_not = selectors.is_not is_html = selectors.is_html # Internal selector lists that use the HTML flag, will automatically get the `html` namespace. if is_html: namespaces = self.namespaces iframe_restrict = self.iframe_restrict self.namespaces = {'html': NS_XHTML} self.iframe_restrict = True if not is_html or self.is_html: for selector in selectors: match = is_not # We have a un-matchable situation (like `:focus` as you can focus an element in this environment) if isinstance(selector, ct.SelectorNull): continue # Verify tag matches if not self.match_tag(el, selector.tag): continue # Verify tag is defined if selector.flags & ct.SEL_DEFINED and not self.match_defined(el): continue # Verify element is root if selector.flags & ct.SEL_ROOT and not self.match_root(el): continue # Verify element is scope if selector.flags & ct.SEL_SCOPE and not self.match_scope(el): continue # Verify element has placeholder shown if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el): continue # Verify `nth` matches if not self.match_nth(el, selector.nth): continue if selector.flags & ct.SEL_EMPTY and not self.match_empty(el): continue # Verify id matches if selector.ids and not self.match_id(el, selector.ids): continue # Verify classes match if selector.classes and not self.match_classes(el, selector.classes): continue # Verify attribute(s) match if not self.match_attributes(el, selector.attributes): continue # Verify ranges if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES): continue # Verify language patterns if selector.lang and not self.match_lang(el, selector.lang): continue # Verify pseudo selector patterns if selector.selectors and not self.match_subselectors(el, selector.selectors): continue # Verify relationship selectors if selector.relation and not self.match_relations(el, selector.relation): continue # Validate that the current default selector match corresponds to the first submit button in the form if selector.flags & ct.SEL_DEFAULT and not self.match_default(el): continue # Validate that the unset radio button is among radio buttons with the same name in a form that are # also not set. if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el): continue # Validate element directionality if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS): continue # Validate that the tag contains the specified text. if selector.contains and not self.match_contains(el, selector.contains): continue match = not is_not break # Restore actual namespaces being used for external selector lists if is_html: self.namespaces = namespaces self.iframe_restrict = iframe_restrict return match def select(self, limit: int = 0) -> Iterator[bs4.Tag]: """Match all tags under the targeted tag.""" lim = None if limit < 1 else limit for child in self.get_descendants(self.tag): if self.match(child): yield child if lim is not None: lim -= 1 if lim < 1: break def closest(self) -> Optional[bs4.Tag]: """Match closest ancestor.""" current = self.tag closest = None while closest is None and current is not None: if self.match(current): closest = current else: current = self.get_parent(current) return closest def filter(self) -> list[bs4.Tag]: # noqa A001 """Filter tag's children.""" return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)] def match(self, el: bs4.Tag) -> bool: """Match.""" return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors) class SoupSieve(ct.Immutable): """Compiled Soup Sieve selector matching object.""" pattern: str selectors: ct.SelectorList namespaces: Optional[ct.Namespaces] custom: dict[str, str] flags: int __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash") def __init__( self, pattern: str, selectors: ct.SelectorList, namespaces: Optional[ct.Namespaces], custom: Optional[ct.CustomSelectors], flags: int ): """Initialize.""" super().__init__( pattern=pattern, selectors=selectors, namespaces=namespaces, custom=custom, flags=flags ) def match(self, tag: bs4.Tag) -> bool: """Match.""" return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag) def closest(self, tag: bs4.Tag) -> bs4.Tag: """Match closest ancestor.""" return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest() def filter(self, iterable: Iterable[bs4.Tag]) -> list[bs4.Tag]: # noqa A001 """ Filter. `CSSMatch` can cache certain searches for tags of the same document, so if we are given a tag, all tags are from the same document, and we can take advantage of the optimization. Any other kind of iterable could have tags from different documents or detached tags, so for those, we use a new `CSSMatch` for each item in the iterable. """ if CSSMatch.is_tag(iterable): return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter() else: return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)] def select_one(self, tag: bs4.Tag) -> bs4.Tag: """Select a single tag.""" tags = self.select(tag, limit=1) return tags[0] if tags else None def select(self, tag: bs4.Tag, limit: int = 0) -> list[bs4.Tag]: """Select the specified tags.""" return list(self.iselect(tag, limit)) def iselect(self, tag: bs4.Tag, limit: int = 0) -> Iterator[bs4.Tag]: """Iterate the specified tags.""" for el in CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit): yield el def __repr__(self) -> str: # pragma: no cover """Representation.""" return "SoupSieve(pattern={!r}, namespaces={!r}, custom={!r}, flags={!r})".format( self.pattern, self.namespaces, self.custom, self.flags ) __str__ = __repr__ ct.pickle_register(SoupSieve)