The Dormouse's story

import pytest import re import warnings from . import ( SoupTest, ) from typing import ( Callable, Optional, Tuple, ) from bs4.element import Tag from bs4.filter import ( AttributeValueMatchRule, ElementFilter, MatchRule, SoupStrainer, StringMatchRule, TagNameMatchRule, ) from bs4._typing import _RawAttributeValues class TestElementFilter(SoupTest): def test_default_behavior(self): # An unconfigured ElementFilter matches absolutely everything. selector = ElementFilter() assert not selector.excludes_everything assert selector.includes_everything soup = self.soup("text") tag = soup.a string = tag.string assert True is selector.match(soup) assert True is selector.match(tag) assert True is selector.match(string) assert soup.find(selector).name == "a" # And allows any incoming markup to be turned into PageElements. assert True is selector.allow_tag_creation(None, "tag", None) assert True is selector.allow_string_creation("some string") def test_setup_with_match_function(self): # Configure an ElementFilter with a match function and # we can no longer state with certainty that it includes everything. selector = ElementFilter(lambda x: False) assert not selector.includes_everything def test_match(self): def m(pe): return pe.string == "allow" or (isinstance(pe, Tag) and pe.name == "allow") soup = self.soup("denyallowdeny") allow_tag = soup.allow allow_string = soup.find(string="allow") deny_tag = soup.deny deny_string = soup.find(string="deny") selector = ElementFilter(match_function=m) assert True is selector.match(allow_tag) assert True is selector.match(allow_string) assert False is selector.match(deny_tag) assert False is selector.match(deny_string) # Since only the match function was provided, there is # no effect on tag or string creation. soup = self.soup("text", parse_only=selector) assert "text" == soup.a.string def test_allow_tag_creation(self): # By default, ElementFilter.allow_tag_creation allows everything. filter = ElementFilter() f = filter.allow_tag_creation assert True is f("allow", "ignore", {}) assert True is f("ignore", "allow", {}) assert True is f(None, "ignore", {"allow": "1"}) assert True is f("no", "no", {"no": "nope"}) # You can customize this behavior by overriding # allow_tag_creation in a subclass. class MyFilter(ElementFilter): def allow_tag_creation( self, nsprefix: Optional[str], name: str, attrs: Optional[_RawAttributeValues], ): return ( nsprefix == "allow" or name == "allow" or (attrs is not None and "allow" in attrs) ) filter = MyFilter() f = filter.allow_tag_creation assert True is f("allow", "ignore", {}) assert True is f("ignore", "allow", {}) assert True is f(None, "ignore", {"allow": "1"}) assert False is f("no", "no", {"no": "nope"}) # Test the customized ElementFilter as a value for parse_only. soup = self.soup( "deny deny allow", parse_only=filter ) # The tag was filtered out, but there was no effect on # the strings, since only allow_tag_creation_function was # overridden. assert "deny deny allow" == soup.decode() # Similarly, since match_function was not defined, this # ElementFilter matches everything. assert soup.find(filter) == "deny" def test_allow_string_creation(self): # By default, ElementFilter.allow_string_creation allows everything. filter = ElementFilter() f = filter.allow_string_creation assert True is f("allow") assert True is f("deny") assert True is f("please allow") # You can customize this behavior by overriding allow_string_creation # in a subclass. class MyFilter(ElementFilter): def allow_string_creation(self, s: str): return s == "allow" filter = MyFilter() f = filter.allow_string_creation assert True is f("allow") assert False is f("deny") assert False is f("please allow") # Test the customized ElementFilter as a value for parse_only. soup = self.soup( "deny deny allow", parse_only=filter ) # All incoming strings other than "allow" (even whitespace) # were filtered out, but there was no effect on the tags, # since only allow_string_creation_function was defined. assert "denydeny" == soup.decode() # Similarly, since match_function was not defined, this # ElementFilter matches everything. assert soup.find(filter).name == "deny" class TestMatchRule(SoupTest): def _tuple( self, rule: MatchRule ) -> Tuple[Optional[str], Optional[str], Optional[Callable], Optional[bool]]: return ( rule.string, rule.pattern.pattern if rule.pattern else None, rule.function, rule.present, ) @staticmethod def tag_function(x: Tag) -> bool: return False @staticmethod def string_function(x: str) -> bool: return False @pytest.mark.parametrize( "constructor_args, constructor_kwargs, result", [ # String ([], dict(string="a"), ("a", None, None, None)), ( [], dict(string="\N{SNOWMAN}".encode("utf8")), ("\N{SNOWMAN}", None, None, None), ), # Regular expression ([], dict(pattern=re.compile("a")), (None, "a", None, None)), ([], dict(pattern="b"), (None, "b", None, None)), ([], dict(pattern=b"c"), (None, "c", None, None)), # Function ([], dict(function=tag_function), (None, None, tag_function, None)), ([], dict(function=string_function), (None, None, string_function, None)), # Boolean ([], dict(present=True), (None, None, None, True)), # With positional arguments rather than keywords (("a", None, None, None), {}, ("a", None, None, None)), ((None, "b", None, None), {}, (None, "b", None, None)), ((None, None, tag_function, None), {}, (None, None, tag_function, None)), ((None, None, None, True), {}, (None, None, None, True)), ], ) def test_constructor(self, constructor_args, constructor_kwargs, result): rule = MatchRule(*constructor_args, **constructor_kwargs) assert result == self._tuple(rule) def test_empty_match_not_allowed(self): with pytest.raises( ValueError, match="Either string, pattern, function, present, or exclude_everything must be provided.", ): MatchRule() def test_full_match_not_allowed(self): with pytest.raises( ValueError, match="At most one of string, pattern, function, present, and exclude_everything must be provided.", ): MatchRule("a", "b", self.tag_function, True) @pytest.mark.parametrize( "rule_kwargs, match_against, result", [ (dict(string="a"), "a", True), (dict(string="a"), "ab", False), (dict(pattern="a"), "a", True), (dict(pattern="a"), "ab", True), (dict(pattern="^a$"), "a", True), (dict(pattern="^a$"), "ab", False), (dict(present=True), "any random value", True), (dict(present=True), None, False), (dict(present=False), "any random value", False), (dict(present=False), None, True), (dict(function=lambda x: x.upper() == x), "UPPERCASE", True), (dict(function=lambda x: x.upper() == x), "lowercase", False), (dict(function=lambda x: x.lower() == x), "UPPERCASE", False), (dict(function=lambda x: x.lower() == x), "lowercase", True), ], ) def test_matches_string(self, rule_kwargs, match_against, result): rule = MatchRule(**rule_kwargs) assert rule.matches_string(match_against) == result class TestTagNameMatchRule(SoupTest): @pytest.mark.parametrize( "rule_kwargs, tag_kwargs, result", [ (dict(string="a"), dict(name="a"), True), (dict(string="a"), dict(name="ab"), False), (dict(pattern="a"), dict(name="a"), True), (dict(pattern="a"), dict(name="ab"), True), (dict(pattern="^a$"), dict(name="a"), True), (dict(pattern="^a$"), dict(name="ab"), False), # This isn't very useful, but it will work. (dict(present=True), dict(name="any random value"), True), (dict(present=False), dict(name="any random value"), False), ( dict(function=lambda t: t.name in t.attrs), dict(name="id", attrs=dict(id="a")), True, ), ( dict(function=lambda t: t.name in t.attrs), dict(name="id", attrs={"class": "a"}), False, ), ], ) def test_matches_tag(self, rule_kwargs, tag_kwargs, result): rule = TagNameMatchRule(**rule_kwargs) tag = Tag(**tag_kwargs) assert rule.matches_tag(tag) == result def test_matches_tag_only_passes_tag_to_function(self): def arg1_must_be_tag(t): if not isinstance(t, Tag): raise ValueError("Non-tag passed into tag name match rule function") return True rule = TagNameMatchRule(function=arg1_must_be_tag) assert rule.matches_tag(Tag(name="tagname")) is True # AttributeValueMatchRule and StringMatchRule have the same # logic as MatchRule. class TestSoupStrainer(SoupTest): def test_constructor_string_deprecated_text_argument(self): with warnings.catch_warnings(record=True) as w: strainer = SoupStrainer(text="text") assert strainer.text == "text" [w1, w2] = w msg = str(w1.message) assert w1.filename == __file__ assert ( msg == "As of version 4.11.0, the 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead." ) msg = str(w2.message) assert w2.filename == __file__ assert ( msg == "Access to deprecated property text. (Look at .string_rules instead) -- Deprecated since version 4.13.0." ) def test_search_tag_deprecated(self): strainer = SoupStrainer(name="a") with warnings.catch_warnings(record=True) as w: assert False is strainer.search_tag("b", {}) [w1] = w msg = str(w1.message) assert w1.filename == __file__ assert ( msg == "Call to deprecated method search_tag. (Replaced by allow_tag_creation) -- Deprecated since version 4.13.0." ) def test_search_deprecated(self): strainer = SoupStrainer(name="a") soup = self.soup("") with warnings.catch_warnings(record=True) as w: assert soup.a == strainer.search(soup.a) assert None is strainer.search(soup.b) [w1, w2] = w msg = str(w1.message) assert msg == str(w2.message) assert w1.filename == __file__ assert ( msg == "Call to deprecated method search. (Replaced by match) -- Deprecated since version 4.13.0." ) # Dummy function used within tests. def _match_function(x): pass def test_constructor_default(self): # The default SoupStrainer matches all tags, and only tags. strainer = SoupStrainer() [name_rule] = strainer.name_rules assert True == name_rule.present assert 0 == len(strainer.attribute_rules) assert 0 == len(strainer.string_rules) def test_constructor(self): strainer = SoupStrainer( "tagname", {"attr1": "value"}, string=self._match_function, attr2=["value1", False], ) [name_rule] = strainer.name_rules assert name_rule == TagNameMatchRule(string="tagname") [attr1_rule] = strainer.attribute_rules.pop("attr1") assert attr1_rule == AttributeValueMatchRule(string="value") [attr2_rule1, attr2_rule2] = strainer.attribute_rules.pop("attr2") assert attr2_rule1 == AttributeValueMatchRule(string="value1") assert attr2_rule2 == AttributeValueMatchRule(present=False) assert not strainer.attribute_rules [string_rule] = strainer.string_rules assert string_rule == StringMatchRule(function=self._match_function) def test_scalar_attrs_becomes_class_restriction(self): # For the sake of convenience, passing a scalar value as # ``args`` results in a restriction on the 'class' attribute. strainer = SoupStrainer(attrs="mainbody") assert [] == strainer.name_rules assert [] == strainer.string_rules assert {"class": [AttributeValueMatchRule(string="mainbody")]} == ( strainer.attribute_rules ) def test_constructor_class_attribute(self): # The 'class' HTML attribute is also treated specially because # it's a Python reserved word. Passing in "class_" as a # keyword argument results in a restriction on the 'class' # attribute. strainer = SoupStrainer(class_="mainbody") assert [] == strainer.name_rules assert [] == strainer.string_rules assert {"class": [AttributeValueMatchRule(string="mainbody")]} == ( strainer.attribute_rules ) # But if you pass in "class_" as part of the ``attrs`` dict # it's not changed. (Otherwise there'd be no way to actually put # a restriction on an attribute called "class_".) strainer = SoupStrainer(attrs=dict(class_="mainbody")) assert [] == strainer.name_rules assert [] == strainer.string_rules assert {"class_": [AttributeValueMatchRule(string="mainbody")]} == ( strainer.attribute_rules ) def test_constructor_with_overlapping_attributes(self): # If you specify the same attribute in args and **kwargs, you end up # with two different AttributeValueMatchRule objects. # This happens whether you use the 'class' shortcut on attrs... strainer = SoupStrainer(attrs="class1", class_="class2") rule1, rule2 = strainer.attribute_rules["class"] assert rule1.string == "class1" assert rule2.string == "class2" # Or explicitly specify the same attribute twice. strainer = SoupStrainer(attrs={"id": "id1"}, id="id2") rule1, rule2 = strainer.attribute_rules["id"] assert rule1.string == "id1" assert rule2.string == "id2" @pytest.mark.parametrize( "obj, result", [ ("a", MatchRule(string="a")), (b"a", MatchRule(string="a")), (True, MatchRule(present=True)), (False, MatchRule(present=False)), (re.compile("a"), MatchRule(pattern=re.compile("a"))), (_match_function, MatchRule(function=_match_function)), # Pass in a list and get back a list of rules. (["a", b"b"], [MatchRule(string="a"), MatchRule(string="b")]), ( [re.compile("a"), _match_function], [ MatchRule(pattern=re.compile("a")), MatchRule(function=_match_function), ], ), # Anything that doesn't fit is converted to a string. (100, MatchRule(string="100")), ], ) def test__make_match_rules(self, obj, result): actual = list(SoupStrainer._make_match_rules(obj, MatchRule)) # Helper to reduce the number of single-item lists in the # parameters. if len(actual) == 1: [actual] = actual assert result == actual @pytest.mark.parametrize( "cls, result", [ (AttributeValueMatchRule, AttributeValueMatchRule(string="a")), (StringMatchRule, StringMatchRule(string="a")), ], ) def test__make_match_rules_different_classes(self, cls, result): actual = cls(string="a") assert actual == result def test__make_match_rules_nested_list(self): # If you pass a nested list into _make_match_rules, it's # turned into a restriction that excludes everything, to avoid the # possibility of an infinite recursion. # Create a self-referential object. selfref = [] selfref.append(selfref) with warnings.catch_warnings(record=True) as w: rules = SoupStrainer._make_match_rules(["a", selfref, "b"], MatchRule) assert list(rules) == [MatchRule(string="a"), MatchRule(exclude_everything=True), MatchRule(string="b")] [warning] = w # Don't check the filename because the stacklevel is # designed for normal use and we're testing the private # method directly. msg = str(warning.message) assert ( msg == "Ignoring nested list [[...]] to avoid the possibility of infinite recursion." ) def tag_matches( self, strainer: SoupStrainer, name: str, attrs: Optional[_RawAttributeValues] = None, string: Optional[str] = None, prefix: Optional[str] = None, ) -> bool: # Create a Tag with the given prefix, name and attributes, # then make sure that strainer.matches_tag and allow_tag_creation # both approve it. tag = Tag(prefix=prefix, name=name, attrs=attrs) if string: tag.string = string return strainer.matches_tag(tag) and strainer.allow_tag_creation( prefix, name, attrs ) def test_matches_tag_with_only_string(self): # A SoupStrainer that only has StringMatchRules won't ever # match a Tag. strainer = SoupStrainer(string=["a string", re.compile("string")]) tag = Tag(name="b", attrs=dict(id="1")) tag.string = "a string" assert not strainer.matches_tag(tag) # There has to be a TagNameMatchRule or an # AttributeValueMatchRule as well. strainer.name_rules.append(TagNameMatchRule(string="b")) assert strainer.matches_tag(tag) strainer.name_rules = [] strainer.attribute_rules["id"] = [AttributeValueMatchRule("1")] assert strainer.matches_tag(tag) def test_matches_tag_with_prefix(self): # If a tag has an attached namespace prefix, the tag's name is # tested both with and without the prefix. kwargs = dict(name="a", prefix="ns") assert self.tag_matches(SoupStrainer(name="a"), **kwargs) assert self.tag_matches(SoupStrainer(name="ns:a"), **kwargs) assert not self.tag_matches(SoupStrainer(name="ns2:a"), **kwargs) def test_one_name_rule_must_match(self): # If there are TagNameMatchRule, at least one must match. kwargs = dict(name="b") assert self.tag_matches(SoupStrainer(name="b"), **kwargs) assert not self.tag_matches(SoupStrainer(name="c"), **kwargs) assert self.tag_matches(SoupStrainer(name=["c", "d", "d", "b"]), **kwargs) assert self.tag_matches( SoupStrainer(name=[re.compile("c-f"), re.compile("[ab]$")]), **kwargs ) def test_one_attribute_rule_must_match_for_each_attribute(self): # If there is one or more AttributeValueMatchRule for a given # attribute, at least one must match that attribute's # value. This is true for *every* attribute -- just matching one # attribute isn't enough. kwargs = dict(name="b", attrs={"class": "main", "id": "1"}) # 'class' and 'id' match assert self.tag_matches( SoupStrainer( class_=["other", "main"], id=["20", "a", re.compile("^[0-9]")] ), **kwargs, ) # 'class' and 'id' are present and 'data' attribute is missing assert self.tag_matches( SoupStrainer(class_=True, id=True, data=False), **kwargs ) # 'id' matches, 'class' does not. assert not self.tag_matches(SoupStrainer(class_=["other"], id=["2"]), **kwargs) # 'class' matches, 'id' does not assert not self.tag_matches(SoupStrainer(class_=["main"], id=["2"]), **kwargs) # 'class' and 'id' match but 'data' attribute is missing assert not self.tag_matches( SoupStrainer(class_=["main"], id=["1"], data=True), **kwargs ) def test_match_against_multi_valued_attribute(self): # If an attribute has multiple values, only one of them # has to match the AttributeValueMatchRule. kwargs = dict(name="b", attrs={"class": ["main", "big"]}) assert self.tag_matches(SoupStrainer(attrs="main"), **kwargs) assert self.tag_matches(SoupStrainer(attrs="big"), **kwargs) assert self.tag_matches(SoupStrainer(attrs=["main", "big"]), **kwargs) assert self.tag_matches(SoupStrainer(attrs=["big", "small"]), **kwargs) assert not self.tag_matches(SoupStrainer(attrs=["small", "smaller"]), **kwargs) def test_match_against_multi_valued_attribute_as_string(self): # If an attribute has multiple values, you can treat the entire # thing as one string during a match. kwargs = dict(name="b", attrs={"class": ["main", "big"]}) assert self.tag_matches(SoupStrainer(attrs="main big"), **kwargs) # But you can't put them in any order; it's got to be the # order they are present in the Tag, which basically means the # order they were originally present in the document. assert not self.tag_matches(SoupStrainer(attrs=["big main"]), **kwargs) def test_one_string_rule_must_match(self): # If there's a TagNameMatchRule and/or an # AttributeValueMatchRule, then the StringMatchRule is _not_ # ignored, and must match as well. tag = Tag(name="b", attrs=dict(id="1")) tag.string = "A string" assert SoupStrainer(name="b", string="A string").matches_tag(tag) assert not SoupStrainer(name="a", string="A string").matches_tag(tag) assert not SoupStrainer(name="a", string="Wrong string").matches_tag(tag) assert SoupStrainer(id="1", string="A string").matches_tag(tag) assert not SoupStrainer(id="2", string="A string").matches_tag(tag) assert not SoupStrainer(id="1", string="Wrong string").matches_tag(tag) assert SoupStrainer(name="b", id="1", string="A string").matches_tag(tag) # If there are multiple string rules, only one needs to match. assert SoupStrainer( name="b", id="1", string=["Wrong string", "Also wrong", re.compile("string")], ).matches_tag(tag) def test_allowing_tag_implies_allowing_its_contents(self): markup = "one string
another string
" # Letting the tag through implies parsing the
tag # and both strings, even though they wouldn't match the # SoupStrainer on their own. assert ( "one string
another string
" == self.soup(markup, parse_only=SoupStrainer(name="b")).decode() ) @pytest.mark.parametrize( "soupstrainer", [ SoupStrainer(name="b", string="one string"), SoupStrainer(name="div", string="another string"), ], ) def test_parse_only_combining_tag_and_string(self, soupstrainer): # If you pass parse_only a SoupStrainer that contains both tag # restrictions and string restrictions, you get no results, # because the string restrictions can't be evaluated during # the parsing process, and the tag restrictions eliminate # any strings from consideration. # # We can detect this ahead of time, and warn about it, # thanks to SoupStrainer.excludes_everything markup = "one string
another string
" with warnings.catch_warnings(record=True) as w: assert True, soupstrainer.excludes_everything assert "" == self.soup(markup, parse_only=soupstrainer).decode() [warning] = w str(warning.message) assert warning.filename == __file__ assert str(warning.message).startswith( "The given value for parse_only will exclude everything:" ) # The average SoupStrainer has excludes_everything=False assert not SoupStrainer().excludes_everything def test_documentation_examples(self): """Medium-weight real-world tests based on the Beautiful Soup documentation. """ html_doc = """The Dormouse's story
The Dormouse's story

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

...
""" only_a_tags = SoupStrainer("a") only_tags_with_id_link2 = SoupStrainer(id="link2") def is_short_string(string): return string is not None and len(string) < 10 only_short_strings = SoupStrainer(string=is_short_string) a_soup = self.soup(html_doc, parse_only=only_a_tags) assert ( 'Elsie Lacie Tillie' == a_soup.decode() ) id_soup = self.soup(html_doc, parse_only=only_tags_with_id_link2) assert ( 'Lacie' == id_soup.decode() ) string_soup = self.soup(html_doc, parse_only=only_short_strings) assert "\n\n\nElsie,\nLacie and\nTillie\n...\n" == string_soup.decode()