mirror of https://git.launchpad.net/beautifulsoup synced 2025-10-06 00:12:49 +02:00

Compare commits


36 Commits

Author SHA1 Message Date
Leonard Richardson
c7a09cafe5 Mention pyright instead of PyLance. 2025-09-27 13:16:42 -04:00
Leonard Richardson
9a7cad9a6d Added a type-checking smoke test. 2025-09-27 13:16:17 -04:00
Leonard Richardson
5fc1a2b87d Added some tests of ResultSet's new features. 2025-09-27 13:12:49 -04:00
Leonard Richardson
d2286be163 Added a link to the pyright issue behind the suppression of the pyright warning that makes overloading possible. 2025-09-27 13:04:12 -04:00
Leonard Richardson
6ea6df0753 Remove filter_tags, filter_strings, find_tag, find_string, find_all_tags, and find_all_strings from the ElementFilter API. Hopefully the overloads will make them unnecessary. If they turn out to be necessary, this commit is where the code is. 2025-09-27 13:00:28 -04:00
Leonard Richardson
e34c6fc6b9 Updated CHANGELOG in preparation for merge. 2025-09-27 12:45:49 -04:00
Leonard Richardson
bf17d951d2 FINALLY got all of the overloads working with both mypy and pyright. 2025-09-14 13:02:25 -04:00
Leonard Richardson
cd548aaed4 Overloaded find_all_next. 2025-09-14 12:57:45 -04:00
Leonard Richardson
f58228ff0b Overloaded find_all_previous. 2025-09-14 12:56:33 -04:00
Leonard Richardson
222984beae Mention the attrs change, just in case. 2025-09-14 12:47:36 -04:00
Leonard Richardson
da76f83d8a Mention the attrs change, just in case. 2025-09-14 12:44:10 -04:00
Leonard Richardson
a7cb7745f7 Updated changelog with an accurate list of methods that have changed. 2025-09-14 12:43:37 -04:00
Leonard Richardson
f723296e22 find_parents can return _SomeTags. 2025-09-14 12:27:17 -04:00
Leonard Richardson
f320fa4fce find_parent doesn't need overloads. 2025-09-14 12:25:17 -04:00
Leonard Richardson
23c15c2fb1 Overloaded find_previous. 2025-09-14 12:18:07 -04:00
Leonard Richardson
9bae5a1201 Overloaded find_previous. 2025-09-14 12:16:18 -04:00
Leonard Richardson
4c5cefec13 Overloaded find_next_sibling 2025-09-14 12:13:47 -04:00
Leonard Richardson
0e3199db71 Going back to this after comparing source code with other versions. 2025-09-14 12:13:11 -04:00
Leonard Richardson
d225e021b9 @overload of any additional find_* methods isn't working, I think due to false positives from pyright. 2025-09-14 12:09:11 -04:00
Leonard Richardson
94408c845d Overloaded find_next. 2025-09-14 10:58:07 -04:00
Leonard Richardson
7c06afd400 That pyright warning is ignorable, I think. 2025-09-14 09:20:42 -04:00
Leonard Richardson
c1c6162581 Fixed bad use of Union. 2025-09-13 12:00:39 -04:00
Leonard Richardson
4f44b052a1 Merge branch 'master' into overloading-find 2025-09-13 10:44:10 -04:00
Leonard Richardson
cd24e0084b OK, I can't use the aliases there I guess. 2025-08-09 11:58:49 -04:00
Leonard Richardson
7f4b643e98 Added the function overloading to the changelog. 2025-08-09 11:53:53 -04:00
Leonard Richardson
62566d8d48 Documented filter_tags and filter_strings. 2025-08-09 11:03:50 -04:00
Leonard Richardson
6ba889aa56 Sequence, not list. 2025-08-09 10:28:32 -04:00
Leonard Richardson
c909c29d12 This seems to resolve the remaining problems around overloading. 2025-08-03 12:43:16 -04:00
Leonard Richardson
977ec996fe Merge branch 'master' into overloading-find 2025-07-26 11:44:21 -04:00
Leonard Richardson
57e5074727 Merge branch 'master' into overloading-find 2025-07-26 09:37:05 -04:00
Leonard Richardson
b06c53ddc1 I'm pretty sure whatever the solution is requires these extra methods at the lowest level. 2025-05-27 17:51:18 -04:00
Leonard Richardson
951992740d Make ResultSet implement the abstract methods of Sequence. 2025-05-26 14:58:20 -04:00
Leonard Richardson
1d51b7ce0d OK, let's start by just trying to get find_all and find right. 2025-05-26 11:49:29 -04:00
Leonard Richardson
41280f027f Merge branch 'master' into overloading-find 2025-05-26 11:45:34 -04:00
Leonard Richardson
d17a44f321 OK, I think there is an overload signature that works for almost all cases. 2025-05-26 11:13:05 -04:00
Leonard Richardson
4e69cb357c Starting on a project to add overloaded signatures to the find() methods. 2025-05-26 08:51:36 -04:00
10 changed files with 567 additions and 63 deletions

View File

@@ -1,9 +1,102 @@
= Unreleased
= 4.14.0 (20250927)
* This version adds function overloading to the find_* methods to make
it easier to write type-safe Python.
In most cases you can simply assign the result of a find() or
find_all() call to a variable annotated with the type of object
you're expecting to get back: a Tag, a NavigableString, a
Sequence[Tag], or a Sequence[NavigableString]. It's now very rare
that you'll have to do a cast or suppress type-checker warnings the
way you did in previous versions of Beautiful Soup.
(In fact, the only time you should still have to do this is when you
pass both 'string' and one of the other arguments to one of the
find* methods, e.g. tag.find("a", string="tag contents").)
The following code has been verified to pass type checking using
mypy, pyright, and the Visual Studio Code IDE. It's available in
the source repository as scripts/type_checking_smoke_test.py.
---
from typing import Optional, Sequence
from bs4 import BeautifulSoup, Tag, NavigableString
soup = BeautifulSoup("<p>", 'html.parser')
tag:Optional[Tag]
string:Optional[NavigableString]
tags:Sequence[Tag]
strings:Sequence[NavigableString]
tag = soup.find()
tag = soup.find(id="a")
string = soup.find(string="b")
tags = soup()
tags = soup(id="a")
strings = soup(string="b")
tags = soup.find_all()
tags = soup.find_all(id="a")
strings = soup.find_all(string="b")
tag = soup.find_next()
tag = soup.find_next(id="a")
string = soup.find_next(string="b")
tags = soup.find_all_next()
tags = soup.find_all_next(id="a")
strings = soup.find_all_next(string="b")
tag = soup.find_next_sibling()
tag = soup.find_next_sibling(id="a")
string = soup.find_next_sibling(string="b")
tags = soup.find_next_siblings()
tags = soup.find_next_siblings(id="a")
strings = soup.find_next_siblings(string="b")
tag = soup.find_previous()
tag = soup.find_previous(id="a")
string = soup.find_previous(string="b")
tags = soup.find_all_previous()
tags = soup.find_all_previous(id="a")
strings = soup.find_all_previous(string="b")
tag = soup.find_previous_sibling()
tag = soup.find_previous_sibling(id="a")
string = soup.find_previous_sibling(string="bold")
tags = soup.find_previous_siblings()
tags = soup.find_previous_siblings(id="a")
strings = soup.find_previous_siblings(string="b")
tag = soup.find_parent()
tag = soup.find_parent(id="a")
tags = soup.find_parents()
tags = soup.find_parents(id="a")
# This code will work, but mypy and pyright will both flag it.
tags = soup.find_all("a", string="b")
---
* The typing for find_parent() and find_parents() was improved without
any overloading. Casts should never be necessary, since those
methods only ever return Tag and ResultSet[Tag], respectively.
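For example, under the new signatures both of these assignments
should type-check with no cast (a minimal illustrative sketch, not
taken from the library's test suite):
---
from typing import Optional
from bs4 import BeautifulSoup, Tag
from bs4.element import ResultSet
soup = BeautifulSoup("<div><p><b>x</b></p></div>", "html.parser")
b = soup.b
assert b is not None
parent: Optional[Tag] = b.find_parent("p")       # no cast needed
parents: ResultSet[Tag] = b.find_parents("div")  # no cast needed
---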
* ResultSet now inherits from Sequence. This should make it easier to
incorporate ResultSet objects into your type system without needing to
handle ResultSet specially.
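For example, code that is typed against Sequence works directly with
ResultSet objects (an illustrative sketch; the helper function is not
part of Beautiful Soup):
---
from typing import List, Sequence
from bs4 import BeautifulSoup, Tag
def link_targets(links: Sequence[Tag]) -> List[str]:
    # Accepts any Sequence[Tag], including a ResultSet[Tag].
    return [str(link.get("href", "")) for link in links]
soup = BeautifulSoup('<a href="/a">one</a> <a href="/b">two</a>', "html.parser")
print(link_targets(soup.find_all("a")))  # ['/a', '/b']
---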
* Fixed an unhandled exception when creating the string representation of
a decomposed element. (The output is not *useful* and you still
shouldn't do this, but it won't raise an exception anymore.) [bug=2120300]
* The default value of the 'attrs' argument to the find* methods is now
None rather than the empty dictionary. This should have no visible
effect on anything.
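In particular, explicitly passing an empty dictionary still behaves
the same as omitting the argument (an illustrative sketch):
---
from bs4 import BeautifulSoup
soup = BeautifulSoup("<a>x</a>", "html.parser")
assert list(soup.find_all("a")) == list(soup.find_all("a", attrs={}))
---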
= 4.13.5 (20250824)
* Fixed an unhandled exception when parsing invalid markup that contains the { character

View File

@@ -15,7 +15,7 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.13.5"
__version__ = "4.14.0"
__copyright__ = "Copyright (c) 2004-2025 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"

View File

@@ -198,4 +198,8 @@ _StrainableAttributes: TypeAlias = Dict[str, _StrainableAttribute]
#: are available on the objects they're dealing with.
_OneElement: TypeAlias = Union["PageElement", "Tag", "NavigableString"]
_AtMostOneElement: TypeAlias = Optional[_OneElement]
_AtMostOneTag: TypeAlias = Optional["Tag"]
_AtMostOneNavigableString: TypeAlias = Optional["NavigableString"]
_QueryResults: TypeAlias = "ResultSet[_OneElement]"
_SomeTags: TypeAlias = "ResultSet[Tag]"
_SomeNavigableStrings: TypeAlias = "ResultSet[NavigableString]"

View File

@@ -20,6 +20,7 @@ from typing import (
cast,
Iterable,
Iterator,
Sequence,
Optional,
TYPE_CHECKING,
)
@@ -88,7 +89,7 @@ class CSS(object):
ns = self.tag._namespaces
return ns
def _rs(self, results: Iterable[Tag]) -> ResultSet[Tag]:
def _rs(self, results: Sequence[Tag]) -> ResultSet[Tag]:
"""Normalize a list of results to a py:class:`ResultSet`.
A py:class:`ResultSet` is more consistent with the rest of

View File

@@ -30,6 +30,7 @@ from typing import (
Mapping,
Optional,
Pattern,
Sequence,
Set,
TYPE_CHECKING,
Tuple,
@@ -54,6 +55,8 @@ if TYPE_CHECKING:
)
from bs4._typing import (
_AtMostOneElement,
_AtMostOneTag,
_AtMostOneNavigableString,
_AttributeValue,
_AttributeValues,
_Encoding,
@@ -65,6 +68,8 @@ if TYPE_CHECKING:
_StrainableAttribute,
_StrainableAttributes,
_StrainableString,
_SomeNavigableStrings,
_SomeTags,
)
_OneOrMoreStringTypes: TypeAlias = Union[
@@ -746,13 +751,35 @@ class PageElement(object):
return results
# For the suppression of this pyright warning, see discussion here:
# https://github.com/microsoft/pyright/issues/10929
@overload
def find_next( # pyright: ignore [reportOverlappingOverload]
self,
name: _FindMethodName = None,
attrs: Optional[_StrainableAttributes] = None,
string: None=None,
**kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
...
@overload
def find_next(
self,
name: None=None,
attrs: None=None,
string: _StrainableString="",
**kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString:
...
def find_next(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = None,
string: Optional[_StrainableString] = None,
**kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
"""Find the first PageElement that matches the given criteria and
appears later in the document than this PageElement.
@@ -768,15 +795,39 @@ class PageElement(object):
findNext = _deprecated_function_alias("findNext", "find_next", "4.0.0")
@overload
def find_all_next( # pyright: ignore [reportOverlappingOverload]
self,
name: _FindMethodName = None,
attrs: Optional[_StrainableAttributes] = None,
string: None = None,
limit: Optional[int] = None,
_stacklevel: int = 2,
**kwargs: _StrainableAttribute,
) -> _SomeTags:
...
@overload
def find_all_next(
self,
name: None = None,
attrs: None = None,
string: _StrainableString = "",
limit: Optional[int] = None,
_stacklevel: int = 2,
**kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
...
def find_all_next(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = None,
string: Optional[_StrainableString] = None,
limit: Optional[int] = None,
_stacklevel: int = 2,
**kwargs: _StrainableAttribute,
) -> _QueryResults:
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
"""Find all `PageElement` objects that match the given criteria and
appear later in the document than this `PageElement`.
@@ -802,13 +853,33 @@ class PageElement(object):
findAllNext = _deprecated_function_alias("findAllNext", "find_all_next", "4.0.0")
@overload
def find_next_sibling( # pyright: ignore [reportOverlappingOverload]
self,
name: _FindMethodName = None,
attrs: Optional[_StrainableAttributes] = None,
string: None=None,
**kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
...
@overload
def find_next_sibling(
self,
name: None=None,
attrs: None=None,
string: _StrainableString="",
**kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString:
...
def find_next_sibling(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = None,
string: Optional[_StrainableString] = None,
**kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
"""Find the closest sibling to this PageElement that matches the
given criteria and appears later in the document.
@@ -826,15 +897,39 @@ class PageElement(object):
"findNextSibling", "find_next_sibling", "4.0.0"
)
@overload
def find_next_siblings( # pyright: ignore [reportOverlappingOverload]
self,
name: _FindMethodName = None,
attrs: Optional[_StrainableAttributes] = None,
string: None = None,
limit: Optional[int] = None,
_stacklevel: int = 2,
**kwargs: _StrainableAttribute,
) -> _SomeTags:
...
@overload
def find_next_siblings(
self,
name: None = None,
attrs: None = None,
string: _StrainableString = "",
limit: Optional[int] = None,
_stacklevel: int = 2,
**kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
...
def find_next_siblings(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = None,
string: Optional[_StrainableString] = None,
limit: Optional[int] = None,
_stacklevel: int = 2,
**kwargs: _StrainableAttribute,
) -> _QueryResults:
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
"""Find all siblings of this `PageElement` that match the given criteria
and appear later in the document.
@@ -865,13 +960,33 @@ class PageElement(object):
"fetchNextSiblings", "find_next_siblings", "3.0.0"
)
@overload
def find_previous( # pyright: ignore [reportOverlappingOverload]
self,
name: _FindMethodName = None,
attrs: Optional[_StrainableAttributes] = None,
string: None=None,
**kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
...
@overload
def find_previous(
self,
name: None=None,
attrs: None=None,
string: _StrainableString="",
**kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString:
...
def find_previous(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = None,
string: Optional[_StrainableString] = None,
**kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
"""Look backwards in the document from this `PageElement` and find the
first `PageElement` that matches the given criteria.
@@ -887,15 +1002,39 @@ class PageElement(object):
findPrevious = _deprecated_function_alias("findPrevious", "find_previous", "3.0.0")
@overload
def find_all_previous( # pyright: ignore [reportOverlappingOverload]
self,
name: _FindMethodName = None,
attrs: Optional[_StrainableAttributes] = None,
string: None = None,
limit: Optional[int] = None,
_stacklevel: int = 2,
**kwargs: _StrainableAttribute,
) -> _SomeTags:
...
@overload
def find_all_previous(
self,
name: None = None,
attrs: None = None,
string: _StrainableString = "",
limit: Optional[int] = None,
_stacklevel: int = 2,
**kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
...
def find_all_previous(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = None,
string: Optional[_StrainableString] = None,
limit: Optional[int] = None,
_stacklevel: int = 2,
**kwargs: _StrainableAttribute,
) -> _QueryResults:
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
"""Look backwards in the document from this `PageElement` and find all
`PageElement` that match the given criteria.
@@ -926,13 +1065,33 @@ class PageElement(object):
"fetchAllPrevious", "find_all_previous", "3.0.0"
)
@overload
def find_previous_sibling( # pyright: ignore [reportOverlappingOverload]
self,
name: _FindMethodName = None,
attrs: Optional[_StrainableAttributes] = None,
string: None=None,
**kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
...
@overload
def find_previous_sibling(
self,
name: None=None,
attrs: None=None,
string: _StrainableString="",
**kwargs: _StrainableAttribute,
) -> _AtMostOneNavigableString:
...
def find_previous_sibling(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = None,
string: Optional[_StrainableString] = None,
**kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
"""Returns the closest sibling to this `PageElement` that matches the
given criteria and appears earlier in the document.
@@ -952,15 +1111,39 @@ class PageElement(object):
"findPreviousSibling", "find_previous_sibling", "4.0.0"
)
@overload
def find_previous_siblings( # pyright: ignore [reportOverlappingOverload]
self,
name: _FindMethodName = None,
attrs: Optional[_StrainableAttributes] = None,
string: None = None,
limit: Optional[int] = None,
_stacklevel: int = 2,
**kwargs: _StrainableAttribute,
) -> _SomeTags:
...
@overload
def find_previous_siblings(
self,
name: None = None,
attrs: None = None,
string: _StrainableString = "",
limit: Optional[int] = None,
_stacklevel: int = 2,
**kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
...
def find_previous_siblings(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = None,
string: Optional[_StrainableString] = None,
limit: Optional[int] = None,
_stacklevel: int = 2,
**kwargs: _StrainableAttribute,
) -> _QueryResults:
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
"""Returns all siblings to this PageElement that match the
given criteria and appear earlier in the document.
@@ -994,9 +1177,9 @@ class PageElement(object):
def find_parent(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = None,
**kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
) -> _AtMostOneTag:
"""Find the closest parent of this PageElement that matches the given
criteria.
@@ -1024,11 +1207,11 @@ class PageElement(object):
def find_parents(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = None,
limit: Optional[int] = None,
_stacklevel: int = 2,
**kwargs: _StrainableAttribute,
) -> _QueryResults:
) -> _SomeTags:
"""Find all parents of this `PageElement` that match the given criteria.
All find_* methods take a common set of arguments. See the online
@@ -1041,9 +1224,11 @@ class PageElement(object):
:kwargs: Additional filters on attribute values.
"""
iterator = self.parents
return self._find_all(
# Only Tags can have children, so this ResultSet will contain
# nothing but Tags.
return cast(ResultSet[Tag], self._find_all(
name, attrs, None, limit, iterator, _stacklevel=_stacklevel + 1, **kwargs
)
))
findParents = _deprecated_function_alias("findParents", "find_parents", "4.0.0")
fetchParents = _deprecated_function_alias("fetchParents", "find_parents", "3.0.0")
@@ -1068,7 +1253,7 @@ class PageElement(object):
# specific here.
method: Callable,
name: _FindMethodName,
attrs: _StrainableAttributes,
attrs: Optional[_StrainableAttributes],
string: Optional[_StrainableString],
**kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
@@ -1081,7 +1266,7 @@ class PageElement(object):
def _find_all(
self,
name: _FindMethodName,
attrs: _StrainableAttributes,
attrs: Optional[_StrainableAttributes],
string: Optional[_StrainableString],
limit: Optional[int],
generator: Iterator[PageElement],
@@ -1120,7 +1305,7 @@ class PageElement(object):
if string is None and not limit and not attrs and not kwargs:
if name is True or name is None:
# Optimization to find all tags.
result = (element for element in generator if isinstance(element, Tag))
result = [element for element in generator if isinstance(element, Tag)]
return ResultSet(matcher, result)
elif isinstance(name, str):
# Optimization to find all tags with a given name.
@@ -2239,22 +2424,63 @@ class Tag(PageElement):
"Deleting tag[key] deletes all 'key' attributes for the tag."
self.attrs.pop(key, None)
@overload
def __call__( # pyright: ignore [reportOverlappingOverload]
self,
name: _FindMethodName = None,
attrs: Optional[_StrainableAttributes] = None,
recursive: bool = True,
string: None = None,
limit: Optional[int] = None,
_stacklevel: int = 2,
**kwargs: _StrainableAttribute,
) -> _SomeTags:
...
@overload
def __call__(
self,
name: Optional[_StrainableElement] = None,
attrs: _StrainableAttributes = {},
name: None = None,
attrs: None = None,
recursive: bool = True,
string: _StrainableString = "",
limit: Optional[int] = None,
_stacklevel: int = 2,
**kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
...
def __call__(
self,
name: _FindMethodName = None,
attrs: Optional[_StrainableAttributes] = None,
recursive: bool = True,
string: Optional[_StrainableString] = None,
limit: Optional[int] = None,
_stacklevel: int = 2,
**kwargs: _StrainableAttribute,
) -> _QueryResults:
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
"""Calling a Tag like a function is the same as calling its
find_all() method. Eg. tag('a') returns a list of all the A tags
found within this tag."""
return self.find_all(
name, attrs, recursive, string, limit, _stacklevel, **kwargs
if string is not None and (name is not None or attrs is not None or kwargs):
# TODO: Using the @overload decorator to express the three ways you
# could get into this path is way too much code for a rarely(?) used
# feature.
return cast(ResultSet[Tag], self.find_all(name, attrs, recursive, string, limit, _stacklevel, **kwargs)) #type: ignore
if string is None:
# If string is None, we're searching for tags.
tags:ResultSet[Tag] = self.find_all(
name, attrs, recursive, None, limit, _stacklevel, **kwargs
)
return tags
# Otherwise, we're searching for strings.
strings:ResultSet[NavigableString] = self.find_all(
None, None, recursive, string, limit, _stacklevel, **kwargs
)
return strings
def __getattr__(self, subtag: str) -> Optional[Tag]:
"""Calling tag.subtag is the same as calling tag.find(name="subtag")"""
@@ -2277,7 +2503,7 @@ class Tag(PageElement):
raise AttributeError(
"'%s' object has no attribute '%s'" % (self.__class__, subtag)
)
return cast(Optional[Tag], result)
return result
def __eq__(self, other: Any) -> bool:
"""Returns true iff this Tag has the same name, the same attributes,
@@ -2707,14 +2933,35 @@ class Tag(PageElement):
# Soup methods
@overload
def find(
self,
name: _FindMethodName = None,
attrs: Optional[_StrainableAttributes] = None,
recursive: bool = True,
string: None=None,
**kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
...
@overload
def find(
self,
name: None=None,
attrs: None=None,
recursive: bool = True,
string: _StrainableString="",
) -> _AtMostOneNavigableString:
...
def find(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = None,
recursive: bool = True,
string: Optional[_StrainableString] = None,
**kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
"""Look in the children of this PageElement and find the first
PageElement that matches the given criteria.
@@ -2727,27 +2974,63 @@ class Tag(PageElement):
recursive search of this Tag's children. Otherwise,
only the direct children will be considered.
:param string: A filter on the `Tag.string` attribute.
:param limit: Stop looking after finding this many results.
:kwargs: Additional filters on attribute values.
"""
r = None
results = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3, **kwargs)
if results:
r = results[0]
return r
if string is not None and (name is not None or attrs is not None or kwargs):
# TODO: Using the @overload decorator to express the three ways you
# could get into this path is way too much code for a rarely(?) used
# feature.
elements = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3, **kwargs) # type:ignore
if elements:
return cast(Tag, elements[0])
elif string is None:
tags = self.find_all(name, attrs, recursive, None, 1, _stacklevel=3, **kwargs)
if tags:
return cast(Tag, tags[0])
else:
strings = self.find_all(None, None, recursive, string, 1, _stacklevel=3, **kwargs)
if strings:
return cast(NavigableString, strings[0])
return None
findChild = _deprecated_function_alias("findChild", "find", "3.0.0")
@overload
def find_all( # pyright: ignore [reportOverlappingOverload]
self,
name: _FindMethodName = None,
attrs: Optional[_StrainableAttributes] = None,
recursive: bool = True,
string: None = None,
limit: Optional[int] = None,
_stacklevel: int = 2,
**kwargs: _StrainableAttribute,
) -> _SomeTags:
...
@overload
def find_all(
self,
name: None = None,
attrs: None = None,
recursive: bool = True,
string: _StrainableString = "",
limit: Optional[int] = None,
_stacklevel: int = 2,
**kwargs: _StrainableAttribute,
) -> _SomeNavigableStrings:
...
def find_all(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = None,
recursive: bool = True,
string: Optional[_StrainableString] = None,
limit: Optional[int] = None,
_stacklevel: int = 2,
**kwargs: _StrainableAttribute,
) -> _QueryResults:
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
"""Look in the children of this `PageElement` and find all
`PageElement` objects that match the given criteria.
@@ -2766,9 +3049,27 @@ class Tag(PageElement):
generator = self.descendants
if not recursive:
generator = self.children
return self._find_all(
name, attrs, string, limit, generator, _stacklevel=_stacklevel + 1, **kwargs
)
_stacklevel += 1
if string is not None and (name is not None or attrs is not None or kwargs):
# TODO: Using the @overload decorator to express the three ways you
# could get into this path is way too much code for a rarely(?) used
# feature.
return cast(ResultSet[Tag],
self._find_all(name, attrs, string, limit, generator,
_stacklevel=_stacklevel, **kwargs)
)
if string is None:
# If string is None, we're searching for tags.
return cast(ResultSet[Tag], self._find_all(
name, attrs, None, limit, generator, _stacklevel=_stacklevel, **kwargs
))
# Otherwise, we're searching for strings.
return cast(ResultSet[NavigableString], self._find_all(
None, None, string, limit, generator, _stacklevel=_stacklevel, **kwargs
))
findAll = _deprecated_function_alias("findAll", "find_all", "4.0.0")
findChildren = _deprecated_function_alias("findChildren", "find_all", "3.0.0")
@@ -2884,26 +3185,39 @@ class Tag(PageElement):
_PageElementT = TypeVar("_PageElementT", bound=PageElement)
class ResultSet(List[_PageElementT], Generic[_PageElementT]):
"""A ResultSet is a list of `PageElement` objects, gathered as the result
class ResultSet(Sequence[_PageElementT], Generic[_PageElementT]):
"""A ResultSet is a sequence of `PageElement` objects, gathered as the result
of matching an :py:class:`ElementFilter` against a parse tree. Basically, a list of
search results.
"""
source: Optional[ElementFilter]
result: Sequence[_PageElementT]
def __init__(
self, source: Optional[ElementFilter], result: Iterable[_PageElementT] = ()
self, source: Optional[ElementFilter], result: Sequence[_PageElementT] = ()
) -> None:
super(ResultSet, self).__init__(result)
self.result = result
self.source = source
def __len__(self) -> int:
return len(self.result)
def __getitem__(self, index):
return self.result[index]
def __getattr__(self, key: str) -> None:
"""Raise a helpful exception to explain a common code fix."""
raise AttributeError(
f"""ResultSet object has no attribute "{key}". You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"""
f"""ResultSet object has no attribute "{key}". You're probably treating a sequence of elements like a single element. Did you call find_all() when you meant to call find()?"""
)
def __eq__(self, other: Any) -> bool:
"""A ResultSet is equal to a list if its results are equal to that list.
A ResultSet is equal to another ResultSet if their results are equal,
even if the results come from different sources.
"""
return bool(self.result == other)
# Now that all the classes used by SoupStrainer have been defined,
# import SoupStrainer itself into this module to preserve the

View File

@@ -136,8 +136,7 @@ class ElementFilter(object):
# If there are no rules at all, don't bother filtering. Let
# anything through.
if self.includes_everything:
for i in generator:
yield i
yield from generator
while True:
try:
i = next(generator)
@@ -175,12 +174,12 @@ class ElementFilter(object):
:param limit: Stop looking after finding this many results.
"""
results: _QueryResults = ResultSet(self)
results = []
for match in self.filter(generator):
results.append(match)
if limit is not None and len(results) >= limit:
break
return results
return ResultSet(self, results)
def allow_tag_creation(
self, nsprefix: Optional[str], name: str, attrs: Optional[_RawAttributeValues]
@@ -379,7 +378,7 @@ class SoupStrainer(ElementFilter):
def __init__(
self,
name: Optional[_StrainableElement] = None,
attrs: Dict[str, _StrainableAttribute] = {},
attrs: Optional[Dict[str, _StrainableAttribute]] = None,
string: Optional[_StrainableString] = None,
**kwargs: _StrainableAttribute,
):
@@ -397,11 +396,13 @@ class SoupStrainer(ElementFilter):
# that matches all Tags, and only Tags.
self.name_rules = [TagNameMatchRule(present=True)]
else:
self.name_rules = cast(
List[TagNameMatchRule], list(self._make_match_rules(name, TagNameMatchRule))
)
self.name_rules = cast(
List[TagNameMatchRule], list(self._make_match_rules(name, TagNameMatchRule))
)
self.attribute_rules = defaultdict(list)
if attrs is None:
attrs = {}
if not isinstance(attrs, dict):
# Passing something other than a dictionary as attrs is
# sugar for matching that thing against the 'class'

View File

@@ -13,6 +13,7 @@ from bs4.element import (
NamespacedAttribute,
ResultSet,
)
from bs4.filter import ElementFilter
class TestNamedspacedAttribute:
def test_name_may_be_none_or_missing(self):
@@ -133,6 +134,33 @@ class TestResultSet:
with pytest.raises(AttributeError) as e:
rs.name
assert (
"""ResultSet object has no attribute "name". You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"""
"""ResultSet object has no attribute "name". You're probably treating a sequence of elements like a single element. Did you call find_all() when you meant to call find()?"""
== str(e.value)
)
def test_len(self):
# The length of a ResultSet is the length of its result sequence.
rs = ResultSet(None, [1,2,3])
assert len(rs) == 3
def test_getitem(self):
# __getitem__ is delegated to the result sequence.
rs = ResultSet(None, [1,2,3])
assert rs[1] == 2
def test_equality(self):
# A ResultSet is equal to a list if its result sequence is equal to that list.
l = [1, 2, 3]
rs1 = ResultSet(None, [1,2,3])
assert l == rs1
assert l != (1,2,3)
rs2 = ResultSet(None, [1,2])
assert l != rs2
# A ResultSet is equal to another ResultSet if their results are equal
assert rs1 == rs1
assert rs1 != rs2
# Even if the results come from two different sources, the ResultSets are equal.
assert ResultSet(ElementFilter(), [1,2,3]) == rs1

View File

@@ -16,7 +16,7 @@ with examples. I show you what the library is good for, how it works,
how to use it, how to make it do what you want, and what to do when it
violates your expectations.
This document covers Beautiful Soup version 4.13.5. The examples in
This document covers Beautiful Soup version 4.14.0. The examples in
this documentation were written for Python 3.8.
You might be looking for the documentation for `Beautiful Soup 3

View File

@@ -81,7 +81,7 @@ include = [
# Scripts.
"/test-all-versions",
"/scripts/*.py",
"/scripts/demonstrate_parser_differences.py",
# Documentation source in various languages.
"/doc*/Makefile",

View File

@@ -0,0 +1,63 @@
# This script demonstrates that the result of a find* method can
# generally be assigned to either Tag, NavigableString, Sequence[Tag],
# or Sequence[NavigableString], depending on usage.
from typing import Optional, Sequence
from bs4 import BeautifulSoup, Tag, NavigableString
soup = BeautifulSoup("<p>", 'html.parser')
tag:Optional[Tag]
string:Optional[NavigableString]
tags:Sequence[Tag]
strings:Sequence[NavigableString]
tag = soup.find()
tag = soup.find(id="a")
string = soup.find(string="b")
tags = soup()
tags = soup(id="a")
strings = soup(string="b")
tags = soup.find_all()
tags = soup.find_all(id="a")
strings = soup.find_all(string="b")
tag = soup.find_next()
tag = soup.find_next(id="a")
string = soup.find_next(string="b")
tags = soup.find_all_next()
tags = soup.find_all_next(id="a")
strings = soup.find_all_next(string="b")
tag = soup.find_next_sibling()
tag = soup.find_next_sibling(id="a")
string = soup.find_next_sibling(string="b")
tags = soup.find_next_siblings()
tags = soup.find_next_siblings(id="a")
strings = soup.find_next_siblings(string="b")
tag = soup.find_previous()
tag = soup.find_previous(id="a")
string = soup.find_previous(string="b")
tags = soup.find_all_previous()
tags = soup.find_all_previous(id="a")
strings = soup.find_all_previous(string="b")
tag = soup.find_previous_sibling()
tag = soup.find_previous_sibling(id="a")
string = soup.find_previous_sibling(string="bold")
tags = soup.find_previous_siblings()
tags = soup.find_previous_siblings(id="a")
strings = soup.find_previous_siblings(string="b")
tag = soup.find_parent()
tag = soup.find_parent(id="a")
tags = soup.find_parents()
tags = soup.find_parents(id="a")
# This code will work, but mypy and pyright will both flag it.
tags = soup.find_all("a", string="b")