mirror of
https://git.launchpad.net/beautifulsoup
synced 2025-10-06 00:12:49 +02:00
Compare commits
36 Commits
70c7473e6c
...
c7a09cafe5
Author | SHA1 | Date | |
---|---|---|---|
|
c7a09cafe5 | ||
|
9a7cad9a6d | ||
|
5fc1a2b87d | ||
|
d2286be163 | ||
|
6ea6df0753 | ||
|
e34c6fc6b9 | ||
|
bf17d951d2 | ||
|
cd548aaed4 | ||
|
f58228ff0b | ||
|
222984beae | ||
|
da76f83d8a | ||
|
a7cb7745f7 | ||
|
f723296e22 | ||
|
f320fa4fce | ||
|
23c15c2fb1 | ||
|
9bae5a1201 | ||
|
4c5cefec13 | ||
|
0e3199db71 | ||
|
d225e021b9 | ||
|
94408c845d | ||
|
7c06afd400 | ||
|
c1c6162581 | ||
|
4f44b052a1 | ||
|
cd24e0084b | ||
|
7f4b643e98 | ||
|
62566d8d48 | ||
|
6ba889aa56 | ||
|
c909c29d12 | ||
|
977ec996fe | ||
|
57e5074727 | ||
|
b06c53ddc1 | ||
|
951992740d | ||
|
1d51b7ce0d | ||
|
41280f027f | ||
|
d17a44f321 | ||
|
4e69cb357c |
95
CHANGELOG
95
CHANGELOG
@@ -1,9 +1,102 @@
|
||||
= Unreleased
|
||||
= 4.14.0 (20250927)
|
||||
|
||||
* This version adds function overloading to the find_* methods to make
|
||||
it easier to write type-safe Python.
|
||||
|
||||
In most cases you can just assign the result of a find() or
|
||||
find_all() call to the type of object you're expecting to get back:
|
||||
a Tag, a NavigableString, a Sequence[Tag], or a
|
||||
Sequence[NavigableString]. It's very rare that you'll have to do a
|
||||
cast or suppress type-checker warnings like you did in previous
|
||||
versions of Beautiful Soup.
|
||||
|
||||
(In fact, the only time you should still have to do this is if you
|
||||
pass both 'string' and one of the other arguments into one of the
|
||||
find* methods, e.g. tag.find("a", string="tag contents").)
|
||||
|
||||
The following code has been verified to pass type checking using
|
||||
mypy, pyright, and the Visual Studio Code IDE. It's available in
|
||||
the source repository as scripts/type_checking_smoke_test.py.
|
||||
|
||||
---
|
||||
from typing import Optional, Sequence
|
||||
from bs4 import BeautifulSoup, Tag, NavigableString
|
||||
soup = BeautifulSoup("<p>", 'html.parser')
|
||||
|
||||
tag:Optional[Tag]
|
||||
string:Optional[NavigableString]
|
||||
tags:Sequence[Tag]
|
||||
strings:Sequence[NavigableString]
|
||||
|
||||
tag = soup.find()
|
||||
tag = soup.find(id="a")
|
||||
string = soup.find(string="b")
|
||||
|
||||
tags = soup()
|
||||
tags = soup(id="a")
|
||||
strings = soup(string="b")
|
||||
|
||||
tags = soup.find_all()
|
||||
tags = soup.find_all(id="a")
|
||||
strings = soup.find_all(string="b")
|
||||
|
||||
tag = soup.find_next()
|
||||
tag = soup.find_next(id="a")
|
||||
string = soup.find_next(string="b")
|
||||
|
||||
tags = soup.find_all_next()
|
||||
tags = soup.find_all_next(id="a")
|
||||
strings = soup.find_all_next(string="b")
|
||||
|
||||
tag = soup.find_next_sibling()
|
||||
tag = soup.find_next_sibling(id="a")
|
||||
string = soup.find_next_sibling(string="b")
|
||||
|
||||
tags = soup.find_next_siblings()
|
||||
tags = soup.find_next_siblings(id="a")
|
||||
strings = soup.find_next_siblings(string="b")
|
||||
|
||||
tag = soup.find_previous()
|
||||
tag = soup.find_previous(id="a")
|
||||
string = soup.find_previous(string="b")
|
||||
|
||||
tags = soup.find_all_previous()
|
||||
tags = soup.find_all_previous(id="a")
|
||||
strings = soup.find_all_previous(string="b")
|
||||
|
||||
tag = soup.find_previous_sibling()
|
||||
tag = soup.find_previous_sibling(id="a")
|
||||
string = soup.find_previous_sibling(string="bold")
|
||||
|
||||
tags = soup.find_previous_siblings()
|
||||
tags = soup.find_previous_siblings(id="a")
|
||||
strings = soup.find_previous_siblings(string="b")
|
||||
|
||||
tag = soup.find_parent()
|
||||
tag = soup.find_parent(id="a")
|
||||
tags = soup.find_parents()
|
||||
tags = soup.find_parents(id="a")
|
||||
|
||||
# This code will work, but mypy and pyright will both flag it.
|
||||
tags = soup.find_all("a", string="b")
|
||||
---
|
||||
|
||||
* The typing for find_parent() and find_parents() was improved without
|
||||
any overloading. Casts should never be necessary, since those
|
||||
methods only ever return Tag and ResultSet[Tag], respectively.
|
||||
|
||||
* ResultSet now inherits from Sequence. This should make it easier to
|
||||
incorporate ResultSet objects into your type system without needing to
|
||||
handle ResultSet specially.
|
||||
|
||||
* Fixed an unhandled exception when creating the string representation of
|
||||
a decomposed element. (The output is not *useful* and you still
|
||||
shouldn't do this, but it won't raise an exception anymore.) [bug=2120300]
|
||||
|
||||
* The default value for the 'attrs' argument in find* methods is now
|
||||
None, not the empty dictionary. This should have no visible effect
|
||||
on anything.
|
||||
|
||||
= 4.13.5 (20250824)
|
||||
|
||||
* Fixed an unhandled exception when parsing invalid markup that contains the { character
|
||||
|
@@ -15,7 +15,7 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||
"""
|
||||
|
||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||
__version__ = "4.13.5"
|
||||
__version__ = "4.14.0"
|
||||
__copyright__ = "Copyright (c) 2004-2025 Leonard Richardson"
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
@@ -198,4 +198,8 @@ _StrainableAttributes: TypeAlias = Dict[str, _StrainableAttribute]
|
||||
#: are available on the objects they're dealing with.
|
||||
_OneElement: TypeAlias = Union["PageElement", "Tag", "NavigableString"]
|
||||
_AtMostOneElement: TypeAlias = Optional[_OneElement]
|
||||
_AtMostOneTag: TypeAlias = Optional["Tag"]
|
||||
_AtMostOneNavigableString: TypeAlias = Optional["NavigableString"]
|
||||
_QueryResults: TypeAlias = "ResultSet[_OneElement]"
|
||||
_SomeTags: TypeAlias = "ResultSet[Tag]"
|
||||
_SomeNavigableStrings: TypeAlias = "ResultSet[NavigableString]"
|
||||
|
@@ -20,6 +20,7 @@ from typing import (
|
||||
cast,
|
||||
Iterable,
|
||||
Iterator,
|
||||
Sequence,
|
||||
Optional,
|
||||
TYPE_CHECKING,
|
||||
)
|
||||
@@ -88,7 +89,7 @@ class CSS(object):
|
||||
ns = self.tag._namespaces
|
||||
return ns
|
||||
|
||||
def _rs(self, results: Iterable[Tag]) -> ResultSet[Tag]:
|
||||
def _rs(self, results: Sequence[Tag]) -> ResultSet[Tag]:
|
||||
"""Normalize a list of results to a py:class:`ResultSet`.
|
||||
|
||||
A py:class:`ResultSet` is more consistent with the rest of
|
||||
|
412
bs4/element.py
412
bs4/element.py
@@ -30,6 +30,7 @@ from typing import (
|
||||
Mapping,
|
||||
Optional,
|
||||
Pattern,
|
||||
Sequence,
|
||||
Set,
|
||||
TYPE_CHECKING,
|
||||
Tuple,
|
||||
@@ -54,6 +55,8 @@ if TYPE_CHECKING:
|
||||
)
|
||||
from bs4._typing import (
|
||||
_AtMostOneElement,
|
||||
_AtMostOneTag,
|
||||
_AtMostOneNavigableString,
|
||||
_AttributeValue,
|
||||
_AttributeValues,
|
||||
_Encoding,
|
||||
@@ -65,6 +68,8 @@ if TYPE_CHECKING:
|
||||
_StrainableAttribute,
|
||||
_StrainableAttributes,
|
||||
_StrainableString,
|
||||
_SomeNavigableStrings,
|
||||
_SomeTags,
|
||||
)
|
||||
|
||||
_OneOrMoreStringTypes: TypeAlias = Union[
|
||||
@@ -746,13 +751,35 @@ class PageElement(object):
|
||||
|
||||
return results
|
||||
|
||||
# For the suppression of this pyright warning, see discussion here:
|
||||
# https://github.com/microsoft/pyright/issues/10929
|
||||
@overload
|
||||
def find_next( # pyright: ignore [reportOverlappingOverload]
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
string: None=None,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _AtMostOneTag:
|
||||
...
|
||||
|
||||
@overload
|
||||
def find_next(
|
||||
self,
|
||||
name: None=None,
|
||||
attrs: None=None,
|
||||
string: _StrainableString="",
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _AtMostOneNavigableString:
|
||||
...
|
||||
|
||||
def find_next(
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: _StrainableAttributes = {},
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
string: Optional[_StrainableString] = None,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _AtMostOneElement:
|
||||
) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
|
||||
"""Find the first PageElement that matches the given criteria and
|
||||
appears later in the document than this PageElement.
|
||||
|
||||
@@ -768,15 +795,39 @@ class PageElement(object):
|
||||
|
||||
findNext = _deprecated_function_alias("findNext", "find_next", "4.0.0")
|
||||
|
||||
@overload
|
||||
def find_all_next( # pyright: ignore [reportOverlappingOverload]
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
string: None = None,
|
||||
limit: Optional[int] = None,
|
||||
_stacklevel: int = 2,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _SomeTags:
|
||||
...
|
||||
|
||||
@overload
|
||||
def find_all_next(
|
||||
self,
|
||||
name: None = None,
|
||||
attrs: None = None,
|
||||
string: _StrainableString = "",
|
||||
limit: Optional[int] = None,
|
||||
_stacklevel: int = 2,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _SomeNavigableStrings:
|
||||
...
|
||||
|
||||
def find_all_next(
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: _StrainableAttributes = {},
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
string: Optional[_StrainableString] = None,
|
||||
limit: Optional[int] = None,
|
||||
_stacklevel: int = 2,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _QueryResults:
|
||||
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
|
||||
"""Find all `PageElement` objects that match the given criteria and
|
||||
appear later in the document than this `PageElement`.
|
||||
|
||||
@@ -802,13 +853,33 @@ class PageElement(object):
|
||||
|
||||
findAllNext = _deprecated_function_alias("findAllNext", "find_all_next", "4.0.0")
|
||||
|
||||
@overload
|
||||
def find_next_sibling( # pyright: ignore [reportOverlappingOverload]
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
string: None=None,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _AtMostOneTag:
|
||||
...
|
||||
|
||||
@overload
|
||||
def find_next_sibling(
|
||||
self,
|
||||
name: None=None,
|
||||
attrs: None=None,
|
||||
string: _StrainableString="",
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _AtMostOneNavigableString:
|
||||
...
|
||||
|
||||
def find_next_sibling(
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: _StrainableAttributes = {},
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
string: Optional[_StrainableString] = None,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _AtMostOneElement:
|
||||
) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
|
||||
"""Find the closest sibling to this PageElement that matches the
|
||||
given criteria and appears later in the document.
|
||||
|
||||
@@ -826,15 +897,39 @@ class PageElement(object):
|
||||
"findNextSibling", "find_next_sibling", "4.0.0"
|
||||
)
|
||||
|
||||
@overload
|
||||
def find_next_siblings( # pyright: ignore [reportOverlappingOverload]
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
string: None = None,
|
||||
limit: Optional[int] = None,
|
||||
_stacklevel: int = 2,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _SomeTags:
|
||||
...
|
||||
|
||||
@overload
|
||||
def find_next_siblings(
|
||||
self,
|
||||
name: None = None,
|
||||
attrs: None = None,
|
||||
string: _StrainableString = "",
|
||||
limit: Optional[int] = None,
|
||||
_stacklevel: int = 2,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _SomeNavigableStrings:
|
||||
...
|
||||
|
||||
def find_next_siblings(
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: _StrainableAttributes = {},
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
string: Optional[_StrainableString] = None,
|
||||
limit: Optional[int] = None,
|
||||
_stacklevel: int = 2,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _QueryResults:
|
||||
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
|
||||
"""Find all siblings of this `PageElement` that match the given criteria
|
||||
and appear later in the document.
|
||||
|
||||
@@ -865,13 +960,33 @@ class PageElement(object):
|
||||
"fetchNextSiblings", "find_next_siblings", "3.0.0"
|
||||
)
|
||||
|
||||
@overload
|
||||
def find_previous( # pyright: ignore [reportOverlappingOverload]
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
string: None=None,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _AtMostOneTag:
|
||||
...
|
||||
|
||||
@overload
|
||||
def find_previous(
|
||||
self,
|
||||
name: None=None,
|
||||
attrs: None=None,
|
||||
string: _StrainableString="",
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _AtMostOneNavigableString:
|
||||
...
|
||||
|
||||
def find_previous(
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: _StrainableAttributes = {},
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
string: Optional[_StrainableString] = None,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _AtMostOneElement:
|
||||
) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
|
||||
"""Look backwards in the document from this `PageElement` and find the
|
||||
first `PageElement` that matches the given criteria.
|
||||
|
||||
@@ -887,15 +1002,39 @@ class PageElement(object):
|
||||
|
||||
findPrevious = _deprecated_function_alias("findPrevious", "find_previous", "3.0.0")
|
||||
|
||||
@overload
|
||||
def find_all_previous( # pyright: ignore [reportOverlappingOverload]
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
string: None = None,
|
||||
limit: Optional[int] = None,
|
||||
_stacklevel: int = 2,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _SomeTags:
|
||||
...
|
||||
|
||||
@overload
|
||||
def find_all_previous(
|
||||
self,
|
||||
name: None = None,
|
||||
attrs: None = None,
|
||||
string: _StrainableString = "",
|
||||
limit: Optional[int] = None,
|
||||
_stacklevel: int = 2,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _SomeNavigableStrings:
|
||||
...
|
||||
|
||||
def find_all_previous(
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: _StrainableAttributes = {},
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
string: Optional[_StrainableString] = None,
|
||||
limit: Optional[int] = None,
|
||||
_stacklevel: int = 2,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _QueryResults:
|
||||
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
|
||||
"""Look backwards in the document from this `PageElement` and find all
|
||||
`PageElement` that match the given criteria.
|
||||
|
||||
@@ -926,13 +1065,33 @@ class PageElement(object):
|
||||
"fetchAllPrevious", "find_all_previous", "3.0.0"
|
||||
)
|
||||
|
||||
@overload
|
||||
def find_previous_sibling( # pyright: ignore [reportOverlappingOverload]
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
string: None=None,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _AtMostOneTag:
|
||||
...
|
||||
|
||||
@overload
|
||||
def find_previous_sibling(
|
||||
self,
|
||||
name: None=None,
|
||||
attrs: None=None,
|
||||
string: _StrainableString="",
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _AtMostOneNavigableString:
|
||||
...
|
||||
|
||||
def find_previous_sibling(
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: _StrainableAttributes = {},
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
string: Optional[_StrainableString] = None,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _AtMostOneElement:
|
||||
) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
|
||||
"""Returns the closest sibling to this `PageElement` that matches the
|
||||
given criteria and appears earlier in the document.
|
||||
|
||||
@@ -952,15 +1111,39 @@ class PageElement(object):
|
||||
"findPreviousSibling", "find_previous_sibling", "4.0.0"
|
||||
)
|
||||
|
||||
@overload
|
||||
def find_previous_siblings( # pyright: ignore [reportOverlappingOverload]
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
string: None = None,
|
||||
limit: Optional[int] = None,
|
||||
_stacklevel: int = 2,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _SomeTags:
|
||||
...
|
||||
|
||||
@overload
|
||||
def find_previous_siblings(
|
||||
self,
|
||||
name: None = None,
|
||||
attrs: None = None,
|
||||
string: _StrainableString = "",
|
||||
limit: Optional[int] = None,
|
||||
_stacklevel: int = 2,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _SomeNavigableStrings:
|
||||
...
|
||||
|
||||
def find_previous_siblings(
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: _StrainableAttributes = {},
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
string: Optional[_StrainableString] = None,
|
||||
limit: Optional[int] = None,
|
||||
_stacklevel: int = 2,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _QueryResults:
|
||||
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
|
||||
"""Returns all siblings to this PageElement that match the
|
||||
given criteria and appear earlier in the document.
|
||||
|
||||
@@ -994,9 +1177,9 @@ class PageElement(object):
|
||||
def find_parent(
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: _StrainableAttributes = {},
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _AtMostOneElement:
|
||||
) -> _AtMostOneTag:
|
||||
"""Find the closest parent of this PageElement that matches the given
|
||||
criteria.
|
||||
|
||||
@@ -1024,11 +1207,11 @@ class PageElement(object):
|
||||
def find_parents(
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: _StrainableAttributes = {},
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
limit: Optional[int] = None,
|
||||
_stacklevel: int = 2,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _QueryResults:
|
||||
) -> _SomeTags:
|
||||
"""Find all parents of this `PageElement` that match the given criteria.
|
||||
|
||||
All find_* methods take a common set of arguments. See the online
|
||||
@@ -1041,9 +1224,11 @@ class PageElement(object):
|
||||
:kwargs: Additional filters on attribute values.
|
||||
"""
|
||||
iterator = self.parents
|
||||
return self._find_all(
|
||||
# Only Tags can have children, so this ResultSet will contain
|
||||
# nothing but Tags.
|
||||
return cast(ResultSet[Tag], self._find_all(
|
||||
name, attrs, None, limit, iterator, _stacklevel=_stacklevel + 1, **kwargs
|
||||
)
|
||||
))
|
||||
|
||||
findParents = _deprecated_function_alias("findParents", "find_parents", "4.0.0")
|
||||
fetchParents = _deprecated_function_alias("fetchParents", "find_parents", "3.0.0")
|
||||
@@ -1068,7 +1253,7 @@ class PageElement(object):
|
||||
# specific here.
|
||||
method: Callable,
|
||||
name: _FindMethodName,
|
||||
attrs: _StrainableAttributes,
|
||||
attrs: Optional[_StrainableAttributes],
|
||||
string: Optional[_StrainableString],
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _AtMostOneElement:
|
||||
@@ -1081,7 +1266,7 @@ class PageElement(object):
|
||||
def _find_all(
|
||||
self,
|
||||
name: _FindMethodName,
|
||||
attrs: _StrainableAttributes,
|
||||
attrs: Optional[_StrainableAttributes],
|
||||
string: Optional[_StrainableString],
|
||||
limit: Optional[int],
|
||||
generator: Iterator[PageElement],
|
||||
@@ -1120,7 +1305,7 @@ class PageElement(object):
|
||||
if string is None and not limit and not attrs and not kwargs:
|
||||
if name is True or name is None:
|
||||
# Optimization to find all tags.
|
||||
result = (element for element in generator if isinstance(element, Tag))
|
||||
result = [element for element in generator if isinstance(element, Tag)]
|
||||
return ResultSet(matcher, result)
|
||||
elif isinstance(name, str):
|
||||
# Optimization to find all tags with a given name.
|
||||
@@ -2239,22 +2424,63 @@ class Tag(PageElement):
|
||||
"Deleting tag[key] deletes all 'key' attributes for the tag."
|
||||
self.attrs.pop(key, None)
|
||||
|
||||
@overload
|
||||
def __call__( # pyright: ignore [reportOverlappingOverload]
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
recursive: bool = True,
|
||||
string: None = None,
|
||||
limit: Optional[int] = None,
|
||||
_stacklevel: int = 2,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _SomeTags:
|
||||
...
|
||||
|
||||
@overload
|
||||
def __call__(
|
||||
self,
|
||||
name: Optional[_StrainableElement] = None,
|
||||
attrs: _StrainableAttributes = {},
|
||||
name: None = None,
|
||||
attrs: None = None,
|
||||
recursive: bool = True,
|
||||
string: _StrainableString = "",
|
||||
limit: Optional[int] = None,
|
||||
_stacklevel: int = 2,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _SomeNavigableStrings:
|
||||
...
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
recursive: bool = True,
|
||||
string: Optional[_StrainableString] = None,
|
||||
limit: Optional[int] = None,
|
||||
_stacklevel: int = 2,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _QueryResults:
|
||||
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
|
||||
"""Calling a Tag like a function is the same as calling its
|
||||
find_all() method. Eg. tag('a') returns a list of all the A tags
|
||||
found within this tag."""
|
||||
return self.find_all(
|
||||
name, attrs, recursive, string, limit, _stacklevel, **kwargs
|
||||
if string is not None and (name is not None or attrs is not None or kwargs):
|
||||
# TODO: Using the @overload decorator to express the three ways you
|
||||
# could get into this path is way too much code for a rarely(?) used
|
||||
# feature.
|
||||
return cast(ResultSet[Tag], self.find_all(name, attrs, recursive, string, limit, _stacklevel, **kwargs)) #type: ignore
|
||||
|
||||
if string is None:
|
||||
# If string is None, we're searching for tags.
|
||||
tags:ResultSet[Tag] = self.find_all(
|
||||
name, attrs, recursive, None, limit, _stacklevel, **kwargs
|
||||
)
|
||||
return tags
|
||||
|
||||
# Otherwise, we're searching for strings.
|
||||
strings:ResultSet[NavigableString] = self.find_all(
|
||||
None, None, recursive, string, limit, _stacklevel, **kwargs
|
||||
)
|
||||
return strings
|
||||
|
||||
def __getattr__(self, subtag: str) -> Optional[Tag]:
|
||||
"""Calling tag.subtag is the same as calling tag.find(name="subtag")"""
|
||||
@@ -2277,7 +2503,7 @@ class Tag(PageElement):
|
||||
raise AttributeError(
|
||||
"'%s' object has no attribute '%s'" % (self.__class__, subtag)
|
||||
)
|
||||
return cast(Optional[Tag], result)
|
||||
return result
|
||||
|
||||
def __eq__(self, other: Any) -> bool:
|
||||
"""Returns true iff this Tag has the same name, the same attributes,
|
||||
@@ -2707,14 +2933,35 @@ class Tag(PageElement):
|
||||
|
||||
# Soup methods
|
||||
|
||||
@overload
|
||||
def find(
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
recursive: bool = True,
|
||||
string: None=None,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _AtMostOneTag:
|
||||
...
|
||||
|
||||
@overload
|
||||
def find(
|
||||
self,
|
||||
name: None=None,
|
||||
attrs: None=None,
|
||||
recursive: bool = True,
|
||||
string: _StrainableString="",
|
||||
) -> _AtMostOneNavigableString:
|
||||
...
|
||||
|
||||
def find(
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: _StrainableAttributes = {},
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
recursive: bool = True,
|
||||
string: Optional[_StrainableString] = None,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _AtMostOneElement:
|
||||
) -> Union[_AtMostOneTag,_AtMostOneNavigableString,_AtMostOneElement]:
|
||||
"""Look in the children of this PageElement and find the first
|
||||
PageElement that matches the given criteria.
|
||||
|
||||
@@ -2727,27 +2974,63 @@ class Tag(PageElement):
|
||||
recursive search of this Tag's children. Otherwise,
|
||||
only the direct children will be considered.
|
||||
:param string: A filter on the `Tag.string` attribute.
|
||||
:param limit: Stop looking after finding this many results.
|
||||
:kwargs: Additional filters on attribute values.
|
||||
"""
|
||||
r = None
|
||||
results = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3, **kwargs)
|
||||
if results:
|
||||
r = results[0]
|
||||
return r
|
||||
if string is not None and (name is not None or attrs is not None or kwargs):
|
||||
# TODO: Using the @overload decorator to express the three ways you
|
||||
# could get into this path is way too much code for a rarely(?) used
|
||||
# feature.
|
||||
elements = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3, **kwargs) # type:ignore
|
||||
if elements:
|
||||
return cast(Tag, elements[0])
|
||||
elif string is None:
|
||||
tags = self.find_all(name, attrs, recursive, None, 1, _stacklevel=3, **kwargs)
|
||||
if tags:
|
||||
return cast(Tag, tags[0])
|
||||
else:
|
||||
strings = self.find_all(None, None, recursive, string, 1, _stacklevel=3, **kwargs)
|
||||
if strings:
|
||||
return cast(NavigableString, strings[0])
|
||||
return None
|
||||
|
||||
findChild = _deprecated_function_alias("findChild", "find", "3.0.0")
|
||||
|
||||
@overload
|
||||
def find_all( # pyright: ignore [reportOverlappingOverload]
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
recursive: bool = True,
|
||||
string: None = None,
|
||||
limit: Optional[int] = None,
|
||||
_stacklevel: int = 2,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _SomeTags:
|
||||
...
|
||||
|
||||
@overload
|
||||
def find_all(
|
||||
self,
|
||||
name: None = None,
|
||||
attrs: None = None,
|
||||
recursive: bool = True,
|
||||
string: _StrainableString = "",
|
||||
limit: Optional[int] = None,
|
||||
_stacklevel: int = 2,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _SomeNavigableStrings:
|
||||
...
|
||||
|
||||
def find_all(
|
||||
self,
|
||||
name: _FindMethodName = None,
|
||||
attrs: _StrainableAttributes = {},
|
||||
attrs: Optional[_StrainableAttributes] = None,
|
||||
recursive: bool = True,
|
||||
string: Optional[_StrainableString] = None,
|
||||
limit: Optional[int] = None,
|
||||
_stacklevel: int = 2,
|
||||
**kwargs: _StrainableAttribute,
|
||||
) -> _QueryResults:
|
||||
) -> Union[_SomeTags,_SomeNavigableStrings,_QueryResults]:
|
||||
"""Look in the children of this `PageElement` and find all
|
||||
`PageElement` objects that match the given criteria.
|
||||
|
||||
@@ -2766,9 +3049,27 @@ class Tag(PageElement):
|
||||
generator = self.descendants
|
||||
if not recursive:
|
||||
generator = self.children
|
||||
return self._find_all(
|
||||
name, attrs, string, limit, generator, _stacklevel=_stacklevel + 1, **kwargs
|
||||
)
|
||||
_stacklevel += 1
|
||||
|
||||
if string is not None and (name is not None or attrs is not None or kwargs):
|
||||
# TODO: Using the @overload decorator to express the three ways you
|
||||
# could get into this path is way too much code for a rarely(?) used
|
||||
# feature.
|
||||
return cast(ResultSet[Tag],
|
||||
self._find_all(name, attrs, string, limit, generator,
|
||||
_stacklevel=_stacklevel, **kwargs)
|
||||
)
|
||||
|
||||
if string is None:
|
||||
# If string is None, we're searching for tags.
|
||||
return cast(ResultSet[Tag], self._find_all(
|
||||
name, attrs, None, limit, generator, _stacklevel=_stacklevel, **kwargs
|
||||
))
|
||||
|
||||
# Otherwise, we're searching for strings.
|
||||
return cast(ResultSet[NavigableString], self._find_all(
|
||||
None, None, string, limit, generator, _stacklevel=_stacklevel, **kwargs
|
||||
))
|
||||
|
||||
findAll = _deprecated_function_alias("findAll", "find_all", "4.0.0")
|
||||
findChildren = _deprecated_function_alias("findChildren", "find_all", "3.0.0")
|
||||
@@ -2884,26 +3185,39 @@ class Tag(PageElement):
|
||||
_PageElementT = TypeVar("_PageElementT", bound=PageElement)
|
||||
|
||||
|
||||
class ResultSet(List[_PageElementT], Generic[_PageElementT]):
|
||||
"""A ResultSet is a list of `PageElement` objects, gathered as the result
|
||||
class ResultSet(Sequence[_PageElementT], Generic[_PageElementT]):
|
||||
"""A ResultSet is a sequence of `PageElement` objects, gathered as the result
|
||||
of matching an :py:class:`ElementFilter` against a parse tree. Basically, a list of
|
||||
search results.
|
||||
"""
|
||||
|
||||
source: Optional[ElementFilter]
|
||||
result: Sequence[_PageElementT]
|
||||
|
||||
def __init__(
|
||||
self, source: Optional[ElementFilter], result: Iterable[_PageElementT] = ()
|
||||
self, source: Optional[ElementFilter], result: Sequence[_PageElementT] = ()
|
||||
) -> None:
|
||||
super(ResultSet, self).__init__(result)
|
||||
self.result = result
|
||||
self.source = source
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self.result)
|
||||
|
||||
def __getitem__(self, index):
|
||||
return self.result[index]
|
||||
|
||||
def __getattr__(self, key: str) -> None:
|
||||
"""Raise a helpful exception to explain a common code fix."""
|
||||
raise AttributeError(
|
||||
f"""ResultSet object has no attribute "{key}". You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"""
|
||||
f"""ResultSet object has no attribute "{key}". You're probably treating a sequence of elements like a single element. Did you call find_all() when you meant to call find()?"""
|
||||
)
|
||||
|
||||
def __eq__(self, other: Any) -> bool:
|
||||
"""A ResultSet is equal to a list if its results are equal to that list.
|
||||
A ResultSet is equal to another ResultSet if their results are equal,
|
||||
even if the results come from different sources.
|
||||
"""
|
||||
return bool(self.result == other)
|
||||
|
||||
# Now that all the classes used by SoupStrainer have been defined,
|
||||
# import SoupStrainer itself into this module to preserve the
|
||||
|
@@ -136,8 +136,7 @@ class ElementFilter(object):
|
||||
# If there are no rules at all, don't bother filtering. Let
|
||||
# anything through.
|
||||
if self.includes_everything:
|
||||
for i in generator:
|
||||
yield i
|
||||
yield from generator
|
||||
while True:
|
||||
try:
|
||||
i = next(generator)
|
||||
@@ -175,12 +174,12 @@ class ElementFilter(object):
|
||||
|
||||
:param limit: Stop looking after finding this many results.
|
||||
"""
|
||||
results: _QueryResults = ResultSet(self)
|
||||
results = []
|
||||
for match in self.filter(generator):
|
||||
results.append(match)
|
||||
if limit is not None and len(results) >= limit:
|
||||
break
|
||||
return results
|
||||
return ResultSet(self, results)
|
||||
|
||||
def allow_tag_creation(
|
||||
self, nsprefix: Optional[str], name: str, attrs: Optional[_RawAttributeValues]
|
||||
@@ -379,7 +378,7 @@ class SoupStrainer(ElementFilter):
|
||||
def __init__(
|
||||
self,
|
||||
name: Optional[_StrainableElement] = None,
|
||||
attrs: Dict[str, _StrainableAttribute] = {},
|
||||
attrs: Optional[Dict[str, _StrainableAttribute]] = None,
|
||||
string: Optional[_StrainableString] = None,
|
||||
**kwargs: _StrainableAttribute,
|
||||
):
|
||||
@@ -397,11 +396,13 @@ class SoupStrainer(ElementFilter):
|
||||
# that matches all Tags, and only Tags.
|
||||
self.name_rules = [TagNameMatchRule(present=True)]
|
||||
else:
|
||||
self.name_rules = cast(
|
||||
List[TagNameMatchRule], list(self._make_match_rules(name, TagNameMatchRule))
|
||||
)
|
||||
self.name_rules = cast(
|
||||
List[TagNameMatchRule], list(self._make_match_rules(name, TagNameMatchRule))
|
||||
)
|
||||
self.attribute_rules = defaultdict(list)
|
||||
|
||||
if attrs is None:
|
||||
attrs = {}
|
||||
if not isinstance(attrs, dict):
|
||||
# Passing something other than a dictionary as attrs is
|
||||
# sugar for matching that thing against the 'class'
|
||||
|
@@ -13,6 +13,7 @@ from bs4.element import (
|
||||
NamespacedAttribute,
|
||||
ResultSet,
|
||||
)
|
||||
from bs4.filter import ElementFilter
|
||||
|
||||
class TestNamedspacedAttribute:
|
||||
def test_name_may_be_none_or_missing(self):
|
||||
@@ -133,6 +134,33 @@ class TestResultSet:
|
||||
with pytest.raises(AttributeError) as e:
|
||||
rs.name
|
||||
assert (
|
||||
"""ResultSet object has no attribute "name". You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?"""
|
||||
"""ResultSet object has no attribute "name". You're probably treating a sequence of elements like a single element. Did you call find_all() when you meant to call find()?"""
|
||||
== str(e.value)
|
||||
)
|
||||
|
||||
def test_len(self):
|
||||
# The length of a ResultSet is the length of its result sequence.
|
||||
rs = ResultSet(None, [1,2,3])
|
||||
assert len(rs) == 3
|
||||
|
||||
def test_getitem(self):
|
||||
# __getitem__ is delegated to the result sequence.
|
||||
rs = ResultSet(None, [1,2,3])
|
||||
assert rs[1] == 2
|
||||
|
||||
def test_equality(self):
|
||||
# A ResultSet is equal to a list if its result sequence is equal to that list.
|
||||
l = [1, 2, 3]
|
||||
rs1 = ResultSet(None, [1,2,3])
|
||||
assert l == rs1
|
||||
assert l != (1,2,3)
|
||||
|
||||
rs2 = ResultSet(None, [1,2])
|
||||
assert l != rs2
|
||||
|
||||
# A ResultSet is equal to another ResultSet if their results are equal
|
||||
assert rs1 == rs1
|
||||
assert rs1 != rs2
|
||||
|
||||
# Even if the results come from two different sources, the ResultSets are equal.
|
||||
assert ResultSet(ElementFilter(), [1,2,3]) == rs1
|
||||
|
@@ -16,7 +16,7 @@ with examples. I show you what the library is good for, how it works,
|
||||
how to use it, how to make it do what you want, and what to do when it
|
||||
violates your expectations.
|
||||
|
||||
This document covers Beautiful Soup version 4.13.5. The examples in
|
||||
This document covers Beautiful Soup version 4.14.0. The examples in
|
||||
this documentation were written for Python 3.8.
|
||||
|
||||
You might be looking for the documentation for `Beautiful Soup 3
|
||||
|
@@ -81,7 +81,7 @@ include = [
|
||||
|
||||
# Scripts.
|
||||
"/test-all-versions",
|
||||
"/scripts/*.py",
|
||||
"/scripts/demonstrate_parser_differences.py",
|
||||
|
||||
# Documentation source in various languages.
|
||||
"/doc*/Makefile",
|
||||
|
63
scripts/type_checking_smoke_test.py
Normal file
63
scripts/type_checking_smoke_test.py
Normal file
@@ -0,0 +1,63 @@
|
||||
# This script demonstrates that the result of a find* method can
|
||||
# generally be assigned to either Tag, NavigableString, Sequence[Tag],
|
||||
# or Sequence[NavigableString], depending on usage.
|
||||
from typing import Optional, Sequence
|
||||
from bs4 import BeautifulSoup, Tag, NavigableString
|
||||
soup = BeautifulSoup("<p>", 'html.parser')
|
||||
|
||||
tag:Optional[Tag]
|
||||
string:Optional[NavigableString]
|
||||
tags:Sequence[Tag]
|
||||
strings:Sequence[NavigableString]
|
||||
|
||||
tag = soup.find()
|
||||
tag = soup.find(id="a")
|
||||
string = soup.find(string="b")
|
||||
|
||||
tags = soup()
|
||||
tags = soup(id="a")
|
||||
strings = soup(string="b")
|
||||
|
||||
tags = soup.find_all()
|
||||
tags = soup.find_all(id="a")
|
||||
strings = soup.find_all(string="b")
|
||||
|
||||
tag = soup.find_next()
|
||||
tag = soup.find_next(id="a")
|
||||
string = soup.find_next(string="b")
|
||||
|
||||
tags = soup.find_all_next()
|
||||
tags = soup.find_all_next(id="a")
|
||||
strings = soup.find_all_next(string="b")
|
||||
|
||||
tag = soup.find_next_sibling()
|
||||
tag = soup.find_next_sibling(id="a")
|
||||
string = soup.find_next_sibling(string="b")
|
||||
|
||||
tags = soup.find_next_siblings()
|
||||
tags = soup.find_next_siblings(id="a")
|
||||
strings = soup.find_next_siblings(string="b")
|
||||
|
||||
tag = soup.find_previous()
|
||||
tag = soup.find_previous(id="a")
|
||||
string = soup.find_previous(string="b")
|
||||
|
||||
tags = soup.find_all_previous()
|
||||
tags = soup.find_all_previous(id="a")
|
||||
strings = soup.find_all_previous(string="b")
|
||||
|
||||
tag = soup.find_previous_sibling()
|
||||
tag = soup.find_previous_sibling(id="a")
|
||||
string = soup.find_previous_sibling(string="bold")
|
||||
|
||||
tags = soup.find_previous_siblings()
|
||||
tags = soup.find_previous_siblings(id="a")
|
||||
strings = soup.find_previous_siblings(string="b")
|
||||
|
||||
tag = soup.find_parent()
|
||||
tag = soup.find_parent(id="a")
|
||||
tags = soup.find_parents()
|
||||
tags = soup.find_parents(id="a")
|
||||
|
||||
# This code will work, but mypy and pyright will both flag it.
|
||||
tags = soup.find_all("a", string="b")
|
Reference in New Issue
Block a user