1
1
mirror of https://git.launchpad.net/beautifulsoup synced 2025-10-06 00:12:49 +02:00

Compare commits

...

5 Commits

Author SHA1 Message Date
Leonard Richardson
0d9dcfc09b Fixed some doc references. 2025-08-23 12:58:46 -04:00
Leonard Richardson
540ac603b0 Candidate implementation with doc changes. 2025-08-23 12:56:01 -04:00
Leonard Richardson
6752da5b7a Candidate implementation with doc changes. 2025-08-23 12:51:33 -04:00
Leonard Richardson
ad14c8783a attrs is always optional. 2025-08-10 18:20:06 -04:00
Leonard Richardson
f4137ba3f9 attrs is always optional. 2025-08-10 18:15:59 -04:00
4 changed files with 319 additions and 109 deletions

View File

@@ -199,6 +199,7 @@ _StrainableAttributes: TypeAlias = Dict[str, _StrainableAttribute]
#: are available on the objects they're dealing with.
_OneElement: TypeAlias = Union["PageElement", "Tag", "NavigableString"]
_AtMostOneElement: TypeAlias = Optional[_OneElement]
_AtMostOneTag: TypeAlias = Optional["Tag"]
_QueryResults: TypeAlias = "ResultSet[_OneElement]"
_TagOrGenerator: TypeAlias = Union["Tag", Iterator["PageElement"]]

View File

@@ -54,6 +54,7 @@ if TYPE_CHECKING:
)
from bs4._typing import (
_AtMostOneElement,
_AtMostOneTag,
_AttributeValue,
_AttributeValues,
_Encoding,
@@ -748,7 +749,7 @@ class PageElement(object):
def find_next(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = {},
string: Optional[_StrainableString] = None,
**kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
@@ -770,7 +771,7 @@ class PageElement(object):
def find_all_next(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = {},
string: Optional[_StrainableString] = None,
limit: Optional[int] = None,
_stacklevel: int = 2,
@@ -804,7 +805,7 @@ class PageElement(object):
def find_next_sibling(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = {},
string: Optional[_StrainableString] = None,
**kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
@@ -828,7 +829,7 @@ class PageElement(object):
def find_next_siblings(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = {},
string: Optional[_StrainableString] = None,
limit: Optional[int] = None,
_stacklevel: int = 2,
@@ -867,7 +868,7 @@ class PageElement(object):
def find_previous(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = {},
string: Optional[_StrainableString] = None,
**kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
@@ -889,7 +890,7 @@ class PageElement(object):
def find_all_previous(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = {},
string: Optional[_StrainableString] = None,
limit: Optional[int] = None,
_stacklevel: int = 2,
@@ -928,7 +929,7 @@ class PageElement(object):
def find_previous_sibling(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = {},
string: Optional[_StrainableString] = None,
**kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
@@ -954,7 +955,7 @@ class PageElement(object):
def find_previous_siblings(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = {},
string: Optional[_StrainableString] = None,
limit: Optional[int] = None,
_stacklevel: int = 2,
@@ -993,7 +994,7 @@ class PageElement(object):
def find_parent(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = {},
**kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
"""Find the closest parent of this PageElement that matches the given
@@ -1023,7 +1024,7 @@ class PageElement(object):
def find_parents(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = {},
limit: Optional[int] = None,
_stacklevel: int = 2,
**kwargs: _StrainableAttribute,
@@ -1067,7 +1068,7 @@ class PageElement(object):
# specific here.
method: Callable,
name: _FindMethodName,
attrs: _StrainableAttributes,
attrs: Optional[_StrainableAttributes],
string: Optional[_StrainableString],
**kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
@@ -1080,7 +1081,7 @@ class PageElement(object):
def _find_all(
self,
name: _FindMethodName,
attrs: _StrainableAttributes,
attrs: Optional[_StrainableAttributes],
string: Optional[_StrainableString],
limit: Optional[int],
generator: Iterator[PageElement],
@@ -1113,7 +1114,7 @@ class PageElement(object):
if isinstance(name, ElementFilter):
matcher = name
else:
matcher = SoupStrainer(name, attrs, string, **kwargs)
matcher = SoupStrainer(name, attrs or {}, string, **kwargs)
result: Iterable[_OneElement]
if string is None and not limit and not attrs and not kwargs:
@@ -2241,7 +2242,7 @@ class Tag(PageElement):
def __call__(
self,
name: Optional[_StrainableElement] = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = {},
recursive: bool = True,
string: Optional[_StrainableString] = None,
limit: Optional[int] = None,
@@ -2706,10 +2707,32 @@ class Tag(PageElement):
# Soup methods
@overload
def find(
self,
name: _FindMethodName = None,
attrs: Optional[_StrainableAttributes] = None,
recursive: bool = True,
string: None=None,
**kwargs: _StrainableAttribute,
) -> _AtMostOneTag:
...
@overload
def find(
self,
name: _FindMethodName = None,
attrs: Optional[_StrainableAttributes] = None,
recursive: bool = True,
string: _StrainableString="",
**kwargs: _StrainableAttribute,
) -> _AtMostOneElement:
...
def find(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = {},
recursive: bool = True,
string: Optional[_StrainableString] = None,
**kwargs: _StrainableAttribute,
@@ -2740,7 +2763,7 @@ class Tag(PageElement):
def find_all(
self,
name: _FindMethodName = None,
attrs: _StrainableAttributes = {},
attrs: Optional[_StrainableAttributes] = {},
recursive: bool = True,
string: Optional[_StrainableString] = None,
limit: Optional[int] = None,
@@ -2772,6 +2795,22 @@ class Tag(PageElement):
findAll = _deprecated_function_alias("findAll", "find_all", "4.0.0")
findChildren = _deprecated_function_alias("findChildren", "find_all", "3.0.0")
def find_tag(self, filter_:ElementFilter) -> Optional[Tag]:
"""Find the first `Tag` that matches the given `ElementFilter`.

This is a thin wrapper: the search itself is performed by
``filter_.find_tag()``, which is handed this object to search.

:param filter_: The `ElementFilter` that decides what counts as a match.
:return: Whatever ``filter_.find_tag(self)`` returns; per the
annotation, a `Tag` or None.
"""
return filter_.find_tag(self)
def find_string(self, filter_:ElementFilter) -> Optional[NavigableString]:
"""Find the first `NavigableString` that matches the given `ElementFilter`.

This is a thin wrapper: the search itself is performed by
``filter_.find_string()``, which is handed this object to search.

:param filter_: The `ElementFilter` that decides what counts as a match.
:return: Whatever ``filter_.find_string(self)`` returns; per the
annotation, a `NavigableString` or None.
"""
return filter_.find_string(self)
def find_tags(self, filter_:ElementFilter) -> ResultSet[Tag]:
"""Find all `Tag`s that match the given `ElementFilter`.

This is a thin wrapper: the search itself is performed by
``filter_.find_tags()``, which is handed this object to search.
No limit is passed along, so the filter's own default applies.

:param filter_: The `ElementFilter` that decides what counts as a match.
:return: Whatever ``filter_.find_tags(self)`` returns; per the
annotation, a `ResultSet` of `Tag` objects.
"""
return filter_.find_tags(self)
def find_strings(self, filter_:ElementFilter) -> ResultSet[NavigableString]:
"""Find all `NavigableString`s that match the given `ElementFilter`.

This is a thin wrapper: the search itself is performed by
``filter_.find_strings()``, which is handed this object to search.
No limit is passed along, so the filter's own default applies.

:param filter_: The `ElementFilter` that decides what counts as a match.
:return: Whatever ``filter_.find_strings(self)`` returns; per the
annotation, a `ResultSet` of `NavigableString` objects.
"""
return filter_.find_strings(self)
# Generator methods
@property
def children(self) -> Iterator[PageElement]:

View File

@@ -140,8 +140,7 @@ class ElementFilter(object):
# If there are no rules at all, don't bother filtering. Let
# anything through.
if self.includes_everything:
for i in generator:
yield i
yield from generator
while True:
try:
i = next(generator)
@@ -160,7 +159,6 @@ class ElementFilter(object):
:param generator: A way of iterating over `PageElement`
objects.
"""
for match in self.filter(generator):
return match
@@ -170,9 +168,10 @@ class ElementFilter(object):
"""Like ElementFilter.find(), but guaranteed to return either a Tag or None.
"""
# NOTE: For this and the other type-safe find_* methods, we
# can't just call out to the non-type-safe method. That method
# might return an object of the wrong type, or hit its limit
# by counting objects that we wouldn't count.
# can't just call out to the non-type-safe method (find(), in
# this case). That method might return an object of the wrong
# type, or hit its limit by counting objects that we wouldn't
# count.
for match in self.filter(generator):
if isinstance(match, Tag):
return match
@@ -209,7 +208,7 @@ class ElementFilter(object):
break
return results
def find_all_tags(self, generator: _TagOrGenerator, limit: Optional[int] = None
def find_tags(self, generator: _TagOrGenerator, limit: Optional[int] = None
) -> ResultSet[Tag]:
"""Like ElementFilter.find_all(), but guaranteed to only match Tag objects.
"""
@@ -221,7 +220,7 @@ class ElementFilter(object):
break
return results
def find_all_strings(self, generator: _TagOrGenerator, limit: Optional[int] = None
def find_strings(self, generator: _TagOrGenerator, limit: Optional[int] = None
) -> ResultSet[NavigableString]:
"""Like ElementFilter.find_all(), but guaranteed to only match NavigableString objects.
"""

View File

@@ -16,7 +16,7 @@ with examples. I show you what the library is good for, how it works,
how to use it, how to make it do what you want, and what to do when it
violates your expectations.
This document covers Beautiful Soup version 4.13.4. The examples in
This document covers Beautiful Soup version 4.14.0. The examples in
this documentation were written for Python 3.8.
You might be looking for the documentation for `Beautiful Soup 3
@@ -64,7 +64,7 @@ Quick Start
Here's an HTML document I'll be using as an example throughout this
document. It's part of a story from *Alice in Wonderland*::
html_doc = """<html><head><title>The Dormouse's story</title></head>
sisters_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
@@ -82,7 +82,7 @@ Running the "three sisters" document through Beautiful Soup gives us a
data structure::
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')
soup = BeautifulSoup(sisters_doc, 'html.parser')
print(soup.prettify())
# <html>
@@ -479,7 +479,7 @@ Unicode string. If you don't, your string will carry around a
reference to the entire Beautiful Soup parse tree, even when you're
done using Beautiful Soup. This is a big waste of memory.
.. BeautifulSoup
.. py:class:: BeautifulSoup
---------------------------
@@ -509,6 +509,24 @@ so it's been given the special ``.name`` "[document]"::
soup.name
# '[document]'
.. py:class:: PageElement
-------------------------
You may sometimes see a reference to a class called
:py:class:`PageElement`, especially if you are :ref:`running a type
checker <Type-safe programming>` against your Beautiful Soup
code.
:py:class:`PageElement` is the base class of both
:py:class:`Tag` and :py:class:`NavigableString` (and
:py:class:`BeautifulSoup` itself). Everything that originally came from an XML or HTML document is a
:py:class:`PageElement`. The class has some methods and attributes that are common to both
:py:class:`Tag` and :py:class:`NavigableString`, like
:ref:`.parent <.parent>`. But most of the time, if you
have a :py:class:`PageElement`, you need to figure out whether it's a
:py:class:`Tag` or a :py:class:`NavigableString` before you can use it.
Special strings
---------------
@@ -603,7 +621,7 @@ Navigating the tree
Here's the "Three sisters" HTML document again::
html_doc = """
sisters_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
@@ -618,7 +636,7 @@ Here's the "Three sisters" HTML document again::
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')
soup = BeautifulSoup(sisters_doc, 'html.parser')
I'll use this as an example to show you how to move from one part of
a document to another.
@@ -1089,7 +1107,7 @@ them briefly.
Once again, I'll be using the "three sisters" document as an example::
html_doc = """
sisters_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
@@ -1104,7 +1122,7 @@ Once again, I'll be using the "three sisters" document as an example::
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')
soup = BeautifulSoup(sisters_doc, 'html.parser')
By passing in a filter to a method like ``find_all()``, you can
zoom in on the parts of the document you're interested in.
@@ -1242,6 +1260,8 @@ code finds all the <a> tags *and* all the <b> tags::
Now we're ready to look at the search methods in detail.
.. _find_all:
``find_all()``
--------------
@@ -1557,6 +1577,8 @@ These two lines are also equivalent::
soup.title.find_all(string=True)
soup.title(string=True)
.. _find:
``find()``
----------
@@ -3090,35 +3112,195 @@ You can use :py:meth:`Tag.copy_self` to create a copy of a
*(Tag.copy_self() is introduced in Beautiful Soup 4.13.0.)*
.. _Type-safe programming:
Low-level search interface
==========================
Type-safe programming
=====================
Almost everyone who uses Beautiful Soup to extract information from a
document can get what they need using the methods described in
`Searching the tree`_. However, there's a lower-level interface that
lets you define any matching behavior you want. Behind the scenes, the
parts of the Beautiful Soup API that most people use--``find_all()``
and the like—are actually using this low-level interface, and you
can use it directly.
If you're trying to write type-safe Python using a type checker,
you'll probably find yourself frustrated by the methods described in
`Searching the tree`_. When you call a method like ``find_all``,
*you* know whether you ought to be getting ``Tag`` objects or
``NavigableString`` objects in return, but your type checker doesn't
necessarily know. These methods just have too many possible
combinations of inputs and outputs.
*(Access to the low-level search interface is a new feature in
Beautiful Soup 4.13.0.)*
Because of this, most code that uses Beautiful Soup will generate
errors like these when run through a type checker like ``pyright`` or
``mypy``:
* ``error: Incompatible types in assignment (expression has type "ResultSet[PageElement | Tag | NavigableString]", variable has type "ResultSet[Tag]") [assignment]``
* ``Type "_QueryResults" is not assignable to declared type "ResultSet[Tag]"``
If you're using Beautiful Soup to write a quick script, this doesn't
matter. But if you're using Beautiful Soup in production code and
trying to be type-safe, it can be annoying to cast the results of
every :ref:`find <find>` or :ref:`find_all <find_all>` call to the appropriate data type.
Fortunately, you can use the :py:class:`SoupStrainer` class to write
type-safe code fairly easily.
*(Although the SoupStrainer class has been part of Beautiful
Soup for many years, using it as described below to do type-safe
programming requires new features that were introduced in Beautiful
Soup 4.14.0.)*
.. py:class:: SoupStrainer
The :py:class:`SoupStrainer` class encapsulates a set of rules for
finding tags and strings in a document. The :py:class:`SoupStrainer`
constructor takes the same arguments as a typical method from
`Searching the tree`_: :ref:`name <name>`, :ref:`attrs <attrs>`,
:ref:`string <string>`, and :ref:`**kwargs <kwargs>`. Here are four
example :py:class:`SoupStrainer` objects::
from bs4 import SoupStrainer
only_a_tags = SoupStrainer("a")
only_tags_with_id_link2 = SoupStrainer(id="link2")
only_a_tags_with_id_link2 = SoupStrainer(name="a", id="link2")
def is_short_string(string):
return string is not None and len(string) < 10
only_short_strings = SoupStrainer(string=is_short_string)
I'm going to bring back the "three sisters" document again, and we'll
see how :py:class:`SoupStrainer` objects can be used to find parts of
a document in a way that will pass type checking::
from bs4 import BeautifulSoup, Tag, NavigableString, ResultSet
from typing import Optional, Sequence
sisters_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
three_sisters = BeautifulSoup(sisters_doc, "html.parser")
.. py:method:: Tag.find_tag
This method takes a :py:class:`SoupStrainer` (or other
:py:class:`ElementFilter`; see below) and returns either the first
:py:class:`Tag` that matches it, or ``None`` if there is no match.
Since :py:class:`BeautifulSoup` is itself a :py:class:`Tag` object,
you can call all of these methods on the :py:class:`BeautifulSoup`
object itself. This is probably the most common way to use them::
tag:Optional[Tag] = three_sisters.find_tag(only_a_tags)
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
You can also call these methods on any :py:class:`Tag` in the
document, though to be type safe you'll probably need to assert that
the :py:class:`Tag` isn't ``None``::
title:Optional[Tag] = three_sisters.title
assert title is not None
tag = title.find_tag(only_a_tags)
# None
.. py:method:: Tag.find_string
This method takes a :py:class:`SoupStrainer` (or other
:py:class:`ElementFilter`) and returns either the first
:py:class:`NavigableString` that matches it, or ``None`` if there is no
match::
string:Optional[NavigableString] = three_sisters.find_string(only_short_strings)
# '\n'
.. py:method:: Tag.find_tags
This method takes a :py:class:`SoupStrainer` (or other
:py:class:`ElementFilter`) and returns a :py:class:`ResultSet`
containing all of the matching :py:class:`Tag` objects::
tags:Sequence[Tag] = three_sisters.find_tags(only_a_tags)
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
.. py:method:: Tag.find_strings
This method takes a :py:class:`SoupStrainer` (or other
:py:class:`ElementFilter`) and returns a :py:class:`ResultSet`
containing all of the matching :py:class:`NavigableString` objects::
strings:Sequence[NavigableString] = three_sisters.find_strings(only_short_strings)
# ['\n', '\n', '\n', 'Elsie', ',\n ', 'Lacie', ' and\n ', 'Tillie', '\n', '...', '\n']
.. py:method:: SoupStrainer.find_tag(generator)
.. py:method:: SoupStrainer.find_tags(generator)
.. py:method:: SoupStrainer.find_string(generator)
.. py:method:: SoupStrainer.find_strings(generator)
The four :py:class:`Tag` methods mentioned above provide type-safe
equivalents of :py:meth:`Tag.find` and :py:meth:`Tag.find_all`. If you
want to call one of the other ``find*`` methods in a type-safe way,
the :py:class:`SoupStrainer` class itself offers four methods that you
can use, with the same names as the :py:class:`Tag` methods. The twist
is that the :py:class:`SoupStrainer` methods need to take a generator
as an argument. The generator determines which tags and strings will be
considered for matching in the first place.
The generators to use are the ones described in `Navigating the tree`_:
``next_elements``, ``previous_siblings``, ``self_and_parents``, and so
on. Here are some examples of type-safe Python using these generators::
link:Optional[Tag] = three_sisters.a
assert link is not None
only_a_tags.find_tag(link.next_siblings)
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
only_short_strings.find_strings(link.next_elements)
link.find_all_next(only_short_strings)
# ['Elsie', ',\n ', 'Lacie', ' and\n ', 'Tillie', '\n', '...', '\n']
# This type-safe code...
SoupStrainer("title").find_tags(link.previous_elements)
# [<title>The Dormouse's story</title>]
# ...does the same thing as this simpler, but non-type-safe code:
link.find_all_previous("title")
# [<title>The Dormouse's story</title>]
Custom element filtering
------------------------
========================
.. py:class:: ElementFilter
The :py:class:`ElementFilter` class is your entry point to the
low-level interface. To use it, define a function that takes a
When the ``find`` methods and :py:class:`SoupStrainer` aren't enough
to support your complicated logic for locating something in a
document, you can use :py:class:`ElementFilter` instead. This class
lets you completely customize which elements are considered, in which
order, and what it means to "match" an element. In fact,
:py:class:`SoupStrainer` is just a subclass of :py:class:`ElementFilter`,
designed to work like the ``find`` methods that are the core of
Beautiful Soup.
*(The ElementFilter class was introduced in Beautiful Soup
4.13.0.)*
To use :py:class:`ElementFilter`, define a function that takes a
:py:class:`PageElement` object (which could be either a
:py:class:`Tag` or a :py:class:`NavigableString`). The function must
return ``True`` if the element matches your custom criteria, and
``False`` if it doesn't.
This example function looks for content-containing tags and strings,
but skips whitespace-only strings::
This example function looks for both tags and strings, but skips
strings that only contain whitespace::
from bs4 import Tag, NavigableString
def non_whitespace_element_func(tag_or_string):
@@ -3132,16 +3314,26 @@ but skips whitespace-only strings::
(isinstance(tag_or_string, NavigableString) and
tag_or_string.strip() != ""))
Once you have a function, pass it into the :py:class:`ElementFilter` constructor::
Once you have a function that matches what you want to match, pass it
into the :py:class:`ElementFilter` constructor::
from bs4.filter import ElementFilter
non_whitespace_filter = ElementFilter(non_whitespace_element_func)
You can then use this :py:class:`ElementFilter` object as the first
argument to any of the `Searching the tree`_ methods. Whatever
criteria you defined in your function will be used instead of the
default Beautiful Soup match logic::
and only argument to any of the methods described in `Searching the
tree`_ or `Type-safe programming`_. You can also call methods like
:py:meth:`find_string <SoupStrainer.find_string>` or
:py:meth:`find_tags <SoupStrainer.find_tags>` on the
:py:class:`ElementFilter` object itself, though you'll also have to
pass in a generator.
However you use it, Beautiful Soup will use your function instead of its
default match logic. Every potential match will be run through your
function, and the only :py:class:`PageElement` objects returned will
be the ones where your function returned ``True``::
from bs4 import BeautifulSoup
small_doc = """
<p>
@@ -3162,25 +3354,36 @@ default Beautiful Soup match logic::
soup.find("i").find_next_siblings(non_whitespace_filter)
# ['\n and\n ', <u>underline</u>]
Every potential match will be run through your function, and the only
:py:class:`PageElement` objects returned will be the ones where your
function returned ``True``.
soup.find_strings(non_whitespace_filter)
# ['bold', 'italic', '\n and\n ', 'underline']
To summarize the function-based matching behaviors,
soup.find_tags(non_whitespace_filter)
# [<p><b>bold</b><i>italic</i> and <u>underline</u></p>, <b>bold</b>, <i>italic</i>, <u>underline</u>]
* A function passed as the first argument to a search method
(or equivalently, using the ``name`` argument) considers only
:py:class:`Tag` objects.
* A function passed to a search method using the ``string`` argument
considers only :py:class:`NavigableString` objects.
* A function passed to a search method using an :py:class:`ElementFilter`
object considers both :py:class:`Tag` and :py:class:`NavigableString`
objects.
non_whitespace_filter.find_tags(soup.find("b").next_elements)
# [<i>italic</i>, <u>underline</u>]
non_whitespace_filter.find_strings(soup.find("u").previous_siblings)
# ['\n and\n ']
non_whitespace_filter.find_strings(soup.find("u").previous_elements)
# ['\n and\n ', 'italic', 'bold']
To summarize the rules for using custom functions in matches:
* A function passed as the first argument to a ``find`` method
(or passed in using the ``name`` argument) should take a
:py:class:`Tag` object as its single argument.
* A function passed to a ``find`` method using the ``string`` argument
should take a :py:class:`NavigableString` object as its single argument.
* A function used to build an :py:class:`ElementFilter` object should
be prepared for either a :py:class:`Tag` or a
:py:class:`NavigableString` as its single argument.
Custom element iteration
^^^^^^^^^^^^^^^^^^^^^^^^
------------------------
.. py:method:: ElementFilter.filter()
.. py:method:: ElementFilter.filter
By passing an :py:class:`ElementFilter` instance into Beautiful Soup's
tree-searching methods, you can completely customize what it means for
@@ -3190,7 +3393,8 @@ also completely customize what it means for Beautiful Soup to iterate
over the parse tree in the first place.
The :py:meth:`ElementFilter.filter()` method takes a generator that yields
a stream of :py:class:`PageElement` objects. There is no restriction
a stream of :py:class:`PageElement` objects—that is, a mixed list of
:py:class:`Tag` and :py:class:`NavigableString` objects. There is no restriction
on which :py:class:`PageElement` objects show up, how many times they
show up, or in which order. Theoretically, they don't even need to be
from the same :py:class:`BeautifulSoup` document. You can do whatever
@@ -3241,9 +3445,10 @@ Parsing only part of a document
Let's say you want to use Beautiful Soup to look at a document's <a>
tags. It's a waste of time and memory to parse the entire document and
then go over it again looking for <a> tags. It would be much faster to
ignore everything that wasn't an <a> tag in the first place. The
:py:class:`SoupStrainer` class allows you to choose which parts of an incoming
document are parsed. You just create a :py:class:`SoupStrainer` and pass it in
ignore everything that wasn't an <a> tag in the first place.
You can do this by creating a :py:class:`SoupStrainer` class object,
as described in `Type-safe programming`_, and passing it in
to the :py:class:`BeautifulSoup` constructor as the ``parse_only`` argument.
(Note that *this feature won't work if you're using the html5lib parser*.
@@ -3254,42 +3459,10 @@ make it into the parse tree, it'll crash. To avoid confusion, in the
examples below I'll be forcing Beautiful Soup to use Python's
built-in parser.)
.. py:class:: SoupStrainer
Here's what the "three sisters" document looks like when it's parsed
with the :py:class:`SoupStrainer` objects defined back in `Type-safe programming`_::
The :py:class:`SoupStrainer` class takes the same arguments as a typical
method from `Searching the tree`_: :ref:`name <name>`, :ref:`attrs
<attrs>`, :ref:`string <string>`, and :ref:`**kwargs <kwargs>`. Here are
three :py:class:`SoupStrainer` objects::
from bs4 import SoupStrainer
only_a_tags = SoupStrainer("a")
only_tags_with_id_link2 = SoupStrainer(id="link2")
def is_short_string(string):
return string is not None and len(string) < 10
only_short_strings = SoupStrainer(string=is_short_string)
I'm going to bring back the "three sisters" document one more time,
and we'll see what the document looks like when it's parsed with these
three :py:class:`SoupStrainer` objects::
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
print(BeautifulSoup(html_doc, "html.parser", parse_only=only_a_tags).prettify())
print(BeautifulSoup(sisters_doc, "html.parser", parse_only=only_a_tags).prettify())
# <a class="sister" href="http://example.com/elsie" id="link1">
# Elsie
# </a>
@@ -3300,12 +3473,12 @@ three :py:class:`SoupStrainer` objects::
# Tillie
# </a>
print(BeautifulSoup(html_doc, "html.parser", parse_only=only_tags_with_id_link2).prettify())
print(BeautifulSoup(sisters_doc, "html.parser", parse_only=only_tags_with_id_link2).prettify())
# <a class="sister" href="http://example.com/lacie" id="link2">
# Lacie
# </a>
print(BeautifulSoup(html_doc, "html.parser", parse_only=only_short_strings).prettify())
print(BeautifulSoup(sisters_doc, "html.parser", parse_only=only_short_strings).prettify())
# Elsie
# ,
# Lacie
@@ -3419,11 +3592,9 @@ tell Beautiful Soup to instantiate *subclasses* of :py:class:`Tag` or
class MyTag(Tag):
pass
class MyString(NavigableString):
pass
markup = "<div>some text</div>"
soup = BeautifulSoup(markup, 'html.parser')
isinstance(soup.div, MyTag)