2018-12-24 10:36:00 -05:00
# Use of this source code is governed by the MIT license.
__license__ = " MIT "
2016-07-16 11:27:24 -04:00
2011-02-27 16:51:56 -05:00
__all__ = [
2025-01-02 15:21:32 -05:00
" HTML5TreeBuilder " ,
]
2011-02-27 16:51:56 -05:00
2023-04-18 10:40:32 -04:00
from typing import (
2024-01-31 18:23:16 -05:00
Any ,
2024-01-31 16:15:22 -05:00
cast ,
Dict ,
2023-04-18 10:40:32 -04:00
Iterable ,
Optional ,
2024-02-15 10:09:57 -05:00
Sequence ,
2023-04-18 10:40:32 -04:00
TYPE_CHECKING ,
Tuple ,
Union ,
)
2024-03-14 11:57:23 -04:00
from typing_extensions import TypeAlias
2023-04-18 11:27:03 -04:00
from bs4 . _typing import (
2024-01-31 16:15:22 -05:00
_AttributeValue ,
_AttributeValues ,
2023-04-18 11:27:03 -04:00
_Encoding ,
_Encodings ,
2024-01-31 22:22:14 -05:00
_NamespaceURL ,
2023-04-18 11:27:03 -04:00
_RawMarkup ,
)
2023-04-18 10:40:32 -04:00
2012-02-24 10:37:47 -05:00
import warnings
2011-02-27 16:51:56 -05:00
from bs4 . builder import (
2021-10-24 21:15:31 -04:00
DetectsXMLParsedAsHTML ,
2011-02-27 16:51:56 -05:00
PERMISSIVE ,
HTML ,
HTML_5 ,
HTMLTreeBuilder ,
2025-01-02 15:21:32 -05:00
)
2015-06-24 17:03:40 -04:00
from bs4 . element import (
NamespacedAttribute ,
2024-01-31 16:15:22 -05:00
PageElement ,
2018-12-30 21:13:03 -05:00
nonwhitespace_re ,
2015-06-24 17:03:40 -04:00
)
2011-02-27 16:51:56 -05:00
import html5lib
2015-12-08 15:47:30 +00:00
from html5lib . constants import (
namespaces ,
2025-01-02 15:21:32 -05:00
)
2011-02-27 16:51:56 -05:00
from bs4 . element import (
Comment ,
Doctype ,
NavigableString ,
Tag ,
2024-01-31 16:15:22 -05:00
)
2025-01-02 15:21:32 -05:00
2024-01-31 16:15:22 -05:00
if TYPE_CHECKING :
from bs4 import BeautifulSoup
2011-02-27 16:51:56 -05:00
2023-04-06 21:17:18 -04:00
from html5lib . treebuilders import base as treebuilder_base
2016-07-16 22:28:40 -04:00
2023-04-18 10:40:32 -04:00
2011-02-27 16:51:56 -05:00
class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use `html5lib <https://github.com/html5lib/html5lib-python>`_ to
    build a tree.

    Note that `HTML5TreeBuilder` does not support some common HTML
    `TreeBuilder` features. Some of these features could theoretically
    be implemented, but at the very least it's quite difficult,
    because html5lib moves the parse tree around as it's being built.

    Specifically:

    * This `TreeBuilder` doesn't use different subclasses of
      `NavigableString` (e.g. `Script`) based on the name of the tag
      in which the string was found.
    * You can't use a `SoupStrainer` to parse only part of a document.
    """

    NAME: str = "html5lib"

    features: Iterable[str] = [NAME, PERMISSIVE, HTML_5, HTML]

    #: html5lib can tell us which line number and position in the
    #: original file is the source of an element.
    TRACKS_LINE_NUMBERS: bool = True

    underlying_builder: "TreeBuilderForHtml5lib"  #: :meta private:
    user_specified_encoding: Optional[_Encoding]

    def prepare_markup(
        self,
        markup: _RawMarkup,
        user_specified_encoding: Optional[_Encoding] = None,
        document_declared_encoding: Optional[_Encoding] = None,
        exclude_encodings: Optional[_Encodings] = None,
    ) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
        """Record the requested encoding and yield the markup unchanged.

        :param markup: The markup that is about to be parsed.
        :param user_specified_encoding: Remembered on the builder so that
            `feed` can hand it to html5lib for non-`str` markup.
        :param document_declared_encoding: Not supported by this builder;
            a truthy value only triggers a warning.
        :param exclude_encodings: Not supported by this builder; a truthy
            value only triggers a warning.
        :return: A generator yielding exactly one tuple,
            ``(markup, None, None, False)``.
        """
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding

        # document_declared_encoding and exclude_encodings aren't used
        # ATM because the html5lib TreeBuilder doesn't use
        # UnicodeDammit.
        for variable, name in (
            (document_declared_encoding, "document_declared_encoding"),
            (exclude_encodings, "exclude_encodings"),
        ):
            if variable:
                warnings.warn(
                    f"You provided a value for {name}, but the html5lib tree builder doesn't support {name}.",
                    stacklevel=3,
                )

        # html5lib only parses HTML, so if it's given XML that's worth
        # noting.
        DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)

        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup: _RawMarkup) -> None:
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in `HTML5TreeBuilder.soup`.

        :param markup: The markup to parse. When it is not a `str`, the
            encoding stored by `prepare_markup` is passed to html5lib as
            ``override_encoding``.
        """
        if self.soup is not None and self.soup.parse_only is not None:
            warnings.warn(
                "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
                stacklevel=4,
            )

        # self.underlying_builder is probably None now, but it'll be set
        # when html5lib calls self.create_treebuilder().
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        assert self.underlying_builder is not None
        self.underlying_builder.parser = parser

        extra_kwargs: Dict[str, Any] = {}
        if not isinstance(markup, str):
            # kwargs, specifically override_encoding, will eventually
            # be passed in to html5lib's
            # HTMLBinaryInputStream.__init__.
            extra_kwargs["override_encoding"] = self.user_specified_encoding

        try:
            doc = parser.parse(markup, **extra_kwargs)  # type:ignore

            # Set the character encoding detected by the tokenizer.
            if isinstance(markup, str):
                # We need to special-case this because html5lib sets
                # charEncoding to UTF-8 if it gets Unicode input.
                doc.original_encoding = None
            else:
                original_encoding = parser.tokenizer.stream.charEncoding[0]  # type:ignore
                # The encoding is an html5lib Encoding object. We want to
                # use a string for compatibility with other tree builders.
                original_encoding = original_encoding.name
                doc.original_encoding = original_encoding
        finally:
            # Always drop the reference to the parser, even when parsing
            # raised, so the builder never keeps pointing at a parser
            # whose run is over.
            self.underlying_builder.parser = None

    def create_treebuilder(
        self, namespaceHTMLElements: bool
    ) -> "TreeBuilderForHtml5lib":
        """Called by html5lib to instantiate the kind of class it
        calls a 'TreeBuilder'.

        The new object is also stored in `underlying_builder`.

        :param namespaceHTMLElements: Whether or not to namespace HTML elements.

        :meta private:
        """
        self.underlying_builder = TreeBuilderForHtml5lib(
            namespaceHTMLElements, self.soup, store_line_numbers=self.store_line_numbers
        )
        return self.underlying_builder

    def test_fragment_to_document(self, fragment: str) -> str:
        """See `TreeBuilder`."""
        return "<html><head></head><body>%s</body></html>" % fragment
2011-02-27 16:51:56 -05:00
2016-07-16 22:28:40 -04:00
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    """The object html5lib drives while parsing; it forwards html5lib's
    tree-construction calls to a `BeautifulSoup` object.
    """

    soup: "BeautifulSoup"  #: :meta private:
    parser: Optional[html5lib.HTMLParser]  #: :meta private:

    def __init__(
        self,
        namespaceHTMLElements: bool,
        soup: Optional["BeautifulSoup"] = None,
        store_line_numbers: bool = True,
        **kwargs: Any,
    ):
        """Set up the builder around a `BeautifulSoup` object.

        :param namespaceHTMLElements: Passed through to the html5lib
            base class constructor.
        :param soup: The object to populate. Omitting it is deprecated;
            a warning is issued and an empty object is created instead.
        :param store_line_numbers: Whether `elementClass` should record
            source positions on the tags it creates.
        :param kwargs: Only used when `soup` is omitted, in which case
            they go to the `BeautifulSoup` constructor.
        """
        if not soup:
            warnings.warn(
                "The optionality of the 'soup' argument to the TreeBuilderForHtml5lib constructor is deprecated as of Beautiful Soup 4.13.0: 'soup' is now required. If you can't pass in a BeautifulSoup object here, or you get this warning and it seems mysterious to you, please contact the Beautiful Soup developer team for possible un-deprecation.",
                DeprecationWarning,
                stacklevel=2,
            )
            from bs4 import BeautifulSoup

            # TODO: Why is the parser 'html.parser' here? Using
            # html5lib doesn't cause an infinite loop and is more
            # accurate. Best to get rid of this entire section, I think.
            soup = BeautifulSoup(
                "", "html.parser", store_line_numbers=store_line_numbers, **kwargs
            )
        self.soup = soup

        # TODO: What are **kwargs exactly? Should they be passed in
        # here in addition to/instead of being passed to the BeautifulSoup
        # constructor?
        super().__init__(namespaceHTMLElements)

        # Replaced later by a real html5lib HTMLParser object, which
        # is what lets us find out the current line number.
        self.parser = None
        self.store_line_numbers = store_line_numbers

    def documentClass(self) -> "Element":
        """Reset the soup and wrap it in an `Element` acting as the document."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token: Dict[str, Any]) -> None:
        """Build a `Doctype` from an html5lib doctype token and register
        it with the soup as a parsed object.

        :param token: A mapping read for its "name", "publicId" and
            "systemId" entries.
        """
        doctype = Doctype.for_name_and_ids(
            cast(str, token["name"]),
            cast(Optional[str], token["publicId"]),
            cast(Optional[str], token["systemId"]),
        )
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name: str, namespace: str) -> "Element":
        """Create a new tag through the soup and wrap it in an `Element`.

        :param name: Name of the tag to create.
        :param namespace: Namespace of the tag to create.
        """
        sourceline: Optional[int] = None
        sourcepos: Optional[int] = None
        if self.parser is not None and self.store_line_numbers:
            # The stream position is the point immediately after the
            # end of the tag. We don't know where the tag started, but
            # we do know where it ended -- one character before this.
            sourceline, sourcepos = self.parser.tokenizer.stream.position()  # type:ignore
            assert sourcepos is not None
            sourcepos -= 1
        new_tag = self.soup.new_tag(
            name, namespace, sourceline=sourceline, sourcepos=sourcepos
        )
        return Element(new_tag, self.soup, namespace)

    def commentClass(self, data: str) -> "TextNode":
        """Wrap a new `Comment` holding `data` in a `TextNode`."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self) -> "Element":
        """Not implemented.

        Only html5lib's HTMLParser.parseFragment() needs this, and
        Beautiful Soup never calls that; only the html5lib unit tests
        do. Since we don't currently hook into those tests, there is
        nothing here.
        """
        raise NotImplementedError()

    def getFragment(self) -> "Element":
        """Not implemented.

        Only the html5lib unit tests use this, and we don't currently
        hook into those tests.
        """
        raise NotImplementedError()

    def appendChild(self, node: "Element") -> None:
        """Append the object wrapped by `node` to the soup itself."""
        # TODO: This code is not covered by the BS4 tests, and
        # apparently not triggered by the html5lib test suite either.
        # But it doesn't seem test-specific and there are calls to it
        # (or a method with the same name) all over html5lib, so I'm
        # leaving the implementation in place rather than replacing it
        # with NotImplementedError()
        self.soup.append(node.element)

    def getDocument(self) -> "BeautifulSoup":
        """Return the `BeautifulSoup` object being populated."""
        return self.soup

    def testSerializer(self, element: "Element") -> None:
        """Not implemented.

        Only the html5lib unit tests use this, and we don't currently
        hook into those tests.
        """
        raise NotImplementedError()
2015-12-08 15:47:30 +00:00
2025-01-02 15:21:32 -05:00
2011-02-27 16:51:56 -05:00
class AttrList:
    """Represents a Tag's attributes in a way compatible with html5lib.

    `attrs` is a copy of the element's attribute dictionary taken at
    construction time. Writes made through `__setitem__` go to the
    element and are mirrored into that copy, so reads on the same
    `AttrList` see them.
    """

    element: Tag
    attrs: _AttributeValues

    def __init__(self, element: Tag):
        """:param element: The object whose attributes are exposed."""
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self) -> Iterable[Tuple[str, _AttributeValue]]:
        """Iterate over a snapshot of the (name, value) pairs."""
        return iter(list(self.attrs.items()))

    def __setitem__(self, name: str, value: _AttributeValue) -> None:
        """Set an attribute on the underlying element.

        :param name: Attribute name.
        :param value: Attribute value. A string value for an attribute
            listed in the element's `cdata_list_attributes` is split on
            whitespace into a list first.
        """
        # If this attribute is a multi-valued attribute for this element,
        # turn its value into a list.
        list_attr = self.element.cdata_list_attributes or {}
        if name in list_attr.get("*", []) or (
            self.element.name in list_attr
            and name in list_attr.get(self.element.name, [])
        ):
            # A node that is being cloned may have already undergone
            # this procedure. Check for this and skip it.
            if not isinstance(value, list):
                assert isinstance(value, str)
                value = self.element.attribute_value_list_class(
                    nonwhitespace_re.findall(value)
                )
        self.element[name] = value
        # Keep the local copy in step with the element; otherwise
        # __getitem__, __contains__ and __len__ on this same object
        # would not see the value that was just written.
        self.attrs[name] = value

    def items(self) -> Iterable[Tuple[str, _AttributeValue]]:
        """Return the (name, value) pairs as a list."""
        return list(self.attrs.items())

    def keys(self) -> Iterable[str]:
        """Return the attribute names as a list."""
        return list(self.attrs.keys())

    def __len__(self) -> int:
        """Return the number of attributes."""
        return len(self.attrs)

    def __getitem__(self, name: str) -> _AttributeValue:
        """Return the value stored for `name`; raises KeyError if absent."""
        return self.attrs[name]

    def __contains__(self, name: str) -> bool:
        """Return whether an attribute called `name` is present."""
        # Ask the dictionary directly rather than building a list of keys.
        return name in self.attrs
2011-02-27 16:51:56 -05:00
2025-01-02 15:21:32 -05:00
2024-02-01 13:15:55 -05:00
class BeautifulSoupNode(treebuilder_base.Node):
    """Common base for the html5lib node wrappers defined in this module.

    Each wrapper carries the wrapped Beautiful Soup object (`element`),
    the `BeautifulSoup` object it belongs to (`soup`) and an optional
    namespace.
    """

    soup: "BeautifulSoup"
    element: PageElement
    namespace: Optional[_NamespaceURL]

    @property
    def nodeType(self) -> int:
        """Return the html5lib constant corresponding to the type of
        the underlying DOM object.

        NOTE: This property is only accessed by the html5lib test
        suite, not by Beautiful Soup proper, so it is not implemented.
        """
        raise NotImplementedError

    # TODO-TYPING: typeshed stubs are incorrect about this;
    # cloneNode returns a new Node, not None.
    def cloneNode(self) -> treebuilder_base.Node:  # type:ignore
        """Not implemented on the base class."""
        raise NotImplementedError
2024-02-01 11:30:08 -05:00
2025-01-02 15:21:32 -05:00
2024-02-01 13:15:55 -05:00
class Element(BeautifulSoupNode):
    """html5lib node wrapper around a Beautiful Soup `Tag`.

    html5lib manipulates the tree through the methods defined here;
    each one applies the corresponding change to the wrapped `Tag`
    and its `BeautifulSoup` object.
    """

    element: Tag
    namespace: Optional[_NamespaceURL]

    def __init__(
        self, element: Tag, soup: "BeautifulSoup", namespace: Optional[_NamespaceURL]
    ):
        """
        :param element: The tag to wrap; its name becomes the node name.
        :param soup: The `BeautifulSoup` object being populated.
        :param namespace: The namespace to report for this node, if any.
        """
        treebuilder_base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node: "BeautifulSoupNode") -> None:
        """Add the object wrapped by `node` as the last child of this tag.

        A plain `NavigableString` appended directly after another plain
        `NavigableString` is merged with it instead of being added as a
        separate child.
        """
        child: PageElement = node.element
        string_child: Optional[NavigableString] = None
        if type(child) is NavigableString:
            string_child = child
        node.parent = self

        if (
            child is not None
            and child.parent is not None
            and not isinstance(child, str)
        ):
            # A non-string child that already lives somewhere in the
            # tree must be detached before it is placed here.
            node.element.extract()

        if (
            string_child is not None
            and self.element.contents
            and type(self.element.contents[-1]) is NavigableString
        ):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            elif self.element.next_element is not None:
                # Something from further ahead in the parse tree is
                # being inserted into this earlier element. This is
                # very annoying because it means an expensive search
                # for the last element in the tree.
                most_recent_element = self.soup._last_descendant()
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.element, most_recent_element=most_recent_element
            )

    def getAttributes(self) -> AttrList:
        """Return this tag's attributes wrapped in an `AttrList`.

        An empty dictionary is returned instead when the wrapped object
        is a `Comment`.
        """
        if isinstance(self.element, Comment):
            return {}
        return AttrList(self.element)

    # An HTML5lib attribute name may either be a single string,
    # or a tuple (namespace, name).
    _Html5libAttributeName: TypeAlias = Union[str, Tuple[str, str]]
    # Now we can define the type this method accepts as a dictionary
    # mapping those attribute names to single string values.
    _Html5libAttributes: TypeAlias = Dict[_Html5libAttributeName, str]

    def setAttributes(self, attributes: Optional[_Html5libAttributes]) -> None:
        """Copy html5lib-style attributes onto the wrapped tag.

        Does nothing when `attributes` is None or empty.

        :param attributes: Mapping of attribute names to string values.
            NOTE: this dictionary is modified in place (tuple names are
            replaced by `NamespacedAttribute` keys).
        """
        if attributes is None or len(attributes) == 0:
            return

        # Replace any namespaced attributes with
        # NamespacedAttribute objects.
        for name, value in list(attributes.items()):
            if isinstance(name, tuple):
                new_name = NamespacedAttribute(*name)
                del attributes[name]
                attributes[new_name] = value

        # We can now cast attributes to the type of Dict
        # used by Beautiful Soup.
        normalized_attributes = cast(_AttributeValues, attributes)

        # Values for tags like 'class' came in as single strings;
        # replace them with lists of strings as appropriate.
        self.soup.builder._replace_cdata_list_attribute_values(
            self.name, normalized_attributes
        )

        # Then set the attributes on the Tag associated with this
        # BeautifulSoupNode.
        for name, value_or_values in list(normalized_attributes.items()):
            self.element[name] = value_or_values

        # The attributes may contain variables that need substitution.
        # Call set_up_substitutions manually.
        #
        # The Tag constructor called this method when the Tag was created,
        # but we just set/changed the attributes, so call it again.
        self.soup.builder.set_up_substitutions(self.element)

    attributes = property(getAttributes, setAttributes)

    def insertText(
        self, data: str, insertBefore: Optional["BeautifulSoupNode"] = None
    ) -> None:
        """Insert `data` as a new string node.

        :param data: The text to insert.
        :param insertBefore: If given, the text goes in front of this
            child; otherwise it is appended.
        """
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(
        self, node: "BeautifulSoupNode", refNode: "BeautifulSoupNode"
    ) -> None:
        """Insert the object wrapped by `node` in front of `refNode`.

        A plain `NavigableString` inserted directly after another plain
        `NavigableString` is merged into that preceding string.

        :param node: Wrapper around the object to insert.
        :param refNode: Wrapper around an existing child of this tag.
        """
        index = self.element.index(refNode.element)
        if (
            # At index 0 there is no preceding sibling. Without this
            # check, contents[index - 1] would wrap around to the
            # *last* child and the text could be merged into it.
            index > 0
            and type(node.element) is NavigableString
            and type(self.element.contents[index - 1]) is NavigableString
        ):
            # (See comments in appendChild)
            old_node = self.element.contents[index - 1]
            assert type(old_node) is NavigableString
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node: "Element") -> None:
        """Detach the object wrapped by `node` from the tree."""
        node.element.extract()

    def reparentChildren(self, new_parent: "Element") -> None:
        """Move all of this tag's children into another tag.

        The children are appended after any children `new_parent`
        already has, and the sibling and element links are rewired by
        hand.
        """
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.

            # We can make this assertion since we know new_parent has
            # children.
            assert new_parents_last_descendant is not None
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = (
                new_parents_last_descendant.next_element
            )
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            if new_parents_last_descendant is not None:
                first_child.previous_element = new_parents_last_descendant
            else:
                first_child.previous_element = new_parent_element
            first_child.previous_sibling = new_parents_last_child
            if new_parents_last_descendant is not None:
                new_parents_last_descendant.next_element = first_child
            else:
                new_parent_element.next_element = first_child
            if new_parents_last_child is not None:
                new_parents_last_child.next_sibling = first_child

            # Find the very last element being moved. It is now the
            # parent's last descendant. It has no .next_sibling and
            # its .next_element is whatever the previous last
            # descendant had.
            last_childs_last_descendant = to_append[-1]._last_descendant(
                is_initialized=False, accept_self=True
            )

            # Since we passed accept_self=True into _last_descendant,
            # there's no possibility that the result is None.
            assert last_childs_last_descendant is not None
            last_childs_last_descendant.next_element = (
                new_parents_last_descendant_next_element
            )
            if new_parents_last_descendant_next_element is not None:
                # TODO-COVERAGE: This code has no test coverage and
                # I'm not sure how to get html5lib to go through this
                # path, but it's just the other side of the previous
                # line.
                new_parents_last_descendant_next_element.previous_element = (
                    last_childs_last_descendant
                )
            last_childs_last_descendant.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

    # TODO-TYPING: typeshed stubs are incorrect about this;
    # hasContent returns a boolean, not None.
    def hasContent(self) -> bool:  # type:ignore
        """Return whether the wrapped tag has any children."""
        return len(self.element.contents) > 0

    # TODO-TYPING: typeshed stubs are incorrect about this;
    # cloneNode returns a new Node, not None.
    def cloneNode(self) -> treebuilder_base.Node:  # type:ignore
        """Return a new `Element` with the same name, namespace and
        attributes as this one. Children are not copied.
        """
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def getNameTuple(self) -> Tuple[Optional[_NamespaceURL], str]:
        """Return ``(namespace, name)``, substituting html5lib's HTML
        namespace when this node has no namespace of its own.
        """
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)
2025-01-02 15:21:32 -05:00
2024-02-01 13:15:55 -05:00
class TextNode(BeautifulSoupNode):
    """html5lib node wrapper around a Beautiful Soup string object."""

    element: NavigableString

    def __init__(self, element: NavigableString, soup: "BeautifulSoup"):
        """
        :param element: The string object to wrap.
        :param soup: The `BeautifulSoup` object being populated.
        """
        # A text node has no name, so the html5lib base class is
        # initialized with None.
        super().__init__(None)
        self.soup = soup
        self.element = element