Pyjo.DOM - Minimalistic HTML/XML DOM parser with CSS selectors¶
import Pyjo.DOM
# Parse
dom = Pyjo.DOM.new('<div><p id="a">Test</p><p id="b">123</p></div>')
# Find
print(dom.at('#b').text)
print(dom.find('p').map('text').join("\n"))
print(dom.find('[id]').map('attr', 'id').join("\n"))
# Iterate
dom.find('p[id]').reverse().each(lambda i: print(i.attr('id')))
# Loop
for i in dom.find('p[id]').each():
    print(i.attr('id') + ':' + i.text)
# Modify
dom.find('div p').last().append('<p id="c">456</p>')
dom.find(':not(p)').map('strip')
# Render
print(dom)
Pyjo.DOM
is a minimalistic and relaxed HTML/XML DOM parser with CSS
selector support. It will even try to interpret broken HTML and XML, so you
should not use it for validation.
Classes¶
-
class
Pyjo.DOM.
Pyjo_DOM
(html=None)¶ Pyjo.DOM
inherits all attributes and methods from Pyjo.Base
and Pyjo.String.Mixin
and implements the following new ones.
-
all_raw_text
¶ untrimmed = dom.all_raw_text
Extract all text content from DOM structure, smart whitespace trimming is disabled.
# "foo\nbarbaz\n"
dom.parse("<div>foo\n<p>bar</p>baz\n</div>").at('div').all_raw_text
-
all_text
¶ trimmed = dom.all_text
Extract all text content from DOM structure, smart whitespace trimming is enabled.
# "foo bar baz" dom.parse("<div>foo\n<p>bar</p>baz\n</div>").at('div').all_text
-
ancestors
(pattern=None)¶ collection = dom.ancestors() collection = dom.ancestors('div > p')
Find all ancestors of this node matching the CSS selector and return a
Pyjo.Collection
object containing these elements as Pyjo.DOM
objects. All selectors from Pyjo.DOM.CSS
are supported.
# "div > p > i"
dom.parse('<div><p><i>bar</i></p></div>').at('i').child_nodes[0] \
    .ancestors() \
    .map('tag').reverse().join(" > ").say()
-
append
(string)¶ dom = dom.append(u'<p>I ♥ Pyjo!</p>')
Append HTML/XML fragment to this node.
# "<div><h1>Test</h1><h2>123</h2></div>" dom.parse('<div><h1>Test</h1></div>') \ .at('h1').append('<h2>123</h2>').root # "<p>Test 123</p>" dom.parse('<p>Test</p>').at('p').child_nodes.first().append(' 123').root
-
append_content
(string)¶ dom = dom.append_content(u'<p>I ♥ Pyjo!</p>')
Append HTML/XML fragment (for
root
and tag
nodes) or raw content to this node’s content.
# "<div><h1>Test123</h1></div>"
dom.parse('<div><h1>Test</h1></div>') \
    .at('h1').append_content('123').root
# "<!-- Test 123 --><br>"
dom.parse('<!-- Test --><br>') \
    .child_nodes.first().append_content('123 ').root
# "<p>Test<i>123</i></p>"
dom.parse('<p>Test</p>').at('p').append_content('<i>123</i>').root
-
at
(pattern)¶ result = dom.at('div > p')
Find first element in DOM structure matching the CSS selector and return it as a
Pyjo.DOM
object or return None
if none could be found. All selectors from Pyjo.DOM.CSS
are supported.
# Find first element with ``svg`` namespace definition
namespace = dom.at('[xmlns\:svg]').attr('xmlns:svg')
-
attr
(*args, **kwargs)¶ my_dict = dom.attr() foo = dom.attr('foo') dom = dom.attr('foo', 'bar') dom = dom.attr(foo='bar')
This element’s attributes. Returns
None
if attribute is missing. Setting value to None
deletes attribute.
# List id attributes
dom.parse('<div id="a">foo</div><p>bar</p><div id="b">baz</div>') \
    .find('*').map('attr', 'id').compact().join("\n").say()
-
child_nodes
¶ collection = dom.child_nodes
Return a
Pyjo.Collection
object containing the child nodes of this element as Pyjo.DOM
objects.
# "<p><b>123</b></p>"
dom.parse('<p>Test<b>123</b></p>').at('p').child_nodes.first().remove()
# "<!-- Test -->"
dom.parse('<!-- Test --><b>123</b>').child_nodes.first()
-
children
(pattern=None)¶ collection = dom.children() collection = dom.children('div > p')
Find all children of this element matching the CSS selector and return a
Pyjo.Collection
object containing these elements as Pyjo.DOM
objects. All selectors from Pyjo.DOM.CSS
are supported.
# Show type of random child element
print(dom.parse('<b>foo</b><i>bar</i><p>baz</p>') \
    .children().shuffle().first().type)
-
content
¶ string = dom.content dom.content = u'<p>I ♥ Pyjo!</p>'
Return this node’s content or replace it with HTML/XML fragment (for
root
and tag
nodes) or raw content.
# "<b>Test</b>"
dom.parse('<div><b>Test</b></div>').at('div').content
# "<div><h1>123</h1></div>"
dom.parse('<div><h1>Test</h1></div>').at('h1').set(content='123').root
# "<p><i>123</i></p>"
dom.parse('<p>Test</p>').at('p').set(content='<i>123</i>').root
# "<div><h1></h1></div>"
dom.parse('<div><h1>Test</h1></div>').at('h1').set(content='').root
# " Test "
dom.parse('<!-- Test --><br>').child_nodes.first().content
# "<div><!-- 123 -->456</div>"
dom.parse('<div><!-- Test -->456</div>') \
    .at('div').child_nodes.first().set(content=' 123 ').root
-
descendant_nodes
¶ collection = dom.descendant_nodes
Return a
Pyjo.Collection
object containing all descendant nodes of this element as Pyjo.DOM
objects.
# "<p><b>123</b></p>"
dom.parse('<p><!-- Test --><b>123<!-- 456 --></b></p>') \
    .descendant_nodes \
    .grep(lambda i: i.type == 'comment').map('remove').first()
-
find
(pattern)¶ collection = dom.find('div > p')
Find all elements in DOM structure matching the CSS selector and return a
Pyjo.Collection
object containing these elements as Pyjo.DOM
objects. All selectors from Pyjo.DOM.CSS
are supported.
# Find a specific element and extract information
div_id = dom.find('div')[2].attr('id')
# Extract information from multiple elements
headers = dom.find('h1, h2, h3').map('text').to_list()
# Find elements with a class that contains dots
divs = dom.find('div.foo\.bar').to_list()
-
following
(pattern=None)¶ collection = dom.following() collection = dom.following('div > p')
Find all sibling elements after this node matching the CSS selector and return a
Pyjo.Collection
object containing these elements as Pyjo.DOM
objects. All selectors from Pyjo.DOM.CSS
are supported.
# List types of sibling elements after this node
dom.parse('<b>foo</b><i>bar</i><p>baz</p>').at('b') \
    .following().map('tag').join("\n").say()
-
following_nodes
()¶ collection = dom.following_nodes()
Return a
Pyjo.Collection
object containing all sibling nodes after this node as Pyjo.DOM
objects.
# "C"
dom.parse('<p>A</p><!-- B -->C') \
    .at('p').following_nodes().last().content
-
matches
(pattern)¶ result = dom.matches('div > p')
Check if this element matches the CSS selector. All selectors from
Pyjo.DOM.CSS
are supported.
# True
bool(dom.parse('<p class="a">A</p>').at('p').matches('.a'))
bool(dom.parse('<p class="a">A</p>').at('p').matches('p[class]'))
# False
bool(dom.parse('<p class="a">A</p>').at('p').matches('.b'))
bool(dom.parse('<p class="a">A</p>').at('p').matches('p[id]'))
-
namespace
¶ namespace = dom.namespace
Find this element’s namespace or return
None
if none could be found.
# Find namespace for an element with namespace prefix
namespace = dom.at('svg > svg\:circle').namespace
# Find namespace for an element that may or may not have a namespace prefix
namespace = dom.at('svg > circle').namespace
-
next
¶ sibling = dom.next
Return
Pyjo.DOM
object for next sibling element or None
if there are no more siblings.
# "<h2>123</h2>"
dom.parse('<div><h1>Test</h1><h2>123</h2></div>').at('h1').next
-
next_node
¶ sibling = dom.next_node
Return
Pyjo.DOM
object for next sibling node or None
if there are no more siblings.
# "456"
dom.parse('<p><b>123</b><!-- Test -->456</p>') \
    .at('b').next_node.next_node
# " Test "
dom.parse('<p><b>123</b><!-- Test -->456</p>') \
    .at('b').next_node.content
-
parent
¶ parent = dom.parent
Return
Pyjo.DOM
object for parent of this node or None
if this node has no parent.
-
parse
(html)¶ dom = dom.parse(u'<foo bar="baz">I ♥ Pyjo!</foo>')
Parse HTML/XML fragment with
Pyjo.DOM.HTML.
# Parse XML
dom = Pyjo.DOM.new().set(xml=True).parse(xml)
-
preceding
(pattern=None)¶ collection = dom.preceding() collection = dom.preceding('div > p')
Find all sibling elements before this node matching the CSS selector and return a
Pyjo.Collection
object containing these elements as Pyjo.DOM
objects. All selectors from Pyjo.DOM.CSS
are supported.
# List types of sibling elements before this node
dom.preceding().map('tag').join("\n").say()
-
preceding_nodes
()¶ collection = dom.preceding_nodes()
Return a
Pyjo.Collection
object containing all sibling nodes before this node as Pyjo.DOM
objects.
# "A"
dom.parse('A<!-- B --><p>C</p>') \
    .at('p').preceding_nodes().first().content
-
prepend
(string)¶ dom = dom.prepend(u'<p>I ♥ Pyjo!</p>')
Prepend HTML/XML fragment to this node.
# "<div><h1>123</h1><h2>Test</h2></div>" dom.parse('<div><h2>Test</h2></div>') \ .at('h2').prepend('<h1>123</h1>').root # "<p>Test 123</p>" dom.parse('<p>123</p>').at('p').child_nodes.first().prepend('Test ').root
-
prepend_content
(string)¶ dom = dom.prepend_content(u'<p>I ♥ Pyjo!</p>')
Prepend HTML/XML fragment (for
root
and tag
nodes) or raw content to this node’s content.
# "<div><h2>Test 123</h2></div>"
dom.parse('<div><h2>123</h2></div>') \
    .at('h2').prepend_content('Test ').root
# "<!-- Test 123 --><br>"
dom.parse('<!-- 123 --><br>') \
    .child_nodes.first().prepend_content(' Test').root
# "<p><i>123</i>Test</p>"
dom.parse('<p>Test</p>').at('p').prepend_content('<i>123</i>').root
-
previous
¶ sibling = dom.previous
Return
Pyjo.DOM
object for previous sibling element or None
if there are no more siblings.
# "<h1>Test</h1>"
dom.parse('<div><h1>Test</h1><h2>123</h2></div>').at('h2').previous
-
previous_node
¶ sibling = dom.previous_node
Return
Pyjo.DOM
object for previous sibling node or None
if there are no more siblings.
# "123"
dom.parse('<p>123<!-- Test --><b>456</b></p>') \
    .at('b').previous_node.previous_node
# " Test "
dom.parse('<p>123<!-- Test --><b>456</b></p>') \
    .at('b').previous_node.content
-
raw_text
¶ untrimmed = dom.raw_text
Extract text content from this element only (not including child elements), smart whitespace trimming is disabled.
# "foo\nbaz\n" dom.parse("<div>foo\n<p>bar</p>baz\n</div>").at('div').raw_text
-
remove
()¶ parent = dom.remove()
Remove this node and return
parent().
# "<div></div>"
dom.parse('<div><h1>Test</h1></div>').at('h1').remove()
# "<p><b>456</b></p>"
dom.parse('<p>123<b>456</b></p>').at('p').child_nodes.first().remove().root
-
replace
(new)¶ parent = dom.replace(u'<div>I ♥ Pyjo!</div>')
Replace this node with HTML/XML fragment and return
parent().
# "<div><h2>123</h2></div>"
dom.parse('<div><h1>Test</h1></div>').at('h1').replace('<h2>123</h2>')
# "<p><b>123</b></p>"
dom.parse('<p>Test</p>') \
    .at('p').child_nodes.item(0).replace('<b>123</b>').root
-
strip
()¶ parent = dom.strip()
Remove this element while preserving its content and return
parent().
# "<div>Test</div>"
dom.parse('<div><h1>Test</h1></div>').at('h1').strip()
-
tag
¶ tag = dom.tag dom.tag = 'div'
This element’s tag name.
# List tag names of child elements dom.children().map('tag').join("\n").say()
-
text
¶ trimmed = dom.text
Extract text content from this element only (not including child elements), smart whitespace trimming is enabled.
# "foo baz" dom.parse("<div>foo\n<p>bar</p>baz\n</div>").at('div').text
-
type
¶ nodetype = dom.type
This node’s type, usually
cdata, comment, doctype, pi, raw, root, tag or text.
-
wrap
(string)¶ dom = dom.wrap('<div></div>')
Wrap HTML/XML fragment around this node, placing it as the last child of the first innermost element.
# "<p>123<b>Test</b></p>" dom.parse('<b>Test</b>').at('b').wrap('<p>123</p>').root # "<div><p><b>Test</b></p>123</div>" dom.parse('<b>Test</b>').at('b').wrap('<div><p></p>123</div>').root # "<p><b>Test</b></p><p>123</p>" dom.parse('<b>Test</b>').at('b').wrap('<p></p><p>123</p>').root # "<p><b>Test</b></p>" dom.parse('<p>Test</p>').at('p').child_nodes.first().wrap('<b>').root
-
wrap_content
(string)¶ dom = dom.wrap_content('<div></div>')
Wrap HTML/XML fragment around this node’s content, placing that content as the last children of the first innermost element.
# "<p><b>123Test</b></p>" dom.parse('<p>Test</p>').at('p').wrap_content('<b>123</b>').root # "<p><b>Test</b></p><p>123</p>" dom.parse('<b>Test</b>').wrap_content('<p></p><p>123</p>')
-