Pyjo.DOM - Minimalistic HTML/XML DOM parser with CSS selectors

import Pyjo.DOM

# Parse
dom = Pyjo.DOM.new('<div><p id="a">Test</p><p id="b">123</p></div>')

# Find
print(dom.at('#b').text)
print(dom.find('p').map('text').join("\n"))
print(dom.find('[id]').map('attr', 'id').join("\n"))

# Iterate
dom.find('p[id]').reverse().each(lambda i: print(i.attr('id')))

# Loop
for i in dom.find('p[id]').each():
    print(i.attr('id') + ':' + i.text)

# Modify
dom.find('div p').last().append('<p id="c">456</p>')
dom.find(':not(p)').map('strip')

# Render
print(dom)

Pyjo.DOM is a minimalistic and relaxed HTML/XML DOM parser with CSS selector support. It will even try to interpret broken HTML and XML, so you should not use it for validation.

Classes

class Pyjo.DOM.Pyjo_DOM(html=None)

Pyjo.DOM inherits all attributes and methods from Pyjo.Base and Pyjo.String.Mixin and implements the following new ones.

__init__(html=None)
dom = Pyjo.DOM.new()

Construct a new Pyjo.DOM object.

all_raw_text
untrimmed = dom.all_raw_text

Extract all text content from DOM structure with smart whitespace trimming disabled.

# "foo\nbar baz\n"
dom.parse("<div>foo\n<p>bar</p>baz\n</div>").at('div').all_raw_text
all_text
trimmed   = dom.all_text

Extract all text content from DOM structure with smart whitespace trimming enabled.

# "foo bar baz"
dom.parse("<div>foo\n<p>bar</p>baz\n</div>").at('div').all_text
ancestors(pattern=None)
collection = dom.ancestors()
collection = dom.ancestors('div > p')

Find all ancestors of this node matching the CSS selector and return a Pyjo.Collection object containing these elements as Pyjo.DOM objects. All selectors from Pyjo.DOM.CSS are supported.

# "div > p > i"
dom.parse('<div><p><i>bar</i></p></div>').at('i').child_nodes[0] \
   .ancestors() \
   .map('tag').reverse().join(" > ").say()
append(string)
dom = dom.append(u'<p>I ♥ Pyjo!</p>')

Append HTML/XML fragment to this node.

# "<div><h1>Test</h1><h2>123</h2></div>"
dom.parse('<div><h1>Test</h1></div>') \
   .at('h1').append('<h2>123</h2>').root

# "<p>Test 123</p>"
dom.parse('<p>Test</p>').at('p').child_nodes.first().append(' 123').root
append_content(string)
dom = dom.append_content(u'<p>I ♥ Pyjo!</p>')

Append HTML/XML fragment (for root and tag nodes) or raw content to this node’s content.

# "<div><h1>Test123</h1></div>"
dom.parse('<div><h1>Test</h1></div>') \
   .at('h1').append_content('123').root

# "<!-- Test 123 --><br>"
dom.parse('<!-- Test --><br>') \
   .child_nodes.first().append_content('123 ').root

# "<p>Test<i>123</i></p>"
dom.parse('<p>Test</p>').at('p').append_content('<i>123</i>').root
at(pattern)
result = dom.at('div > p')

Find first element in DOM structure matching the CSS selector and return it as a Pyjo.DOM object or return None if none could be found. All selectors from Pyjo.DOM.CSS are supported.

# Find first element with ``svg`` namespace definition
namespace = dom.at(r'[xmlns\:svg]').attr('xmlns:svg')
attr(*args, **kwargs)
my_dict = dom.attr()
foo = dom.attr('foo')
dom = dom.attr('foo', 'bar')
dom = dom.attr(foo='bar')

This element’s attributes. Returns None if attribute is missing. Setting value to None deletes attribute.

# List id attributes
dom.parse('<div id="a">foo</div><p>bar</p><div id="b">baz</div>') \
   .find('*').map('attr', 'id').compact().join("\n").say()
child_nodes
collection = dom.child_nodes

Return a Pyjo.Collection object containing the child nodes of this element as Pyjo.DOM objects.

# "<p><b>123</b></p>"
dom.parse('<p>Test<b>123</b></p>').at('p').child_nodes.first().remove()

# "<!-- Test -->"
dom.parse('<!-- Test --><b>123</b>').child_nodes.first()
children(pattern=None)
collection = dom.children()
collection = dom.children('div > p')

Find all children of this element matching the CSS selector and return a Pyjo.Collection object containing these elements as Pyjo.DOM objects. All selectors from Pyjo.DOM.CSS are supported.

# Show tag name of random child element
print(dom.parse('<b>foo</b><i>bar</i><p>baz</p>') \
    .children().shuffle().first().tag)
content
string = dom.content
dom.content = u'<p>I ♥ Pyjo!</p>'

Return this node’s content or replace it with HTML/XML fragment (for root and tag nodes) or raw content.

# "<b>Test</b>"
dom.parse('<div><b>Test</b></div>').at('div').content

# "<div><h1>123</h1></div>"
dom.parse('<div><h1>Test</h1></div>').at('h1').set(content='123').root

# "<p><i>123</i></p>"
dom.parse('<p>Test</p>').at('p').set(content='<i>123</i>').root

# "<div><h1></h1></div>"
dom.parse('<div><h1>Test</h1></div>').at('h1').set(content='').root

# " Test "
dom.parse('<!-- Test --><br>').child_nodes.first().content

# "<div><!-- 123 -->456</div>"
dom.parse('<div><!-- Test -->456</div>') \
   .at('div').child_nodes.first().set(content=' 123 ').root
descendant_nodes
collection = dom.descendant_nodes

Return a Pyjo.Collection object containing all descendant nodes of this element as Pyjo.DOM objects.

# "<p><b>123</b></p>"
dom.parse('<p><!-- Test --><b>123<!-- 456 --></b></p>') \
   .descendant_nodes \
   .grep(lambda i: i.type == 'comment').map('remove').first()
find(pattern)
collection = dom.find('div > p')

Find all elements in DOM structure matching the CSS selector and return a Pyjo.Collection object containing these elements as Pyjo.DOM objects. All selectors from Pyjo.DOM.CSS are supported.

# Find a specific element and extract information
div_id = dom.find('div')[2].attr('id')

# Extract information from multiple elements
headers = dom.find('h1, h2, h3').map('text').to_list()

# Find elements with a class that contains dots
divs = dom.find(r'div.foo\.bar').to_list()
following(pattern=None)
collection = dom.following()
collection = dom.following('div > p')

Find all sibling elements after this node matching the CSS selector and return a Pyjo.Collection object containing these elements as Pyjo.DOM objects. All selectors from Pyjo.DOM.CSS are supported.

# List tag names of sibling elements after this node
dom.parse('<b>foo</b><i>bar</i><p>baz</p>').at('b') \
   .following().map('tag').join("\n").say()
following_nodes()
collection = dom.following_nodes()

Return a Pyjo.Collection object containing all sibling nodes after this node as Pyjo.DOM objects.

# "C"
dom.parse('<p>A</p><!-- B -->C') \
   .at('p').following_nodes().last().content
matches(pattern)
result = dom.matches('div > p')

Check if this element matches the CSS selector. All selectors from Pyjo.DOM.CSS are supported.

# True
bool(dom.parse('<p class="a">A</p>').at('p').matches('.a'))
bool(dom.parse('<p class="a">A</p>').at('p').matches('p[class]'))

# False
bool(dom.parse('<p class="a">A</p>').at('p').matches('.b'))
bool(dom.parse('<p class="a">A</p>').at('p').matches('p[id]'))
namespace
namespace = dom.namespace

Find this element’s namespace or return None if none could be found.

# Find namespace for an element with namespace prefix
namespace = dom.at(r'svg > svg\:circle').namespace

# Find namespace for an element that may or may not have a namespace prefix
namespace = dom.at('svg > circle').namespace
next
sibling = dom.next

Return Pyjo.DOM object for next sibling element or None if there are no more siblings.

# "<h2>123</h2>"
dom.parse('<div><h1>Test</h1><h2>123</h2></div>').at('h1').next
next_node
sibling = dom.next_node

Return Pyjo.DOM object for next sibling node or None if there are no more siblings.

# "456"
dom.parse('<p><b>123</b><!-- Test -->456</p>') \
   .at('b').next_node.next_node

# " Test "
dom.parse('<p><b>123</b><!-- Test -->456</p>') \
   .at('b').next_node.content
parent
parent = dom.parent

Return Pyjo.DOM object for parent of this node or None if this node has no parent.

parse(html)
dom = dom.parse(u'<foo bar="baz">I ♥ Pyjo!</foo>')

Parse HTML/XML fragment with Pyjo.DOM.HTML.

# Parse XML
dom = Pyjo.DOM.new().set(xml=True).parse(xml)
preceding(pattern=None)
collection = dom.preceding()
collection = dom.preceding('div > p')

Find all sibling elements before this node matching the CSS selector and return a Pyjo.Collection object containing these elements as Pyjo.DOM objects. All selectors from Pyjo.DOM.CSS are supported.

# List tag names of sibling elements before this node
dom.preceding().map('tag').join("\n").say()
preceding_nodes()
collection = dom.preceding_nodes()

Return a Pyjo.Collection object containing all sibling nodes before this node as Pyjo.DOM objects.

# "A"
dom.parse('A<!-- B --><p>C</p>') \
   .at('p').preceding_nodes().first().content
prepend(string)
dom = dom.prepend(u'<p>I ♥ Pyjo!</p>')

Prepend HTML/XML fragment to this node.

# "<div><h1>123</h1><h2>Test</h2></div>"
dom.parse('<div><h2>Test</h2></div>') \
   .at('h2').prepend('<h1>123</h1>').root

# "<p>Test 123</p>"
dom.parse('<p>123</p>').at('p').child_nodes.first().prepend('Test ').root
prepend_content(string)
dom = dom.prepend_content(u'<p>I ♥ Pyjo!</p>')

Prepend HTML/XML fragment (for root and tag nodes) or raw content to this node’s content.

# "<div><h2>Test 123</h2></div>"
dom.parse('<div><h2>123</h2></div>') \
   .at('h2').prepend_content('Test ').root

# "<!-- Test 123 --><br>"
dom.parse('<!-- 123 --><br>') \
   .child_nodes.first().prepend_content(' Test').root

# "<p><i>123</i>Test</p>"
dom.parse('<p>Test</p>').at('p').prepend_content('<i>123</i>').root
previous
sibling = dom.previous

Return Pyjo.DOM object for previous sibling element or None if there are no more siblings.

# "<h1>Test</h1>"
dom.parse('<div><h1>Test</h1><h2>123</h2></div>').at('h2').previous
previous_node
sibling = dom.previous_node

Return Pyjo.DOM object for previous sibling node or None if there are no more siblings.

# "123"
dom.parse('<p>123<!-- Test --><b>456</b></p>') \
   .at('b').previous_node.previous_node

# " Test "
dom.parse('<p>123<!-- Test --><b>456</b></p>') \
   .at('b').previous_node.content
raw_text
untrimmed = dom.raw_text

Extract text content from this element only (not including child elements) with smart whitespace trimming disabled.

# "foo\nbaz\n"
dom.parse("<div>foo\n<p>bar</p>baz\n</div>").at('div').raw_text
remove()
parent = dom.remove()

Remove this node and return its parent.

# "<div></div>"
dom.parse('<div><h1>Test</h1></div>').at('h1').remove()

# "<p><b>456</b></p>"
dom.parse('<p>123<b>456</b></p>').at('p').child_nodes.first().remove().root
replace(new)
parent = dom.replace(u'<div>I ♥ Pyjo!</div>')

Replace this node with HTML/XML fragment and return its parent.

# "<div><h2>123</h2></div>"
dom.parse('<div><h1>Test</h1></div>').at('h1').replace('<h2>123</h2>')

# "<p><b>123</b></p>"
dom.parse('<p>Test</p>') \
   .at('p').child_nodes.item(0).replace('<b>123</b>').root
root
root = dom.root

Return Pyjo.DOM object for root node.

strip()
parent = dom.strip()

Remove this element while preserving its content and return its parent.

# "<div>Test</div>"
dom.parse('<div><h1>Test</h1></div>').at('h1').strip()
tag
tag = dom.tag
dom.tag = 'div'

This element’s tag name.

# List tag names of child elements
dom.children().map('tag').join("\n").say()
text
trimmed = dom.text

Extract text content from this element only (not including child elements) with smart whitespace trimming enabled.

# "foo baz"
dom.parse("<div>foo\n<p>bar</p>baz\n</div>").at('div').text
type
nodetype = dom.type

This node’s type, usually cdata, comment, doctype, pi, raw, root, tag or text.

wrap(string)
dom = dom.wrap('<div></div>')

Wrap HTML/XML fragment around this node, placing it as the last child of the first innermost element.

# "<p>123<b>Test</b></p>"
dom.parse('<b>Test</b>').at('b').wrap('<p>123</p>').root

# "<div><p><b>Test</b></p>123</div>"
dom.parse('<b>Test</b>').at('b').wrap('<div><p></p>123</div>').root

# "<p><b>Test</b></p><p>123</p>"
dom.parse('<b>Test</b>').at('b').wrap('<p></p><p>123</p>').root

# "<p><b>Test</b></p>"
dom.parse('<p>Test</p>').at('p').child_nodes.first().wrap('<b>').root
wrap_content(string)
dom = dom.wrap_content('<div></div>')

Wrap HTML/XML fragment around this node’s content, placing that content as the last children of the first innermost element.

# "<p><b>123Test</b></p>"
dom.parse('<p>Test</p>').at('p').wrap_content('<b>123</b>').root

# "<p><b>Test</b></p><p>123</p>"
dom.parse('<b>Test</b>').wrap_content('<p></p><p>123</p>')
Pyjo.DOM.object

alias of Pyjo_DOM