feature(http): init parser

This commit is contained in:
Tiara Rodney 2025-12-31 14:34:28 +01:00
parent cc4b567181
commit db72017810
No known key found for this signature in database
GPG key ID: 5CD8EC1D46106723
3 changed files with 398 additions and 0 deletions

View file

@ -0,0 +1,296 @@
from __future__ import annotations

from html import escape
from html.parser import HTMLParser
from typing import Dict, Iterable, List, Optional, Generator, Union
class Node:
    """
    Represents a node in a simple DOM-like tree.

    An element node has a ``tag`` and may carry attributes and children; a
    text node has ``tag=None`` and keeps its content in ``text``.

    :param tag: The HTML tag name (e.g., ``"div"``). ``None`` for text nodes.
    :param attrs: Iterable of ``(key, value)`` attribute pairs. A value may be
        ``None`` for an attribute written without a value
        (``<input disabled>``).
    :param parent: Parent :class:`Node` instance. The new node is *not*
        appended to the parent's ``children``; the caller does that.
    :param text: Text content for text nodes.
    """

    # HTML elements that never have content or a closing tag.
    _VOID_TAGS = frozenset(
        {
            "area", "base", "br", "col", "embed", "hr", "img",
            "input", "link", "meta", "source", "track", "wbr",
        }
    )
    # Elements whose text is raw and must not be entity-escaped on output.
    _RAW_TEXT_TAGS = frozenset({"script", "style"})

    def __init__(
        self,
        tag: Optional[str] = None,
        attrs: Optional[Iterable[tuple[str, Optional[str]]]] = None,
        parent: Optional["Node"] = None,
        text: str = "",
    ) -> None:
        self.tag: Optional[str] = tag
        self.attrs: Dict[str, Optional[str]] = dict(attrs or [])
        self.parent: Optional["Node"] = parent
        self.children: List["Node"] = []
        self.text: str = text

    def __repr__(self) -> str:
        return f"<Node {self.tag} {self.attrs} children={len(self.children)}>"

    # ----------------------------------------------------------------------
    # Tree traversal
    # ----------------------------------------------------------------------
    def iter(self) -> Generator["Node", None, None]:
        """
        Yield all descendant nodes depth-first, in document order.

        The node itself is not yielded. The walk uses an explicit stack
        instead of recursion, so very deep trees do not hit Python's
        recursion limit.

        :return: Generator of :class:`Node` objects.
        """
        # Children are pushed reversed so the first child is popped first.
        stack = list(reversed(self.children))
        while stack:
            node = stack.pop()
            yield node
            stack.extend(reversed(node.children))

    # ----------------------------------------------------------------------
    # Private matching helpers
    # ----------------------------------------------------------------------
    def _has_class(self, class_name: str) -> bool:
        """Return whether *class_name* is one of this node's CSS classes."""
        # A valueless ``class`` attribute is stored as None; treat it as empty.
        return class_name in (self.attrs.get("class") or "").split()

    def _matches_simple_selector(self, part: str) -> bool:
        """Return whether this node matches one ``tag``/``.class``/``#id``."""
        if part.startswith("."):
            return self._has_class(part[1:])
        if part.startswith("#"):
            return self.attrs.get("id") == part[1:]
        return self.tag == part

    def _matches_xpath_step(
        self,
        tag: str,
        attr: Optional[str],
        value: Optional[str],
        has_value: bool,
    ) -> bool:
        """Return whether this node satisfies one parsed XPath step."""
        if self.tag is None:
            # Text nodes are not elements, so not even ``*`` selects them.
            return False
        if tag != "*" and self.tag != tag:
            return False
        if not attr:
            return True
        if attr not in self.attrs:
            return False
        # ``[@attr]`` only tests presence; ``[@attr="v"]`` compares the value.
        return not has_value or self.attrs[attr] == value

    @staticmethod
    def _split_xpath(expr: str) -> List[tuple[bool, str]]:
        """
        Split an expression into ``(descendant_axis, step)`` pairs.

        Slashes inside a ``[...]`` predicate (for example in a quoted URL)
        do not separate steps. A step preceded by exactly one ``/`` uses the
        child axis; any other step uses the descendant axis.
        """
        steps: List[tuple[bool, str]] = []
        token: List[str] = []
        slashes = 0  # number of "/" seen since the previous step ended
        depth = 0  # bracket nesting level
        quote = ""  # active quote character inside a predicate, if any
        for ch in expr:
            if quote:
                if ch == quote:
                    quote = ""
            elif depth and ch in "\"'":
                quote = ch
            elif ch == "[":
                depth += 1
            elif ch == "]":
                depth = max(depth - 1, 0)
            elif ch == "/" and not depth:
                if token:
                    steps.append((slashes != 1, "".join(token)))
                    token = []
                    slashes = 0
                slashes += 1
                continue
            token.append(ch)
        if token:
            steps.append((slashes != 1, "".join(token)))
        return steps

    @staticmethod
    def _parse_xpath_step(step: str) -> tuple[str, Optional[str], Optional[str], bool]:
        """
        Parse ``tag``, ``tag[@attr]`` or ``tag[@attr="value"]``.

        :return: ``(tag, attr, value, has_value)``; ``attr`` is ``None`` when
            the step has no predicate and ``has_value`` is ``False`` for a
            presence-only predicate.
        """
        tag, bracket, rest = step.partition("[")
        tag = tag.strip()
        if not bracket:
            return tag, None, None, False
        predicate = rest.strip()
        if predicate.endswith("]"):
            predicate = predicate[:-1]
        # Split on the first "=" only, so values may themselves contain "=".
        name, equals, raw = predicate.partition("=")
        name = name.strip().lstrip("@").strip()
        if not equals:
            return tag, name, None, False
        raw = raw.strip()
        if len(raw) >= 2 and raw[0] == raw[-1] and raw[0] in "\"'":
            raw = raw[1:-1]
        return tag, name, raw, True

    @staticmethod
    def _format_attr(name: str, value: Optional[str], escape_value: bool) -> str:
        """Render one attribute with a leading space; ``None`` is valueless."""
        if value is None:
            return f" {name}"
        text = str(value)
        if escape_value:
            text = escape(text, quote=True)
        return f' {name}="{text}"'

    # ----------------------------------------------------------------------
    # DOM-like lookup helpers
    # ----------------------------------------------------------------------
    def get_elements_by_tag_name(self, tag: str) -> List["Node"]:
        """
        Return all descendant elements with the given tag name.

        :param tag: Tag name to match (compared exactly).
        :return: List of :class:`Node` objects.
        """
        return [n for n in self.iter() if n.tag == tag]

    def get_elements_by_class_name(self, class_name: str) -> List["Node"]:
        """
        Return all descendant elements that contain the given CSS class.

        Elements whose ``class`` attribute has no value are skipped rather
        than raising.

        :param class_name: Class name to match.
        :return: List of :class:`Node` objects.
        """
        return [n for n in self.iter() if n._has_class(class_name)]

    def get_element_by_id(self, element_id: str) -> Optional["Node"]:
        """
        Return the first descendant element with the given ``id`` attribute.

        :param element_id: ID value to match.
        :return: :class:`Node` or ``None``.
        """
        for n in self.iter():
            if n.attrs.get("id") == element_id:
                return n
        return None

    def get_elements_by_attribute(
        self, attr: str, value: Optional[str] = None
    ) -> List["Node"]:
        """
        Return all descendant elements matching an attribute.

        :param attr: Attribute name.
        :param value: Optional value to match. If ``None``, only the presence
            of the attribute is checked.
        :return: List of :class:`Node` objects.
        """
        if value is None:
            return [n for n in self.iter() if attr in n.attrs]
        return [n for n in self.iter() if n.attrs.get(attr) == value]

    # ----------------------------------------------------------------------
    # CSS selector engine (supports chaining)
    # ----------------------------------------------------------------------
    def query_selector(self, selector: str) -> Optional["Node"]:
        """
        Return the first element matching a CSS-like selector.

        Supports the same syntax as :meth:`query_selector_all`.

        :param selector: CSS selector string.
        :return: :class:`Node` or ``None``.
        """
        results = self.query_selector_all(selector)
        return results[0] if results else None

    def query_selector_all(self, selector: str) -> List["Node"]:
        """
        Return all elements matching a CSS-like selector chain.

        Supports:

        - ``tag``
        - ``.class``
        - ``#id``
        - descendant chaining: ``div .item span``

        Compound selectors such as ``div.item`` are not supported. The first
        part of the chain may match this node itself; every later part only
        matches proper descendants of the previous matches. Each node appears
        at most once in the result.

        :param selector: CSS selector string.
        :return: List of :class:`Node` objects; empty for an empty selector.
        """
        parts = selector.split()
        if not parts:
            return []
        current: List[Node] = [self]
        for index, part in enumerate(parts):
            seen = set()
            matched: List[Node] = []
            for context in current:
                # Only the context node of the whole query may match itself;
                # a descendant combinator never selects its own ancestor.
                if index == 0 and context._matches_simple_selector(part):
                    if id(context) not in seen:
                        seen.add(id(context))
                        matched.append(context)
                for candidate in context.iter():
                    if id(candidate) in seen:
                        continue
                    if candidate._matches_simple_selector(part):
                        seen.add(id(candidate))
                        matched.append(candidate)
            current = matched
        return current

    def xpath(self, expr: str) -> List["Node"]:
        """
        Very small XPath subset:

        - ``//tag`` - any descendant element named ``tag`` (``*`` = any)
        - ``tag/subtag`` - ``subtag`` elements that are direct children
        - ``//tag[@attr="value"]`` - attribute equality
        - ``//tag[@attr]`` - attribute presence

        A leading step written without any slash is searched among all
        descendants, exactly like ``//tag``. Each node appears at most once
        in the result.

        :param expr: XPath-like expression.
        :return: List of :class:`Node` objects; empty for an empty expression.
        """
        steps = self._split_xpath(expr.strip())
        if not steps:
            return []
        current: List[Node] = [self]
        for descendant, step in steps:
            tag, attr, value, has_value = self._parse_xpath_step(step)
            seen = set()
            matched: List[Node] = []
            for context in current:
                candidates = context.iter() if descendant else context.children
                for candidate in candidates:
                    if id(candidate) in seen:
                        continue
                    if candidate._matches_xpath_step(tag, attr, value, has_value):
                        seen.add(id(candidate))
                        matched.append(candidate)
            current = matched
        return current

    @property
    def inner_content(self) -> str:
        """
        Return the concatenated text content of this node and all descendants.

        :return: String containing all text content.
        """
        return self.text + "".join(n.text for n in self.iter())

    def outer_html(self) -> str:
        """
        Reconstruct the HTML for this node and its subtree.

        Text and attribute values are entity-escaped (text inside ``script``
        and ``style`` is emitted verbatim), valueless attributes are written
        bare, and childless void elements get no closing tag.

        :return: HTML string.
        """
        if self.tag is None:
            parent_tag = self.parent.tag if self.parent is not None else None
            if parent_tag in self._RAW_TEXT_TAGS:
                return self.text
            return escape(self.text, quote=False)
        attrs = "".join(
            self._format_attr(k, v, True) for k, v in self.attrs.items()
        )
        if self.tag in self._VOID_TAGS and not self.children:
            return f"<{self.tag}{attrs}>"
        inner = "".join(child.outer_html() for child in self.children)
        return f"<{self.tag}{attrs}>{inner}</{self.tag}>"

    def pretty(self, indent: int = 0) -> str:
        """
        Return a pretty-printed representation of the DOM tree.

        Attribute values are shown as stored (not escaped); text nodes are
        shown as their ``repr``.

        :param indent: Current indentation level.
        :return: Multiline string.
        """
        pad = " " * indent
        if self.tag is None:
            return f"{pad}{self.text!r}"
        # Each attribute carries its own leading space, so an element without
        # attributes renders as "<div>" rather than "<div >".
        attrs = "".join(
            self._format_attr(k, v, False) for k, v in self.attrs.items()
        )
        lines = [f"{pad}<{self.tag}{attrs}>"]
        for child in self.children:
            lines.append(child.pretty(indent + 1))
        return "\n".join(lines)
class TreeBuilder(HTMLParser):
    """
    HTML parser that constructs a simple DOM-like tree of :class:`Node` objects.

    Feed markup with :meth:`feed`; the parsed tree hangs below :attr:`root`,
    a synthetic element whose tag is ``"__root__"``. Whitespace-only text is
    not stored. Implied end tags (e.g. a ``<p>`` closed by the next ``<p>``)
    are not inferred.
    """

    # HTML elements that never have content or a closing tag; they must not
    # become the insertion point for the markup that follows them.
    _VOID_TAGS = frozenset(
        {
            "area", "base", "br", "col", "embed", "hr", "img",
            "input", "link", "meta", "source", "track", "wbr",
        }
    )

    def __init__(self) -> None:
        super().__init__()
        self.root: Node = Node(tag="__root__")
        # Element that currently receives new children.
        self.current: Node = self.root

    def handle_starttag(
        self, tag: str, attrs: List[tuple[str, Optional[str]]]
    ) -> None:
        """
        Append a new element and, unless it is a void element, descend into it.

        :param tag: Tag name as reported by the parser.
        :param attrs: ``(name, value)`` pairs; ``value`` is ``None`` for an
            attribute written without a value.
        """
        node = Node(tag=tag, attrs=attrs, parent=self.current)
        self.current.children.append(node)
        if tag not in self._VOID_TAGS:
            self.current = node

    def handle_endtag(self, tag: str) -> None:
        """
        Close the nearest open element named *tag*.

        Elements left open inside it are closed with it. An end tag with no
        matching open element (a stray ``</span>``, or the end half of
        ``<br/>``) is ignored instead of closing an unrelated element.

        :param tag: Tag name as reported by the parser.
        """
        node: Optional[Node] = self.current
        while node is not None and node is not self.root:
            if node.tag == tag:
                if node.parent is not None:
                    self.current = node.parent
                return
            node = node.parent

    def handle_data(self, data: str) -> None:
        """
        Append *data* as a text node unless it is only whitespace.

        :param data: Text found between tags.
        """
        if data.strip():
            self.current.children.append(Node(text=data, parent=self.current))

View file

@ -0,0 +1,102 @@
import pytest
from byteb4rb1e.utils.http.parser import Node, TreeBuilder
@pytest.fixture
def sample_dom():
    """
    Parse a small document and hand back its ``<div id="root">`` element.

    Markup under test:

    <div id="root" class="container">
    <p class="text">Hello</p>
    <span class="text highlight">World</span>
    <div class="box">
    <span id="inner">Inside</span>
    </div>
    </div>
    """
    markup = """
<div id="root" class="container">
<p class="text">Hello</p>
<span class="text highlight">World</span>
<div class="box">
<span id="inner">Inside</span>
</div>
</div>
"""
    builder = TreeBuilder()
    builder.feed(markup)
    # First child of the synthetic root is the <div id="root"> element.
    top_level = builder.root.children
    return top_level[0]
class TestGetElementsByTagName:
    """Lookups by tag name on the sample tree."""

    def test_find_all_spans(self, sample_dom):
        """Both ``span`` elements are returned, nested one included."""
        found = sample_dom.get_elements_by_tag_name("span")
        assert [node.tag for node in found] == ["span", "span"]

    def test_find_no_matches(self, sample_dom):
        """An unknown tag yields an empty list."""
        found = sample_dom.get_elements_by_tag_name("table")
        assert found == []
class TestGetElementsByClassName:
    """Lookups by CSS class on the sample tree."""

    def test_find_single_class(self, sample_dom):
        """``text`` is carried by two elements."""
        matches = sample_dom.get_elements_by_class_name("text")
        assert len(matches) == 2

    def test_find_multiple_classes(self, sample_dom):
        """A class is found even when the element lists several classes."""
        matches = sample_dom.get_elements_by_class_name("highlight")
        assert [node.tag for node in matches] == ["span"]

    def test_no_such_class(self, sample_dom):
        """An unused class yields an empty list."""
        matches = sample_dom.get_elements_by_class_name("missing")
        assert matches == []
class TestGetElementById:
    """Lookups by ``id`` attribute on the sample tree."""

    def test_find_existing_id(self, sample_dom):
        """The nested ``span`` is found through its id."""
        found = sample_dom.get_element_by_id("inner")
        assert found is not None
        assert (found.tag, found.inner_content) == ("span", "Inside")

    def test_missing_id(self, sample_dom):
        """An unknown id yields ``None``."""
        found = sample_dom.get_element_by_id("nope")
        assert found is None
class TestQuerySelectorAll:
    """CSS-like selector queries on the sample tree."""

    def test_class_selector(self, sample_dom):
        """``.text`` selects two elements."""
        matches = sample_dom.query_selector_all(".text")
        assert len(matches) == 2

    def test_id_selector(self, sample_dom):
        """``#inner`` selects exactly the nested span."""
        matches = sample_dom.query_selector_all("#inner")
        assert [node.inner_content for node in matches] == ["Inside"]

    def test_tag_selector(self, sample_dom):
        """``p`` selects exactly the paragraph."""
        matches = sample_dom.query_selector_all("p")
        assert [node.inner_content for node in matches] == ["Hello"]

    def test_chained_selector(self, sample_dom):
        """A descendant chain narrows the match to the highlighted span."""
        matches = sample_dom.query_selector_all("div .highlight")
        assert [node.inner_content for node in matches] == ["World"]
class TestXPath:
    """XPath-subset queries on the sample tree."""

    def test_simple_tag(self, sample_dom):
        """``//span`` selects both spans."""
        found = sample_dom.xpath("//span")
        assert len(found) == 2

    def test_attribute_match(self, sample_dom):
        """A double-quoted attribute predicate selects the nested span."""
        found = sample_dom.xpath('//span[@id="inner"]')
        assert [node.inner_content for node in found] == ["Inside"]

    def test_nested(self, sample_dom):
        """A single-quoted attribute predicate selects the inner div."""
        found = sample_dom.xpath("//div[@class='box']")
        assert len(found) == 1