feature(http): init parser
This commit is contained in:
parent
cc4b567181
commit
db72017810
3 changed files with 398 additions and 0 deletions
296
src/byteb4rb1e/utils/http/parser.py
Normal file
296
src/byteb4rb1e/utils/http/parser.py
Normal file
|
|
@ -0,0 +1,296 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
from typing import Dict, Iterable, List, Optional, Generator, Union
|
||||||
|
|
||||||
|
|
||||||
|
class Node:
    """
    Represents a node in a simple DOM-like tree.

    Element nodes carry a ``tag`` and ``attrs``; text nodes have ``tag`` set
    to ``None`` and keep their content in ``text``. Passing ``parent`` only
    records the back-reference: the caller is responsible for appending the
    new node to ``parent.children``.

    :param tag: The HTML tag name (e.g., ``"div"``). ``None`` for text nodes.
    :param attrs: Iterable of ``(key, value)`` attribute pairs. A value of
        ``None`` denotes a valueless attribute such as ``disabled``.
    :param parent: Parent :class:`Node` instance.
    :param text: Text content for text nodes.
    """

    # Elements that have neither content nor an end tag in HTML.
    _VOID_ELEMENTS = frozenset(
        {
            "area", "base", "br", "col", "embed", "hr", "img", "input",
            "link", "meta", "param", "source", "track", "wbr",
        }
    )

    # Elements whose text content is raw and must not be entity-encoded.
    _RAW_TEXT_ELEMENTS = frozenset({"script", "style"})

    def __init__(
        self,
        tag: Optional[str] = None,
        attrs: Optional[Iterable[tuple[str, Optional[str]]]] = None,
        parent: Optional["Node"] = None,
        text: str = "",
    ) -> None:
        self.tag: Optional[str] = tag
        self.attrs: Dict[str, Optional[str]] = dict(attrs or [])
        self.parent: Optional["Node"] = parent
        self.children: List["Node"] = []
        self.text: str = text

    def __repr__(self) -> str:
        return f"<Node {self.tag} {self.attrs} children={len(self.children)}>"

    # ----------------------------------------------------------------------
    # Tree traversal
    # ----------------------------------------------------------------------
    def iter(self) -> Generator["Node", None, None]:
        """
        Yield all descendant nodes in document (pre-order) order.

        The walk uses an explicit stack instead of recursion so that very
        deep trees cannot exhaust the interpreter's recursion limit.

        :return: Generator of :class:`Node` objects (``self`` is excluded).
        """
        pending = list(reversed(self.children))
        while pending:
            node = pending.pop()
            yield node
            # Reversed so the left-most child is popped (visited) first.
            pending.extend(reversed(node.children))

    # ----------------------------------------------------------------------
    # Internal matching helpers
    # ----------------------------------------------------------------------
    def _has_class(self, class_name: str) -> bool:
        """Return ``True`` if ``class_name`` is one of this node's classes."""
        # ``class`` may be absent or valueless (``None``); both mean "none".
        return class_name in (self.attrs.get("class") or "").split()

    def _matches_simple_selector(self, part: str) -> bool:
        """Test this node against one ``tag`` / ``.class`` / ``#id`` token."""
        if part.startswith("."):
            return self._has_class(part[1:])
        if part.startswith("#"):
            return self.attrs.get("id") == part[1:]
        return self.tag == part

    def _matches_xpath_step(
        self, tag: str, attr: Optional[str], value: Optional[str]
    ) -> bool:
        """Test this node against one parsed XPath step."""
        if tag != "*" and self.tag != tag:
            return False
        if not attr:
            return True
        if value is None:
            # ``[@attr]``: only the presence of the attribute is required.
            return attr in self.attrs
        return self.attrs.get(attr) == value

    @staticmethod
    def _select(contexts, predicate, include_self: bool = False) -> List["Node"]:
        """
        Collect the unique nodes below ``contexts`` accepted by ``predicate``.

        :param contexts: Iterable of :class:`Node` objects to search under.
        :param predicate: Callable taking a :class:`Node`, returning ``bool``.
        :param include_self: Also test each context node itself.
        :return: Matches in discovery order, each node at most once.
        """
        seen = set()
        selected: List[Node] = []
        for context in contexts:
            pool = [context] if include_self else []
            pool.extend(context.iter())
            for candidate in pool:
                key = id(candidate)
                # A node reachable from several contexts is reported once.
                if key not in seen and predicate(candidate):
                    seen.add(key)
                    selected.append(candidate)
        return selected

    @staticmethod
    def _split_xpath_steps(expr: str) -> List[str]:
        """Split ``expr`` on ``/`` while ignoring slashes inside ``[...]``."""
        steps: List[str] = []
        buf: List[str] = []
        depth = 0
        quote: Optional[str] = None
        for ch in expr:
            if quote is not None:
                if ch == quote:
                    quote = None
            elif ch in ("'", '"'):
                quote = ch
            elif ch == "[":
                depth += 1
            elif ch == "]":
                depth = max(depth - 1, 0)
            elif ch == "/" and depth == 0:
                steps.append("".join(buf))
                buf = []
                continue
            buf.append(ch)
        steps.append("".join(buf))
        # Leading "/" or "//" produce empty segments; they carry no test.
        return [step for step in steps if step.strip()]

    @staticmethod
    def _parse_xpath_step(step: str):
        """
        Parse ``tag``, ``tag[@attr]`` or ``tag[@attr="value"]``.

        :return: Tuple ``(tag, attr, value)``; ``attr`` and ``value`` are
            ``None`` when the corresponding part is absent.
        """
        tag, bracket, predicate = step.strip().partition("[")
        tag = tag.strip()
        if not bracket:
            return tag, None, None

        predicate = predicate.strip()
        if predicate.endswith("]"):
            predicate = predicate[:-1]

        # Split on the first "=" only so values may themselves contain "=".
        name, equals, raw_value = predicate.partition("=")
        attr = name.strip().lstrip("@")
        if not equals:
            return tag, attr, None

        value = raw_value.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
            value = value[1:-1]
        return tag, attr, value

    # ----------------------------------------------------------------------
    # DOM-like lookup helpers
    # ----------------------------------------------------------------------
    def get_elements_by_tag_name(self, tag: str) -> List["Node"]:
        """
        Return all descendant elements with the given tag name.

        :param tag: Tag name to match.
        :return: List of :class:`Node` objects.
        """
        return [n for n in self.iter() if n.tag == tag]

    def get_elements_by_class_name(self, class_name: str) -> List["Node"]:
        """
        Return all descendant elements that contain the given CSS class.

        Elements whose ``class`` attribute is missing or valueless never
        match.

        :param class_name: Class name to match.
        :return: List of :class:`Node` objects.
        """
        return [n for n in self.iter() if n._has_class(class_name)]

    def get_element_by_id(self, element_id: str) -> Optional["Node"]:
        """
        Return the first descendant element with the given ``id`` attribute.

        :param element_id: ID value to match.
        :return: :class:`Node` or ``None``.
        """
        for n in self.iter():
            if n.attrs.get("id") == element_id:
                return n
        return None

    def get_elements_by_attribute(
        self, attr: str, value: Optional[str] = None
    ) -> List["Node"]:
        """
        Return all descendant elements matching an attribute.

        :param attr: Attribute name.
        :param value: Optional value to match. If ``None``, only the presence
            of the attribute is checked.
        :return: List of :class:`Node` objects.
        """
        if value is None:
            return [n for n in self.iter() if attr in n.attrs]
        return [n for n in self.iter() if n.attrs.get(attr) == value]

    # ----------------------------------------------------------------------
    # CSS selector engine (supports chaining)
    # ----------------------------------------------------------------------
    def query_selector(self, selector: str) -> Optional["Node"]:
        """
        Return the first element matching a CSS-like selector.

        Supports the same syntax as :meth:`query_selector_all`.

        :param selector: CSS selector string.
        :return: :class:`Node` or ``None``.
        """
        results = self.query_selector_all(selector)
        return results[0] if results else None

    def query_selector_all(self, selector: str) -> List["Node"]:
        """
        Return all elements matching a CSS-like selector chain.

        Supports:
        - ``tag``
        - ``.class``
        - ``#id``
        - descendant chaining: ``div .item span``

        The first token is tested against this node as well as its
        descendants, so this node may act as the ancestor of a chain. Every
        later token is tested against strict descendants of the previous
        matches. Each node appears at most once in the result.

        :param selector: Whitespace-separated selector tokens.
        :return: List of :class:`Node` objects; empty for a blank selector.
        """
        parts = selector.split()
        if not parts:
            return []

        current: List[Node] = [self]
        for position, part in enumerate(parts):
            current = self._select(
                current,
                lambda n, part=part: n._matches_simple_selector(part),
                include_self=(position == 0),
            )
            if not current:
                break
        return current

    def xpath(self, expr: str) -> List["Node"]:
        """
        Very small XPath subset:

        - ``//tag``
        - ``tag/subtag``
        - ``//tag[@attr="value"]``
        - ``//tag[@attr]`` (attribute presence)
        - ``*`` as the tag name matches any node

        Both ``/`` and ``//`` select among *all* descendants of the previous
        step's matches. Slashes and equals signs inside a quoted attribute
        value are part of the value. Each node appears at most once in the
        result.

        :param expr: XPath-like expression.
        :return: List of :class:`Node` objects; empty for a blank expression.
        """
        steps = self._split_xpath_steps(expr.strip())
        if not steps:
            return []

        current: List[Node] = [self]
        for step in steps:
            tag, attr, value = self._parse_xpath_step(step)
            current = self._select(
                current,
                lambda n, t=tag, a=attr, v=value: n._matches_xpath_step(t, a, v),
            )
            if not current:
                break
        return current

    @property
    def inner_content(self) -> str:
        """
        Return the concatenated text content of this node and all descendants.

        :return: String containing all text content in document order.
        """
        parts: List[str] = []
        if self.text:
            parts.append(self.text)
        parts.extend(n.text for n in self.iter() if n.text)
        return "".join(parts)

    def outer_html(self) -> str:
        """
        Reconstruct the HTML for this node and its subtree.

        Text and attribute values are entity-escaped so the output can be
        parsed again; text directly inside ``script``/``style`` is emitted
        verbatim. Valueless attributes are written as a bare name and
        childless void elements (``br``, ``img``, ...) get no end tag.

        :return: HTML string.
        """
        from html import escape

        if self.tag is None:
            raw_parent = (
                self.parent is not None
                and self.parent.tag in self._RAW_TEXT_ELEMENTS
            )
            return self.text if raw_parent else escape(self.text, quote=False)

        attrs = "".join(
            f" {k}" if v is None else f' {k}="{escape(v)}"'
            for k, v in self.attrs.items()
        )
        if self.tag in self._VOID_ELEMENTS and not self.children:
            return f"<{self.tag}{attrs}>"

        inner = "".join(child.outer_html() for child in self.children)
        return f"<{self.tag}{attrs}>{inner}</{self.tag}>"

    def pretty(self, indent: int = 0) -> str:
        """
        Return a pretty-printed representation of the DOM tree.

        Each nesting level is indented by one space. Text nodes are shown as
        their ``repr``; elements as an opening tag only.

        :param indent: Current indentation level.
        :return: Multiline string.
        """
        pad = " " * indent
        if self.tag is None:
            return f"{pad}{self.text!r}"

        # Each attribute brings its own leading space, so an element without
        # attributes renders as "<div>" rather than "<div >".
        attrs = "".join(
            f" {k}" if v is None else f' {k}="{v}"'
            for k, v in self.attrs.items()
        )
        lines = [f"{pad}<{self.tag}{attrs}>"]
        for child in self.children:
            lines.append(child.pretty(indent + 1))
        return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
class TreeBuilder(HTMLParser):
    """
    HTML parser that constructs a simple DOM-like tree of :class:`Node` objects.

    Feed markup with :meth:`feed`; the resulting tree hangs below
    :attr:`root`, a synthetic element whose tag is ``"__root__"``.
    Whitespace-only text is discarded.
    """

    # Elements that never receive an end tag. Descending into them would nest
    # all following content inside e.g. a ``<br>`` or ``<meta>``.
    _VOID_ELEMENTS = frozenset(
        {
            "area", "base", "br", "col", "embed", "hr", "img", "input",
            "link", "meta", "param", "source", "track", "wbr",
        }
    )

    def __init__(self) -> None:
        super().__init__()
        self.root: Node = Node(tag="__root__")
        # Element that currently receives new children.
        self.current: Node = self.root

    def handle_starttag(
        self, tag: str, attrs: List[tuple[str, Optional[str]]]
    ) -> None:
        """
        Append a new element and, unless it is a void element, descend into it.

        :param tag: Lower-cased tag name supplied by :class:`HTMLParser`.
        :param attrs: ``(name, value)`` pairs; ``value`` is ``None`` for
            attributes written without a value.
        """
        node = Node(tag=tag, attrs=attrs, parent=self.current)
        self.current.children.append(node)
        if tag not in self._VOID_ELEMENTS:
            self.current = node

    def handle_endtag(self, tag: str) -> None:
        """
        Close the nearest open element named ``tag``.

        Elements left open between the current position and that element are
        closed implicitly. An end tag with no matching open element (for
        example ``</br>`` or a stray ``</p>``) is ignored, so it cannot pop
        an unrelated element.

        :param tag: Lower-cased tag name supplied by :class:`HTMLParser`.
        """
        node: Optional[Node] = self.current
        while node is not None and node is not self.root:
            if node.tag == tag:
                self.current = node.parent if node.parent is not None else self.root
                return
            node = node.parent

    def handle_data(self, data: str) -> None:
        """
        Append ``data`` as a text node unless it is whitespace only.

        :param data: Text run supplied by :class:`HTMLParser`.
        """
        if data.strip():
            self.current.children.append(Node(text=data, parent=self.current))
|
||||||
0
tests/unit/byteb4rb1e/utils/http/parser/__init__.py
Normal file
0
tests/unit/byteb4rb1e/utils/http/parser/__init__.py
Normal file
102
tests/unit/byteb4rb1e/utils/http/parser/test_node.py
Normal file
102
tests/unit/byteb4rb1e/utils/http/parser/test_node.py
Normal file
|
|
@ -0,0 +1,102 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from byteb4rb1e.utils.http.parser import Node, TreeBuilder
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_dom():
    """
    Provide the outer ``<div id="root">`` of a small parsed document.

    Tree layout::

        div#root.container
            p.text                 "Hello"
            span.text.highlight    "World"
            div.box
                span#inner         "Inside"
    """
    markup = """
    <div id="root" class="container">
        <p class="text">Hello</p>
        <span class="text highlight">World</span>
        <div class="box">
            <span id="inner">Inside</span>
        </div>
    </div>
    """
    builder = TreeBuilder()
    builder.feed(markup)
    # The synthetic root's first child is the <div id="root"> element.
    top_level = builder.root.children
    return top_level[0]
|
||||||
|
|
||||||
|
|
||||||
|
class TestGetElementsByTagName:
    """Tag-name lookup over the sample document."""

    def test_find_all_spans(self, sample_dom):
        # Both <span> elements are found, however deeply nested.
        found = sample_dom.get_elements_by_tag_name("span")
        assert len(found) == 2
        first, second = found
        assert first.tag == "span"
        assert second.tag == "span"

    def test_find_no_matches(self, sample_dom):
        # A tag that does not occur yields an empty list.
        result = sample_dom.get_elements_by_tag_name("table")
        assert result == []
|
||||||
|
|
||||||
|
|
||||||
|
class TestGetElementsByClassName:
    """Class-name lookup over the sample document."""

    def test_find_single_class(self, sample_dom):
        # "text" is carried by the <p> and by the first <span>.
        matches = sample_dom.get_elements_by_class_name("text")
        assert len(matches) == 2

    def test_find_multiple_classes(self, sample_dom):
        # An element with several classes is found through any one of them.
        matches = sample_dom.get_elements_by_class_name("highlight")
        assert len(matches) == 1
        only = matches[0]
        assert only.tag == "span"

    def test_no_such_class(self, sample_dom):
        # An unknown class yields an empty list.
        result = sample_dom.get_elements_by_class_name("missing")
        assert result == []
|
||||||
|
|
||||||
|
|
||||||
|
class TestGetElementById:
    """ID lookup over the sample document."""

    def test_find_existing_id(self, sample_dom):
        # The nested <span id="inner"> is reachable from the outer div.
        hit = sample_dom.get_element_by_id("inner")
        assert hit is not None
        assert hit.tag == "span"
        assert hit.inner_content == "Inside"

    def test_missing_id(self, sample_dom):
        # An unknown id yields None.
        hit = sample_dom.get_element_by_id("nope")
        assert hit is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestQuerySelectorAll:
    """CSS-like selector queries over the sample document."""

    def test_class_selector(self, sample_dom):
        # ".text" selects the <p> and the first <span>.
        matches = sample_dom.query_selector_all(".text")
        assert len(matches) == 2

    def test_id_selector(self, sample_dom):
        # "#inner" selects exactly the nested span.
        matches = sample_dom.query_selector_all("#inner")
        assert len(matches) == 1
        only = matches[0]
        assert only.inner_content == "Inside"

    def test_tag_selector(self, sample_dom):
        # A bare tag name selects elements of that tag.
        matches = sample_dom.query_selector_all("p")
        assert len(matches) == 1
        only = matches[0]
        assert only.inner_content == "Hello"

    def test_chained_selector(self, sample_dom):
        # Descendant chain: a ".highlight" element below a <div>.
        matches = sample_dom.query_selector_all("div .highlight")
        assert len(matches) == 1
        only = matches[0]
        assert only.inner_content == "World"
|
||||||
|
|
||||||
|
|
||||||
|
class TestXPath:
    """XPath-subset queries over the sample document."""

    def test_simple_tag(self, sample_dom):
        # "//span" finds both span elements.
        found = sample_dom.xpath("//span")
        assert len(found) == 2

    def test_attribute_match(self, sample_dom):
        # Double-quoted attribute predicate.
        found = sample_dom.xpath('//span[@id="inner"]')
        assert len(found) == 1
        only = found[0]
        assert only.inner_content == "Inside"

    def test_nested(self, sample_dom):
        # Single-quoted attribute predicate selecting the nested <div>.
        found = sample_dom.xpath("//div[@class='box']")
        assert len(found) == 1
|
||||||
Loading…
Add table
Add a link
Reference in a new issue