feature(http): init parser
This commit is contained in:
parent
cc4b567181
commit
db72017810
3 changed files with 398 additions and 0 deletions
296
src/byteb4rb1e/utils/http/parser.py
Normal file
296
src/byteb4rb1e/utils/http/parser.py
Normal file
|
|
@ -0,0 +1,296 @@
|
|||
from __future__ import annotations

from html import escape
from html.parser import HTMLParser
from typing import Dict, Iterable, List, Optional, Generator, Union
|
||||
|
||||
|
||||
class Node:
    """
    Represents a node in a simple DOM-like tree.

    Element nodes carry a ``tag`` and attributes; text nodes have ``tag`` set
    to ``None`` and keep their content in ``text``.

    :param tag: The HTML tag name (e.g., ``"div"``). ``None`` for text nodes.
    :param attrs: Iterable of ``(key, value)`` attribute pairs. A value may be
        ``None`` for attributes written without a value (``<input disabled>``).
    :param parent: Parent :class:`Node` instance.
    :param text: Text content for text nodes.
    """

    # HTML void elements: they cannot have content and are serialized without
    # an end tag.
    _VOID_ELEMENTS = frozenset(
        {
            "area",
            "base",
            "br",
            "col",
            "embed",
            "hr",
            "img",
            "input",
            "link",
            "meta",
            "source",
            "track",
            "wbr",
        }
    )

    # Elements whose text content is raw in HTML and must not be
    # entity-escaped when serialized.
    _RAW_TEXT_ELEMENTS = frozenset({"script", "style"})

    def __init__(
        self,
        tag: Optional[str] = None,
        attrs: Optional[Iterable[tuple[str, Optional[str]]]] = None,
        parent: Optional["Node"] = None,
        text: str = "",
    ) -> None:
        self.tag: Optional[str] = tag
        self.attrs: Dict[str, Optional[str]] = dict(attrs or [])
        self.parent: Optional["Node"] = parent
        self.children: List["Node"] = []
        self.text: str = text

    def __repr__(self) -> str:
        return f"<Node {self.tag} {self.attrs} children={len(self.children)}>"

    # ----------------------------------------------------------------------
    # Tree traversal
    # ----------------------------------------------------------------------
    def iter(self) -> Generator["Node", None, None]:
        """
        Yield all descendant nodes in pre-order (document order).

        The walk uses an explicit stack instead of recursion so that very
        deeply nested trees do not hit Python's recursion limit.

        :return: Generator of :class:`Node` objects (``self`` is not included).
        """
        stack: List[Node] = list(reversed(self.children))
        while stack:
            node = stack.pop()
            yield node
            # Reversed so the first child is popped (and yielded) first.
            stack.extend(reversed(node.children))

    # ----------------------------------------------------------------------
    # Internal matching helpers
    # ----------------------------------------------------------------------
    def _class_names(self) -> List[str]:
        """
        Return the whitespace-separated class names of this node.

        A missing or valueless ``class`` attribute yields an empty list.
        """
        return (self.attrs.get("class") or "").split()

    def _matches_simple_selector(self, part: str) -> bool:
        """
        Tell whether this node matches one simple selector.

        :param part: ``tag``, ``.class`` or ``#id``.
        """
        if part.startswith("."):
            return part[1:] in self._class_names()
        if part.startswith("#"):
            return self.attrs.get("id") == part[1:]
        return self.tag == part

    # ----------------------------------------------------------------------
    # DOM-like lookup helpers
    # ----------------------------------------------------------------------
    def get_elements_by_tag_name(self, tag: str) -> List["Node"]:
        """
        Return all descendant elements with the given tag name.

        :param tag: Tag name to match.
        :return: List of :class:`Node` objects.
        """
        return [n for n in self.iter() if n.tag == tag]

    def get_elements_by_class_name(self, class_name: str) -> List["Node"]:
        """
        Return all descendant elements that contain the given CSS class.

        :param class_name: Class name to match.
        :return: List of :class:`Node` objects.
        """
        return [n for n in self.iter() if class_name in n._class_names()]

    def get_element_by_id(self, element_id: str) -> Optional["Node"]:
        """
        Return the first descendant element with the given ``id`` attribute.

        :param element_id: ID value to match.
        :return: :class:`Node` or ``None``.
        """
        for n in self.iter():
            if n.attrs.get("id") == element_id:
                return n
        return None

    def get_elements_by_attribute(
        self, attr: str, value: Optional[str] = None
    ) -> List["Node"]:
        """
        Return all descendant elements matching an attribute.

        :param attr: Attribute name.
        :param value: Optional value to match. If ``None``, only the presence
            of the attribute is checked.
        :return: List of :class:`Node` objects.
        """
        if value is None:
            return [n for n in self.iter() if attr in n.attrs]
        return [n for n in self.iter() if n.attrs.get(attr) == value]

    # ----------------------------------------------------------------------
    # CSS selector engine (supports chaining)
    # ----------------------------------------------------------------------
    def query_selector(self, selector: str) -> Optional["Node"]:
        """
        Return the first element matching a CSS-like selector.

        Supports:
        - ``tag``
        - ``.class``
        - ``#id``

        :param selector: CSS selector string.
        :return: :class:`Node` or ``None``.
        """
        results = self.query_selector_all(selector)
        return results[0] if results else None

    def query_selector_all(self, selector: str) -> List["Node"]:
        """
        Return all elements matching a CSS-like selector chain.

        Supports:
        - ``tag``
        - ``.class``
        - ``#id``
        - descendant chaining: ``div .item span``

        The first step may match this node itself (it acts as the scope of
        the query); every later step only matches strict descendants of the
        previous step's matches. Each node appears at most once in the
        result.

        :param selector: Whitespace-separated chain of simple selectors.
        :return: List of :class:`Node` objects; empty for an empty selector.
        """
        parts = selector.split()
        if not parts:
            return []

        current: List[Node] = [self]
        for index, part in enumerate(parts):
            matched: List[Node] = []
            seen = set()
            for context in current:
                candidates: List[Node] = list(context.iter())
                if index == 0:
                    # Only the scope node may satisfy the leading step itself.
                    candidates.insert(0, context)
                for candidate in candidates:
                    # Nested contexts share descendants; keep each node once.
                    if id(candidate) in seen:
                        continue
                    if candidate._matches_simple_selector(part):
                        seen.add(id(candidate))
                        matched.append(candidate)
            current = matched

        return current

    # ----------------------------------------------------------------------
    # XPath subset
    # ----------------------------------------------------------------------
    @staticmethod
    def _parse_xpath(expr: str) -> List[tuple]:
        """
        Split an XPath-like expression into location steps.

        Slashes inside ``[...]`` predicates are not treated as separators.

        :return: List of ``(descendant, tag, attr, value)`` tuples, where
            ``descendant`` is ``True`` for steps introduced by ``//``,
            ``attr`` is ``None`` when the step has no predicate and ``value``
            is ``None`` when only attribute presence is tested.
        """
        steps: List[tuple] = []
        pos, end = 0, len(expr)
        descendant = False

        while pos < end:
            if expr.startswith("//", pos):
                descendant = True
                pos += 2
            elif expr[pos] == "/":
                pos += 1

            # Consume one step, skipping over any '/' inside a predicate.
            start, depth = pos, 0
            while pos < end and (depth > 0 or expr[pos] != "/"):
                if expr[pos] == "[":
                    depth += 1
                elif expr[pos] == "]" and depth > 0:
                    depth -= 1
                pos += 1

            step = expr[start:pos].strip()
            if not step:
                # Empty step (e.g. trailing slash): keep any pending '//'.
                continue

            tag, bracket, predicate = step.partition("[")
            attr: Optional[str] = None
            value: Optional[str] = None
            if bracket:
                predicate = predicate.strip()
                if predicate.endswith("]"):
                    predicate = predicate[:-1]
                # Split on the first '=' only so values may contain '='.
                name, equals, raw = predicate.partition("=")
                attr = name.strip().lstrip("@").strip() or None
                if equals:
                    value = raw.strip()
                    if (
                        len(value) >= 2
                        and value[0] == value[-1]
                        and value[0] in "\"'"
                    ):
                        value = value[1:-1]

            steps.append((descendant, tag.strip(), attr, value))
            descendant = False

        return steps

    def xpath(self, expr: str) -> List["Node"]:
        """
        Very small XPath subset:

        - ``//tag`` -- any descendant element named ``tag``
        - ``tag/subtag`` -- direct children only
        - ``//tag[@attr="value"]`` -- attribute equality
        - ``//tag[@attr]`` -- attribute presence
        - ``*`` as a tag name matches any element (never text nodes)

        Steps are evaluated relative to this node. Each node appears at most
        once in the result.

        :param expr: XPath-like expression.
        :return: List of :class:`Node` objects; empty for an empty expression.
        """
        steps = self._parse_xpath(expr.strip())
        if not steps:
            return []

        current: List[Node] = [self]
        for descendant, tag, attr, value in steps:
            matched: List[Node] = []
            seen = set()
            for context in current:
                candidates = context.iter() if descendant else context.children
                for candidate in candidates:
                    if candidate.tag is None:
                        continue  # text nodes are never selected
                    if tag != "*" and candidate.tag != tag:
                        continue
                    if attr is not None:
                        if value is None:
                            if attr not in candidate.attrs:
                                continue
                        elif candidate.attrs.get(attr) != value:
                            continue
                    if id(candidate) in seen:
                        continue
                    seen.add(id(candidate))
                    matched.append(candidate)
            current = matched

        return current

    # ----------------------------------------------------------------------
    # Serialization
    # ----------------------------------------------------------------------
    @property
    def inner_content(self) -> str:
        """
        Return the concatenated text content of this node and all descendants.

        :return: String containing all text content.
        """
        parts: List[str] = [self.text]
        parts.extend(node.text for node in self.iter())
        return "".join(parts)

    def outer_html(self) -> str:
        """
        Reconstruct the HTML for this node and its subtree.

        Text and attribute values are entity-escaped, except for text inside
        ``<script>``/``<style>``. Attributes without a value are written as a
        bare name, and childless void elements get no end tag.

        :return: HTML string.
        """
        if self.tag is None:
            parent_tag = self.parent.tag if self.parent is not None else None
            if parent_tag in self._RAW_TEXT_ELEMENTS:
                return self.text
            return escape(self.text, quote=False)

        attrs = "".join(
            f" {k}" if v is None else f' {k}="{escape(v, quote=True)}"'
            for k, v in self.attrs.items()
        )
        if self.tag in self._VOID_ELEMENTS and not self.children:
            return f"<{self.tag}{attrs}>"

        inner = "".join(child.outer_html() for child in self.children)
        return f"<{self.tag}{attrs}>{inner}</{self.tag}>"

    def pretty(self, indent: int = 0) -> str:
        """
        Return a pretty-printed representation of the DOM tree.

        :param indent: Current indentation level.
        :return: Multiline string.
        """
        pad = " " * indent
        if self.tag is None:
            return f"{pad}{self.text!r}"

        attrs = " ".join(
            k if v is None else f'{k}="{v}"' for k, v in self.attrs.items()
        )
        # Only add the separating space when there are attributes to print.
        header = f"{pad}<{self.tag} {attrs}>" if attrs else f"{pad}<{self.tag}>"

        lines = [header]
        for child in self.children:
            lines.append(child.pretty(indent + 1))
        return "\n".join(lines)
|
||||
|
||||
|
||||
class TreeBuilder(HTMLParser):
    """
    HTML parser that constructs a simple DOM-like tree of :class:`Node` objects.

    After :meth:`feed`, the parsed document hangs below ``root``, a synthetic
    element tagged ``"__root__"``. ``current`` always points at the innermost
    element that is still open.
    """

    # HTML void elements never have an end tag, so they must not become the
    # parent of whatever follows them.
    _VOID_ELEMENTS = frozenset(
        {
            "area",
            "base",
            "br",
            "col",
            "embed",
            "hr",
            "img",
            "input",
            "link",
            "meta",
            "source",
            "track",
            "wbr",
        }
    )

    def __init__(self) -> None:
        super().__init__()
        self.root: Node = Node(tag="__root__")
        self.current: Node = self.root

    def _append_element(
        self, tag: str, attrs: List[tuple[str, Optional[str]]]
    ) -> Node:
        """Create an element under the currently open element and return it."""
        node = Node(tag=tag, attrs=attrs, parent=self.current)
        self.current.children.append(node)
        return node

    def handle_starttag(
        self, tag: str, attrs: List[tuple[str, Optional[str]]]
    ) -> None:
        """
        Add a new element and, unless it is a void element, descend into it.

        :param tag: Tag name as reported by the parser.
        :param attrs: ``(name, value)`` pairs; ``value`` is ``None`` for
            attributes written without a value.
        """
        node = self._append_element(tag, attrs)
        if tag not in self._VOID_ELEMENTS:
            self.current = node

    def handle_startendtag(
        self, tag: str, attrs: List[tuple[str, Optional[str]]]
    ) -> None:
        """Add a self-closing element (``<tag/>``) as a leaf."""
        self._append_element(tag, attrs)

    def handle_endtag(self, tag: str) -> None:
        """
        Close the nearest open element named ``tag``.

        Elements left open inside it are closed implicitly. An end tag that
        matches no open element (stray ``</x>``, or ``</br>`` for a void
        element) is ignored rather than closing an unrelated element.
        """
        node: Optional[Node] = self.current
        while node is not None and node is not self.root and node.tag != tag:
            node = node.parent
        if node is None or node is self.root or node.parent is None:
            return
        self.current = node.parent

    def handle_data(self, data: str) -> None:
        """Attach non-blank text as a text node of the current element."""
        if data.strip():
            self.current.children.append(Node(text=data, parent=self.current))
|
||||
0
tests/unit/byteb4rb1e/utils/http/parser/__init__.py
Normal file
0
tests/unit/byteb4rb1e/utils/http/parser/__init__.py
Normal file
102
tests/unit/byteb4rb1e/utils/http/parser/test_node.py
Normal file
102
tests/unit/byteb4rb1e/utils/http/parser/test_node.py
Normal file
|
|
@ -0,0 +1,102 @@
|
|||
import pytest
|
||||
|
||||
from byteb4rb1e.utils.http.parser import Node, TreeBuilder
|
||||
|
||||
|
||||
@pytest.fixture
def sample_dom():
    """
    Parse a small document and return its outermost ``<div id="root">``.

    Structure of the returned tree::

        div#root.container
            p.text                  "Hello"
            span.text.highlight     "World"
            div.box
                span#inner          "Inside"
    """
    markup = """
    <div id="root" class="container">
        <p class="text">Hello</p>
        <span class="text highlight">World</span>
        <div class="box">
            <span id="inner">Inside</span>
        </div>
    </div>
    """
    builder = TreeBuilder()
    builder.feed(markup)
    # The builder's synthetic root wraps the document; its first child is
    # the <div id="root"> element.
    document = builder.root
    return document.children[0]
|
||||
|
||||
|
||||
class TestGetElementsByTagName:
    """Lookups by tag name via ``Node.get_elements_by_tag_name``."""

    def test_find_all_spans(self, sample_dom):
        found = sample_dom.get_elements_by_tag_name("span")
        # Exactly two matches, both of them <span> elements.
        assert [node.tag for node in found] == ["span", "span"]

    def test_find_no_matches(self, sample_dom):
        found = sample_dom.get_elements_by_tag_name("table")
        assert found == []
|
||||
|
||||
|
||||
class TestGetElementsByClassName:
    """Lookups by CSS class via ``Node.get_elements_by_class_name``."""

    def test_find_single_class(self, sample_dom):
        matches = sample_dom.get_elements_by_class_name("text")
        assert len(matches) == 2

    def test_find_multiple_classes(self, sample_dom):
        # "highlight" is one of several classes on a single <span>.
        matches = sample_dom.get_elements_by_class_name("highlight")
        assert [node.tag for node in matches] == ["span"]

    def test_no_such_class(self, sample_dom):
        matches = sample_dom.get_elements_by_class_name("missing")
        assert matches == []
|
||||
|
||||
|
||||
class TestGetElementById:
    """Lookups by ``id`` attribute via ``Node.get_element_by_id``."""

    def test_find_existing_id(self, sample_dom):
        found = sample_dom.get_element_by_id("inner")
        assert found is not None
        assert (found.tag, found.inner_content) == ("span", "Inside")

    def test_missing_id(self, sample_dom):
        found = sample_dom.get_element_by_id("nope")
        assert found is None
|
||||
|
||||
|
||||
class TestQuerySelectorAll:
    """CSS-like selector queries via ``Node.query_selector_all``."""

    def test_class_selector(self, sample_dom):
        matched = sample_dom.query_selector_all(".text")
        assert len(matched) == 2

    def test_id_selector(self, sample_dom):
        matched = sample_dom.query_selector_all("#inner")
        assert [node.inner_content for node in matched] == ["Inside"]

    def test_tag_selector(self, sample_dom):
        matched = sample_dom.query_selector_all("p")
        assert [node.inner_content for node in matched] == ["Hello"]

    def test_chained_selector(self, sample_dom):
        # Descendant chain: a ".highlight" element somewhere below a <div>.
        matched = sample_dom.query_selector_all("div .highlight")
        assert [node.inner_content for node in matched] == ["World"]
|
||||
|
||||
|
||||
class TestXPath:
    """XPath-subset queries via ``Node.xpath``."""

    def test_simple_tag(self, sample_dom):
        found = sample_dom.xpath("//span")
        assert len(found) == 2

    def test_attribute_match(self, sample_dom):
        # Double-quoted attribute value in the predicate.
        found = sample_dom.xpath('//span[@id="inner"]')
        assert [node.inner_content for node in found] == ["Inside"]

    def test_nested(self, sample_dom):
        # Single-quoted attribute value, matching the nested <div>.
        found = sample_dom.xpath("//div[@class='box']")
        assert len(found) == 1
|
||||
Loading…
Add table
Add a link
Reference in a new issue