Compare commits
2 commits
develop
...
feature/17
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a4e215c69c | ||
|
|
db72017810 |
3 changed files with 498 additions and 0 deletions
364
src/byteb4rb1e/utils/http/parser.py
Normal file
364
src/byteb4rb1e/utils/http/parser.py
Normal file
|
|
@ -0,0 +1,364 @@
|
|||
from __future__ import annotations
|
||||
from html.parser import HTMLParser
|
||||
import re
|
||||
from typing import Dict, Iterable, List, Optional, Generator, Union
|
||||
|
||||
|
||||
class Node:
    """
    Represents a node in a simple DOM-like tree.

    Element nodes carry a ``tag`` and ``attrs``; text nodes have ``tag`` set
    to ``None`` and keep their content in ``text``.

    :param tag: The HTML tag name (e.g., ``"div"``). ``None`` for text nodes.
    :param attrs: Iterable of ``(key, value)`` attribute pairs. A value of
        ``None`` denotes a valueless attribute (e.g. ``disabled``).
    :param parent: Parent :class:`Node` instance. The constructor only stores
        the reference; it does not append the new node to ``parent.children``.
    :param text: Text content for text nodes.

    .. todo::

        Mutation APIs (append_child, remove, replace_with)
    """

    # HTML void elements: they have no content and no end tag.
    _VOID_TAGS = frozenset(
        {
            "area", "base", "br", "col", "embed", "hr", "img", "input",
            "link", "meta", "param", "source", "track", "wbr",
        }
    )
    # Elements whose text content is written verbatim (no entity encoding).
    _RAW_TEXT_TAGS = frozenset({"script", "style"})

    # A selector token is either ">" or a run of non-space characters in
    # which a complete "[...]" group counts as one unit, so attribute values
    # may contain spaces or ">".
    _SELECTOR_TOKEN_RE = re.compile(r"(?:\[[^\]]*\]|[^\s>])+|>")
    # [attr] or [attr=value]; the value may be quoted with ' or ".
    _SELECTOR_ATTR_RE = re.compile(
        r"\[([a-zA-Z0-9_-]+)(?:=['\"]?([^'\"\]]*)['\"]?)?\]"
    )
    # One XPath step: optional "/" or "//", a node test, then predicates.
    _XPATH_STEP_RE = re.compile(r"(//?)?([^/\[\]]+)((?:\[[^\]]*\])*)")
    _XPATH_ATTR_NAME_RE = re.compile(r"[A-Za-z_:][A-Za-z0-9_:.-]*")

    def __init__(
        self,
        tag: Optional[str] = None,
        attrs: Optional[Iterable[tuple[str, Optional[str]]]] = None,
        parent: Optional["Node"] = None,
        text: str = "",
    ) -> None:
        self.tag: Optional[str] = tag
        self.attrs: Dict[str, Optional[str]] = dict(attrs or [])
        self.parent: Optional["Node"] = parent
        self.children: List["Node"] = []
        self.text: str = text

    def __repr__(self) -> str:
        return f"<Node {self.tag} {self.attrs} children={len(self.children)}>"

    # ----------------------------------------------------------------------
    # Tree traversal
    # ----------------------------------------------------------------------
    def iter(self) -> Generator["Node", None, None]:
        """
        Yield all descendant nodes in document (pre-order) order.

        The node itself is not yielded. The walk uses an explicit stack
        instead of recursion so that very deep trees do not exhaust the
        interpreter's recursion limit.

        :return: Generator of :class:`Node` objects.
        """
        # Children are pushed reversed so the first child is popped first.
        stack = list(reversed(self.children))
        while stack:
            node = stack.pop()
            yield node
            stack.extend(reversed(node.children))

    # ----------------------------------------------------------------------
    # Internal matching helpers
    # ----------------------------------------------------------------------
    def _class_list(self) -> List[str]:
        """Return the whitespace-separated class names of this node."""
        # ``class`` may be absent or valueless (``None``); both mean "none".
        return (self.attrs.get("class") or "").split()

    def _attr_matches(self, name: str, value: Optional[str]) -> bool:
        """Test attribute presence (``value is None``) or exact equality."""
        if value is None:
            return name in self.attrs
        return self.attrs.get(name) == value

    @classmethod
    def _parse_simple_selector(cls, token: str):
        """Split one compound selector into ``(tag, id, classes, attrs)``."""
        attr_tests = [
            (m.group(1), m.group(2))
            for m in cls._SELECTOR_ATTR_RE.finditer(token)
        ]
        # Bracket groups that were not understood are dropped (ignored).
        rest = re.sub(r"\[[^\]]+\]", "", token)

        tag = None
        if rest.startswith("*"):
            # Universal selector: any element, so no tag constraint.
            rest = rest[1:]
        else:
            m = re.match(r"[a-zA-Z0-9_-]+", rest)
            if m:
                tag = m.group(0)
                rest = rest[m.end():]

        id_ = None
        m = re.search(r"#([a-zA-Z0-9_-]+)", rest)
        if m:
            id_ = m.group(1)
            rest = rest.replace("#" + id_, "")

        classes = [c for c in rest.split(".") if c]
        return tag, id_, classes, attr_tests

    def _matches_simple(self, parsed) -> bool:
        """Check this node against a parsed compound selector."""
        tag, id_, classes, attr_tests = parsed
        if self.tag is None:
            # Text nodes are never selected.
            return False
        if tag and self.tag != tag:
            return False
        if id_ and self.attrs.get("id") != id_:
            return False
        if classes:
            own = self._class_list()
            if any(cls_name not in own for cls_name in classes):
                return False
        return all(self._attr_matches(k, v) for k, v in attr_tests)

    @classmethod
    def _parse_xpath_predicate(cls, body: str):
        """Parse ``@attr`` or ``@attr="value"`` into ``(name, value)``."""
        name, has_value, raw = body.partition("=")
        name = name.strip().lstrip("@")
        if not cls._XPATH_ATTR_NAME_RE.fullmatch(name):
            raise ValueError(f"unsupported XPath predicate: [{body}]")
        if not has_value:
            return name, None
        return name, raw.strip().strip('"').strip("'")

    # ----------------------------------------------------------------------
    # DOM-like lookup helpers
    # ----------------------------------------------------------------------
    def get_elements_by_tag_name(self, tag: str) -> List["Node"]:
        """
        Return all descendant elements with the given tag name.

        :param tag: Tag name to match (compared exactly).
        :return: List of :class:`Node` objects.
        """
        return [n for n in self.iter() if n.tag == tag]

    def get_elements_by_class_name(self, class_name: str) -> List["Node"]:
        """
        Return all descendant elements that contain the given CSS class.

        :param class_name: Class name to match.
        :return: List of :class:`Node` objects.
        """
        return [n for n in self.iter() if class_name in n._class_list()]

    def get_element_by_id(self, element_id: str) -> Optional["Node"]:
        """
        Return the first descendant element with the given ``id`` attribute.

        :param element_id: ID value to match.
        :return: :class:`Node` or ``None``.
        """
        for n in self.iter():
            if n.attrs.get("id") == element_id:
                return n
        return None

    def get_elements_by_attribute(
        self, attr: str, value: Optional[str] = None
    ) -> List["Node"]:
        """
        Return all descendant elements matching an attribute.

        :param attr: Attribute name.
        :param value: Optional value to match. If ``None``, only the presence
            of the attribute is checked.
        :return: List of :class:`Node` objects.
        """
        return [n for n in self.iter() if n._attr_matches(attr, value)]

    # ----------------------------------------------------------------------
    # CSS selector engine (supports chaining)
    # ----------------------------------------------------------------------
    def query_selector(self, selector: str) -> Optional["Node"]:
        """
        Return the first element matching a CSS-like selector.

        Supports the same syntax as :meth:`query_selector_all`.

        :param selector: CSS selector string.
        :return: :class:`Node` or ``None``.
        :raises ValueError: If the selector ends with a dangling ``>``.
        """
        results = self.query_selector_all(selector)
        return results[0] if results else None

    def query_selector_all(self, selector: str) -> List["Node"]:
        """
        Return all elements matching a CSS-like selector.

        Supports compound selectors built from ``tag``, ``*``, ``#id``,
        ``.class``, ``[attr]`` and ``[attr=value]``, combined with the
        descendant (whitespace) and direct-child (``>``) combinators.
        Other attribute operators (``~=``, ``^=``, ...) are not understood
        and are ignored.

        Matching is scoped to this node's subtree: the first token is
        matched against descendants only. For later descendant tokens a
        node already in the working set is kept if it matches the token
        itself. An empty selector yields ``[self]``.

        :param selector: CSS selector string.
        :return: List of :class:`Node` objects without duplicates.
        :raises ValueError: If the selector ends with a dangling ``>``.
        """
        tokens = self._SELECTOR_TOKEN_RE.findall(selector)

        # Current working set starts with the context node.
        current: List["Node"] = [self]
        first_token = True
        i = 0

        while i < len(tokens):
            token = tokens[i]

            if token == ">":
                # Direct child combinator: consume the following token.
                i += 1
                if i >= len(tokens):
                    raise ValueError(
                        f"selector ends with a dangling '>': {selector!r}"
                    )
                # Parse once per token rather than once per candidate node.
                wanted = self._parse_simple_selector(tokens[i])
                current = [
                    child
                    for node in current
                    for child in node.children
                    if child._matches_simple(wanted)
                ]
            else:
                # Descendant combinator (or the very first token).
                wanted = self._parse_simple_selector(token)
                matched: List["Node"] = []
                seen = set()
                for node in current:
                    if first_token:
                        # The context node itself is never a result.
                        candidates = node.iter()
                    else:
                        candidates = [node, *node.iter()]
                    for cand in candidates:
                        # Nested working-set nodes share descendants.
                        if id(cand) in seen:
                            continue
                        if cand._matches_simple(wanted):
                            seen.add(id(cand))
                            matched.append(cand)
                current = matched

            first_token = False
            i += 1

        return current

    def xpath(self, expr: str) -> List["Node"]:
        """
        Very small XPath subset:

        - ``//tag`` (descendants) and ``tag/subtag`` (direct children)
        - ``*`` as a node test (elements only) and ``.`` for the context
        - ``tag[@attr="value"]`` and ``tag[@attr]`` predicates

        A leading ``/`` is treated like a relative path, i.e. this node acts
        as the document root. Predicate values may contain ``/`` and ``=``.

        :param expr: XPath-like expression.
        :return: List of :class:`Node` objects without duplicates.
        :raises ValueError: If a predicate is not an attribute test.

        .. todo::

            full XPath 1.0 subset
        """
        current: List["Node"] = [self]

        for step in self._XPATH_STEP_RE.finditer(expr.strip()):
            axis, name, predicates = step.groups()
            name = name.strip()
            if not name or name == ".":
                # "." keeps the current context unchanged.
                continue

            tests = [
                self._parse_xpath_predicate(body)
                for body in re.findall(r"\[([^\]]*)\]", predicates)
            ]

            matched: List["Node"] = []
            seen = set()
            for context in current:
                # "//" searches the whole subtree; "/" only direct children.
                pool = context.iter() if axis == "//" else context.children
                for cand in pool:
                    if cand.tag is None or id(cand) in seen:
                        continue
                    if name != "*" and cand.tag != name:
                        continue
                    if all(cand._attr_matches(k, v) for k, v in tests):
                        seen.add(id(cand))
                        matched.append(cand)
            current = matched

        return current

    @property
    def inner_content(self) -> str:
        """
        Return the concatenated text content of this node and all descendants.

        :return: String containing all text content.
        """
        parts: List[str] = [self.text] if self.text else []
        # Pre-order traversal yields text in document order.
        parts.extend(n.text for n in self.iter() if n.text)
        return "".join(parts)

    def outer_html(self) -> str:
        """
        Reconstruct the HTML for this node and its subtree.

        Text and attribute values are entity-escaped, except for text
        directly inside ``<script>``/``<style>``. Valueless attributes are
        written as a bare name, and childless void elements (``<br>``,
        ``<img>``, ...) are written without an end tag.

        :return: HTML string.
        """
        from html import escape

        if self.tag is None:
            parent = self.parent
            if parent is not None and parent.tag in self._RAW_TEXT_TAGS:
                return self.text
            return escape(self.text, quote=False)

        attrs = "".join(
            f" {k}" if v is None else f' {k}="{escape(v)}"'
            for k, v in self.attrs.items()
        )
        if not self.children and self.tag in self._VOID_TAGS:
            return f"<{self.tag}{attrs}>"

        inner = "".join(child.outer_html() for child in self.children)
        return f"<{self.tag}{attrs}>{inner}</{self.tag}>"

    def pretty(self, indent: int = 0) -> str:
        """
        Return a pretty-printed representation of the DOM tree.

        Each node is printed on its own line, padded by ``indent``; text
        nodes are shown as their ``repr``. No closing tags are printed.

        :param indent: Current indentation level.
        :return: Multiline string.
        """
        pad = " " * indent
        if self.tag is None:
            return f"{pad}{self.text!r}"

        # Each attribute brings its own leading space, so an element
        # without attributes renders as "<div>" rather than "<div >".
        attrs = "".join(
            f" {k}" if v is None else f' {k}="{v}"'
            for k, v in self.attrs.items()
        )
        lines = [f"{pad}<{self.tag}{attrs}>"]
        for child in self.children:
            lines.append(child.pretty(indent + 1))
        return "\n".join(lines)
|
||||
|
||||
|
||||
class TreeBuilder(HTMLParser):
    """
    HTML parser that constructs a simple DOM-like tree of :class:`Node` objects.

    Feed markup with :meth:`feed`; the resulting tree hangs below
    :attr:`root`, a synthetic element whose tag is ``"__root__"``.
    :attr:`current` is the element that newly parsed nodes are appended to.
    """

    # HTML void elements: they never have children or an end tag, so they
    # must not become the insertion point for the nodes that follow them.
    _VOID_TAGS = frozenset(
        {
            "area", "base", "br", "col", "embed", "hr", "img", "input",
            "link", "meta", "param", "source", "track", "wbr",
        }
    )

    def __init__(self) -> None:
        super().__init__()
        self.root: Node = Node(tag="__root__")
        self.current: Node = self.root

    def handle_starttag(
        self, tag: str, attrs: List[tuple[str, Optional[str]]]
    ) -> None:
        """
        Append a new element to the current node and descend into it.

        Void elements are appended but not descended into, so following
        content becomes their sibling instead of their child.

        :param tag: Tag name as reported by :class:`HTMLParser`.
        :param attrs: ``(name, value)`` pairs; the value is ``None`` for
            attributes written without a value.
        """
        node = Node(tag=tag, attrs=attrs, parent=self.current)
        self.current.children.append(node)
        if tag not in self._VOID_TAGS:
            self.current = node

    def handle_endtag(self, tag: str) -> None:
        """
        Close the nearest open element named ``tag``.

        Elements left open inside it are closed implicitly. End tags for
        void elements, and end tags without a matching open element, are
        ignored so that they cannot close an unrelated element.

        :param tag: Tag name as reported by :class:`HTMLParser`.
        """
        if tag in self._VOID_TAGS:
            # Also reached for "<br/>": the base class reports it as a
            # start tag followed by an end tag.
            return

        node: Optional[Node] = self.current
        while node is not None and node is not self.root:
            if node.tag == tag:
                self.current = node.parent or self.root
                return
            node = node.parent
        # No open element with this name: stray end tag, nothing to close.

    def handle_data(self, data: str) -> None:
        """
        Append a text node to the current element.

        Whitespace-only runs are dropped; other text is stored unmodified.

        :param data: Text content reported by :class:`HTMLParser`.
        """
        if data.strip():
            self.current.children.append(Node(text=data, parent=self.current))
|
||||
0
tests/unit/byteb4rb1e/utils/http/parser/__init__.py
Normal file
0
tests/unit/byteb4rb1e/utils/http/parser/__init__.py
Normal file
134
tests/unit/byteb4rb1e/utils/http/parser/test_node.py
Normal file
134
tests/unit/byteb4rb1e/utils/http/parser/test_node.py
Normal file
|
|
@ -0,0 +1,134 @@
|
|||
import pytest
|
||||
|
||||
from byteb4rb1e.utils.http.parser import Node, TreeBuilder
|
||||
|
||||
|
||||
@pytest.fixture
def sample_dom():
    """
    Parse a small HTML fragment and return its outermost ``<div>``.

    The fragment holds a ``<p>``, a ``<span>`` and a nested ``<div>`` that
    contains two more ``<span>`` elements.
    """
    markup = """
<div id="root" class="container">
<p class="text">Hello</p>
<span class="text highlight">World</span>
<div class="box">
<span id="inner">Inside</span>
<span id="inner2">Inside Too</span>
</div>
</div>
"""
    builder = TreeBuilder()
    builder.feed(markup)
    top_level = builder.root.children
    return top_level[0]  # the <div id="root">
|
||||
|
||||
|
||||
class TestGetElementsByTagName:
    """Checks for ``Node.get_elements_by_tag_name``."""

    def test_find_all_spans(self, sample_dom):
        """All three ``<span>`` elements of the fixture are found."""
        found = sample_dom.get_elements_by_tag_name("span")
        assert len(found) == 3
        for position in range(3):
            assert found[position].tag == "span"

    def test_find_no_matches(self, sample_dom):
        """A tag that does not occur yields an empty list."""
        found = sample_dom.get_elements_by_tag_name("table")
        assert found == []
|
||||
|
||||
|
||||
class TestGetElementsByClassName:
    """Checks for ``Node.get_elements_by_class_name``."""

    def test_find_single_class(self, sample_dom):
        """Two elements carry the ``text`` class."""
        matches = sample_dom.get_elements_by_class_name("text")
        assert len(matches) == 2

    def test_find_multiple_classes(self, sample_dom):
        """A class is found on an element that lists several classes."""
        matches = sample_dom.get_elements_by_class_name("highlight")
        assert len(matches) == 1
        (only,) = matches
        assert only.tag == "span"

    def test_no_such_class(self, sample_dom):
        """An unknown class yields an empty list."""
        matches = sample_dom.get_elements_by_class_name("missing")
        assert matches == []
|
||||
|
||||
|
||||
class TestGetElementById:
    """Checks for ``Node.get_element_by_id``."""

    def test_find_existing_id(self, sample_dom):
        """The element with ``id="inner"`` is the span holding "Inside"."""
        hit = sample_dom.get_element_by_id("inner")
        assert hit is not None
        assert hit.tag == "span"
        assert hit.inner_content == "Inside"

    def test_missing_id(self, sample_dom):
        """An unknown id yields ``None``."""
        hit = sample_dom.get_element_by_id("nope")
        assert hit is None
|
||||
|
||||
|
||||
class TestQuerySelectorAll:
    """Checks for ``Node.query_selector_all``."""

    def test_class_selector(self, sample_dom):
        """``.text`` selects both elements carrying that class."""
        matches = sample_dom.query_selector_all(".text")
        assert len(matches) == 2

    def test_id_selector(self, sample_dom):
        """``#inner`` selects exactly the span with that id."""
        matches = sample_dom.query_selector_all("#inner")
        assert len(matches) == 1
        assert matches[0].inner_content == "Inside"

    def test_tag_selector(self, sample_dom):
        """``p`` selects the single paragraph."""
        matches = sample_dom.query_selector_all("p")
        assert len(matches) == 1
        assert matches[0].inner_content == "Hello"

    def test_chained_selector(self, sample_dom):
        """Two class tokens separated by whitespace select one element."""
        matches = sample_dom.query_selector_all(".text .highlight")
        assert len(matches) == 1
        assert matches[0].inner_content == "World"

    def test_direct_child(self, sample_dom):
        """``>`` selects a direct child of the box."""
        matches = sample_dom.query_selector_all(".box > #inner")
        assert len(matches) == 1
        assert matches[0].inner_content == "Inside"

    def test_direct_child_no_match(self, sample_dom):
        """``>`` does not select elements that are not direct children."""
        matches = sample_dom.query_selector_all("div > span.highlight")
        # The highlight span is NOT a direct child of the inner div.
        assert len(matches) == 0

    def test_attribute_match(self, sample_dom):
        """``[id="inner"]`` selects by attribute value."""
        matches = sample_dom.query_selector_all('[id="inner"]')
        assert len(matches) == 1
        assert matches[0].inner_content == "Inside"

    def test_attribute_no_match(self, sample_dom):
        """An attribute that no element has yields an empty list."""
        matches = sample_dom.query_selector_all('[data-x="nope"]')
        assert matches == []

    def test_tag_class(self, sample_dom):
        """``tag.class`` requires both the tag and the class."""
        matches = sample_dom.query_selector_all("span.highlight")
        assert len(matches) == 1
        assert matches[0].inner_content == "World"

    def test_multiple_classes(self, sample_dom):
        """``.a.b`` requires both classes on the same element."""
        matches = sample_dom.query_selector_all(".text.highlight")
        assert len(matches) == 1
        assert matches[0].inner_content == "World"

    def test_tag_id_class(self, sample_dom):
        """``tag#id`` requires both the tag and the id."""
        matches = sample_dom.query_selector_all("span#inner")
        assert len(matches) == 1
        assert matches[0].inner_content == "Inside"

    def test_descendant(self, sample_dom):
        """``div span`` selects the two spans inside the nested div."""
        matches = sample_dom.query_selector_all("div span")
        assert len(matches) == 2
|
||||
|
||||
|
||||
class TestXPath:
    """Checks for ``Node.xpath``."""

    def test_simple_tag(self, sample_dom):
        """``//span`` finds every span below the fixture root."""
        found = sample_dom.xpath("//span")
        assert len(found) == 3

    def test_attribute_match(self, sample_dom):
        """A double-quoted attribute predicate narrows the match."""
        found = sample_dom.xpath('//span[@id="inner"]')
        assert len(found) == 1
        assert found[0].inner_content == "Inside"

    def test_nested(self, sample_dom):
        """A single-quoted attribute predicate finds the nested div."""
        found = sample_dom.xpath("//div[@class='box']")
        assert len(found) == 1
|
||||
Loading…
Add table
Add a link
Reference in a new issue