From db720178103e7e699fd62d622396f39034a27650 Mon Sep 17 00:00:00 2001
From: Tiara Rodney
Date: Wed, 31 Dec 2025 14:34:28 +0100
Subject: [PATCH] feature(http): init parser

---
 src/byteb4rb1e/utils/http/parser.py           | 393 ++++++++++++++++++
 .../byteb4rb1e/utils/http/parser/__init__.py  |   0
 .../byteb4rb1e/utils/http/parser/test_node.py | 147 +++++++
 3 files changed, 540 insertions(+)
 create mode 100644 src/byteb4rb1e/utils/http/parser.py
 create mode 100644 tests/unit/byteb4rb1e/utils/http/parser/__init__.py
 create mode 100644 tests/unit/byteb4rb1e/utils/http/parser/test_node.py

diff --git a/src/byteb4rb1e/utils/http/parser.py b/src/byteb4rb1e/utils/http/parser.py
new file mode 100644
index 0000000..3f814d6
--- /dev/null
+++ b/src/byteb4rb1e/utils/http/parser.py
@@ -0,0 +1,393 @@
+from __future__ import annotations
+import re
+from html import escape
+from html.parser import HTMLParser
+from typing import Dict, Iterable, List, Optional, Generator, Union
+
+# Tag of the synthetic container node that TreeBuilder parses into.
+_ROOT_TAG = "__root__"
+
+# HTML void elements: they never have children or an end tag.
+_VOID_ELEMENTS = frozenset(
+    {
+        "area", "base", "br", "col", "embed", "hr", "img",
+        "input", "link", "meta", "param", "source", "track", "wbr",
+    }
+)
+
+# Elements whose text is raw and must not be entity-escaped on output.
+_RAW_TEXT_ELEMENTS = frozenset({"script", "style"})
+
+# One component of a compound CSS selector: ``tag``, ``.class`` or ``#id``.
+_SELECTOR_TOKEN = re.compile(r"([.#]?)([^.#]+)")
+
+# One location step of the supported XPath subset, e.g. ``//a[@href="/x"]``.
+_XPATH_STEP = re.compile(
+    r"""
+    (?P<axis>//|/)?                      # "//" = descendant, else child
+    (?P<tag>[^/\[\]\s]+)                 # tag name or "*"
+    (?:\[\s*@(?P<attr>[^=\]\s]+)\s*      # optional [@attr] predicate
+        (?:=\s*(?:                       # optional value, quoted or bare
+            (?P<quote>["'])(?P<quoted>.*?)(?P=quote)
+            |(?P<bare>[^\]\s"']+)
+        )\s*)?
+    \])?
+    """,
+    re.VERBOSE,
+)
+
+
+class Node:
+    """
+    Represents a node in a simple DOM-like tree.
+
+    :param tag: The HTML tag name (e.g., ``"div"``). ``None`` for text nodes.
+    :param attrs: Iterable of ``(key, value)`` attribute pairs. A ``None``
+        value (attribute written without a value) is stored as ``""``.
+    :param parent: Parent :class:`Node` instance.
+    :param text: Text content for text nodes.
+    """
+
+    def __init__(
+        self,
+        tag: Optional[str] = None,
+        attrs: Optional[Iterable[tuple[str, Optional[str]]]] = None,
+        parent: Optional["Node"] = None,
+        text: str = "",
+    ) -> None:
+        self.tag: Optional[str] = tag
+        # HTMLParser reports valueless attributes as (name, None); store ""
+        # instead so attribute values are always strings.
+        self.attrs: Dict[str, str] = {
+            key: "" if value is None else value
+            for key, value in dict(attrs or ()).items()
+        }
+        self.parent: Optional["Node"] = parent
+        self.children: List["Node"] = []
+        self.text: str = text
+
+    def __repr__(self) -> str:
+        if self.tag is None:
+            return f"{type(self).__name__}(text={self.text!r})"
+        return f"{type(self).__name__}(tag={self.tag!r}, attrs={self.attrs!r})"
+
+    # ----------------------------------------------------------------------
+    # Tree traversal
+    # ----------------------------------------------------------------------
+    def iter(self) -> Generator["Node", None, None]:
+        """
+        Yield all descendant nodes in document (pre-order) order.
+
+        An explicit stack is used instead of recursion so that deeply nested
+        documents cannot exhaust the interpreter's recursion limit.
+
+        :return: Generator of :class:`Node` objects.
+        """
+        stack: List[Node] = self.children[::-1]
+        while stack:
+            node = stack.pop()
+            yield node
+            stack.extend(reversed(node.children))
+
+    # ----------------------------------------------------------------------
+    # DOM-like lookup helpers
+    # ----------------------------------------------------------------------
+    def get_elements_by_tag_name(self, tag: str) -> List["Node"]:
+        """
+        Return all descendant elements with the given tag name.
+
+        :param tag: Tag name to match.
+        :return: List of :class:`Node` objects.
+        """
+        return [n for n in self.iter() if n.tag == tag]
+
+    def get_elements_by_class_name(self, class_name: str) -> List["Node"]:
+        """
+        Return all descendant elements that contain the given CSS class.
+
+        :param class_name: Class name to match.
+        :return: List of :class:`Node` objects.
+        """
+        return [
+            n
+            for n in self.iter()
+            if class_name in n.attrs.get("class", "").split()
+        ]
+
+    def get_element_by_id(self, element_id: str) -> Optional["Node"]:
+        """
+        Return the first descendant element with the given ``id`` attribute.
+
+        :param element_id: ID value to match.
+        :return: :class:`Node` or ``None``.
+        """
+        for n in self.iter():
+            if n.attrs.get("id") == element_id:
+                return n
+        return None
+
+    def get_elements_by_attribute(
+        self, attr: str, value: Optional[str] = None
+    ) -> List["Node"]:
+        """
+        Return all descendant elements matching an attribute.
+
+        :param attr: Attribute name.
+        :param value: Optional value to match. If ``None``, only the presence
+            of the attribute is checked.
+        :return: List of :class:`Node` objects.
+        """
+        if value is None:
+            return [n for n in self.iter() if attr in n.attrs]
+        return [n for n in self.iter() if n.attrs.get(attr) == value]
+
+    # ----------------------------------------------------------------------
+    # CSS selector engine (supports chaining)
+    # ----------------------------------------------------------------------
+    def query_selector(self, selector: str) -> Optional["Node"]:
+        """
+        Return the first element matching a CSS-like selector.
+
+        Accepts the same selector syntax as :meth:`query_selector_all`.
+
+        :param selector: CSS selector string.
+        :return: :class:`Node` or ``None``.
+        """
+        results = self.query_selector_all(selector)
+        return results[0] if results else None
+
+    def query_selector_all(self, selector: str) -> List["Node"]:
+        """
+        Return all elements matching a CSS-like selector chain.
+
+        Supports:
+        - ``tag`` and ``*``
+        - ``.class``
+        - ``#id``
+        - compound selectors: ``span.text.highlight``, ``div#main``
+        - descendant chaining: ``div .item span``
+
+        This node takes part in matching: it is returned when it matches a
+        single-part selector, and it can act as the ancestor for the leading
+        parts of a chain. Ancestors above this node are never consulted.
+        Matches are returned once each, in document order. An empty selector
+        returns ``[self]``.
+
+        :param selector: CSS selector string.
+        :return: List of :class:`Node` objects.
+        """
+        parts = [self._parse_compound(part) for part in selector.split()]
+        if not parts:
+            return [self]
+
+        *ancestors, target = parts
+        return [
+            node
+            for node in [self, *self.iter()]
+            if node._matches_compound(target)
+            and self._has_ancestor_chain(node, ancestors)
+        ]
+
+    @staticmethod
+    def _parse_compound(
+        part: str,
+    ) -> tuple[Optional[str], Optional[str], List[str]]:
+        """Split ``tag.cls#id`` into ``(tag, id, classes)``."""
+        tag: Optional[str] = None
+        ident: Optional[str] = None
+        classes: List[str] = []
+        for prefix, name in _SELECTOR_TOKEN.findall(part):
+            if prefix == ".":
+                classes.append(name)
+            elif prefix == "#":
+                ident = name
+            else:
+                tag = name
+        return tag, ident, classes
+
+    def _matches_compound(
+        self, compound: tuple[Optional[str], Optional[str], List[str]]
+    ) -> bool:
+        """Tell whether this element satisfies every part of *compound*."""
+        tag, ident, classes = compound
+        if self.tag is None or (tag is None and ident is None and not classes):
+            # Text nodes never match; neither does a selector such as ".".
+            return False
+        if tag not in (None, "*") and self.tag != tag:
+            return False
+        if ident is not None and self.attrs.get("id") != ident:
+            return False
+        own_classes = self.attrs.get("class", "").split()
+        return all(name in own_classes for name in classes)
+
+    def _has_ancestor_chain(
+        self,
+        node: "Node",
+        ancestors: List[tuple[Optional[str], Optional[str], List[str]]],
+    ) -> bool:
+        """
+        Tell whether *node* has ancestors matching *ancestors* in order.
+
+        The walk goes upwards from *node* and stops at ``self``. Matching the
+        nearest suitable ancestor first is sufficient for descendant
+        combinators.
+        """
+        pending = list(ancestors)
+        while pending:
+            if node is self or node.parent is None:
+                return False
+            node = node.parent
+            if node._matches_compound(pending[-1]):
+                pending.pop()
+        return True
+
+    def xpath(self, expr: str) -> List["Node"]:
+        """
+        Very small XPath subset:
+
+        - ``//tag`` (any descendant)
+        - ``tag/subtag`` (direct children)
+        - ``//tag[@attr="value"]`` and ``//tag[@attr]``
+        - ``*`` as a tag name matches any element
+
+        Matches are returned once each. Text nodes are never selected. An
+        empty expression returns ``[self]``.
+
+        :param expr: XPath-like expression.
+        :return: List of :class:`Node` objects.
+        :raises ValueError: If *expr* is outside the supported subset.
+        """
+        # Tolerate a trailing separator such as ``div/``.
+        expr = expr.strip().rstrip("/")
+        current: List[Node] = [self]
+        pos = 0
+        while pos < len(expr):
+            step = _XPATH_STEP.match(expr, pos)
+            # Only the first step may omit its axis ("tag/subtag").
+            if step is None or (pos and step["axis"] is None):
+                raise ValueError(f"unsupported XPath expression: {expr!r}")
+            pos = step.end()
+
+            value = step["quoted"] if step["quote"] else step["bare"]
+            current = self._xpath_step(
+                current, step["axis"] == "//", step["tag"], step["attr"], value
+            )
+        return current
+
+    @staticmethod
+    def _xpath_step(
+        contexts: List["Node"],
+        descend: bool,
+        tag: str,
+        attr: Optional[str],
+        value: Optional[str],
+    ) -> List["Node"]:
+        """Apply one location step to *contexts*, dropping duplicates."""
+        seen: set[int] = set()
+        matched: List[Node] = []
+        for context in contexts:
+            pool = context.iter() if descend else context.children
+            for node in pool:
+                if node.tag is None or id(node) in seen:
+                    continue
+                if tag != "*" and node.tag != tag:
+                    continue
+                if attr is not None:
+                    if attr not in node.attrs:
+                        continue
+                    if value is not None and node.attrs[attr] != value:
+                        continue
+                seen.add(id(node))
+                matched.append(node)
+        return matched
+
+    @property
+    def inner_content(self) -> str:
+        """
+        Return the concatenated text content of this node and all descendants.
+
+        :return: String containing all text content.
+        """
+        return "".join(node.text for node in [self, *self.iter()])
+
+    def outer_html(self) -> str:
+        """
+        Reconstruct the HTML for this node and its subtree.
+
+        Text and attribute values are entity-escaped (text inside ``script``
+        and ``style`` is emitted verbatim), void elements get no end tag and
+        the synthetic root node serialises as its children only.
+
+        :return: HTML string.
+        """
+        if self.tag is None:
+            parent_tag = self.parent.tag if self.parent is not None else None
+            if parent_tag in _RAW_TEXT_ELEMENTS:
+                return self.text
+            return escape(self.text, quote=False)
+
+        inner = "".join(child.outer_html() for child in self.children)
+        if self.tag == _ROOT_TAG:
+            return inner
+
+        attrs = "".join(f' {k}="{escape(v)}"' for k, v in self.attrs.items())
+        if self.tag in _VOID_ELEMENTS and not self.children:
+            return f"<{self.tag}{attrs}>"
+        return f"<{self.tag}{attrs}>{inner}</{self.tag}>"
+
+    def pretty(self, indent: int = 0) -> str:
+        """
+        Return a pretty-printed representation of the DOM tree.
+
+        :param indent: Current indentation level.
+        :return: Multiline string.
+        """
+        pad = " " * indent
+        if self.tag is None:
+            return f"{pad}{self.text!r}"
+
+        # Each attribute carries its own leading space, so a tag without
+        # attributes renders as ``<div>`` rather than ``<div >``.
+        attrs = "".join(f' {k}="{v}"' for k, v in self.attrs.items())
+        lines = [f"{pad}<{self.tag}{attrs}>"]
+        lines.extend(child.pretty(indent + 1) for child in self.children)
+        return "\n".join(lines)
+
+
+class TreeBuilder(HTMLParser):
+    """
+    HTML parser that constructs a simple DOM-like tree of :class:`Node` objects.
+
+    Void elements such as ``<br>`` are stored as leaves, and an end tag closes
+    the nearest open element of that name; end tags without a matching open
+    element are ignored.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.root: Node = Node(tag=_ROOT_TAG)
+        self.current: Node = self.root
+
+    def handle_starttag(
+        self, tag: str, attrs: List[tuple[str, Optional[str]]]
+    ) -> None:
+        node = Node(tag=tag, attrs=attrs, parent=self.current)
+        self.current.children.append(node)
+        # A void element has no end tag, so descending into it would nest
+        # every following sibling beneath it.
+        if tag not in _VOID_ELEMENTS:
+            self.current = node
+
+    def handle_endtag(self, tag: str) -> None:
+        # Close the nearest open element named *tag* (implicitly closing
+        # anything still open inside it); the root itself is never closed.
+        node: Optional[Node] = self.current
+        while node is not None and node.parent is not None:
+            if node.tag == tag:
+                self.current = node.parent
+                return
+            node = node.parent
+
+    def handle_data(self, data: str) -> None:
+        if data.strip():
+            self.current.children.append(Node(text=data, parent=self.current))
diff --git a/tests/unit/byteb4rb1e/utils/http/parser/__init__.py b/tests/unit/byteb4rb1e/utils/http/parser/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/unit/byteb4rb1e/utils/http/parser/test_node.py b/tests/unit/byteb4rb1e/utils/http/parser/test_node.py
new file mode 100644
index 0000000..721821f
--- /dev/null
+++ b/tests/unit/byteb4rb1e/utils/http/parser/test_node.py
@@ -0,0 +1,147 @@
+import pytest
+
+from byteb4rb1e.utils.http.parser import Node, TreeBuilder
+
+
+@pytest.fixture
+def sample_dom():
+    """
+    Build a small DOM tree for testing:
+
+    <div id="main">
+      <p class="text">Hello</p>
+      <span class="text highlight">World</span>
+      <div class="box">
+        <span id="inner">Inside</span>
+      </div>
+    </div>
+    """
+    html = """
+    <div id="main">
+      <p class="text">Hello</p>
+      <span class="text highlight">World</span>
+      <div class="box">
+        <span id="inner">Inside</span>
+      </div>
+    </div>
+    """
+    parser = TreeBuilder()
+    parser.feed(html)
+    return parser.root.children[0]  # the <div id="main">
+
+
+class TestGetElementsByTagName:
+    def test_find_all_spans(self, sample_dom):
+        spans = sample_dom.get_elements_by_tag_name("span")
+        assert len(spans) == 2
+        assert spans[0].tag == "span"
+        assert spans[1].tag == "span"
+
+    def test_find_no_matches(self, sample_dom):
+        assert sample_dom.get_elements_by_tag_name("table") == []
+
+
+class TestGetElementsByClassName:
+    def test_find_single_class(self, sample_dom):
+        items = sample_dom.get_elements_by_class_name("text")
+        assert len(items) == 2
+
+    def test_find_multiple_classes(self, sample_dom):
+        items = sample_dom.get_elements_by_class_name("highlight")
+        assert len(items) == 1
+        assert items[0].tag == "span"
+
+    def test_no_such_class(self, sample_dom):
+        assert sample_dom.get_elements_by_class_name("missing") == []
+
+
+class TestGetElementById:
+    def test_find_existing_id(self, sample_dom):
+        node = sample_dom.get_element_by_id("inner")
+        assert node is not None
+        assert node.tag == "span"
+        assert node.inner_content == "Inside"
+
+    def test_missing_id(self, sample_dom):
+        assert sample_dom.get_element_by_id("nope") is None
+
+
+class TestQuerySelectorAll:
+    def test_class_selector(self, sample_dom):
+        items = sample_dom.query_selector_all(".text")
+        assert len(items) == 2
+
+    def test_id_selector(self, sample_dom):
+        items = sample_dom.query_selector_all("#inner")
+        assert len(items) == 1
+        assert items[0].inner_content == "Inside"
+
+    def test_tag_selector(self, sample_dom):
+        items = sample_dom.query_selector_all("p")
+        assert len(items) == 1
+        assert items[0].inner_content == "Hello"
+
+    def test_chained_selector(self, sample_dom):
+        items = sample_dom.query_selector_all("div .highlight")
+        assert len(items) == 1
+        assert items[0].inner_content == "World"
+
+    def test_compound_selector(self, sample_dom):
+        items = sample_dom.query_selector_all("span.highlight")
+        assert len(items) == 1
+        assert items[0].inner_content == "World"
+
+    def test_nested_contexts_yield_no_duplicates(self):
+        parser = TreeBuilder()
+        parser.feed("<div><div><span>x</span></div></div>")
+        assert len(parser.root.query_selector_all("div span")) == 1
+
+
+class TestXPath:
+    def test_simple_tag(self, sample_dom):
+        spans = sample_dom.xpath("//span")
+        assert len(spans) == 2
+
+    def test_attribute_match(self, sample_dom):
+        nodes = sample_dom.xpath('//span[@id="inner"]')
+        assert len(nodes) == 1
+        assert nodes[0].inner_content == "Inside"
+
+    def test_nested(self, sample_dom):
+        nodes = sample_dom.xpath("//div[@class='box']")
+        assert len(nodes) == 1
+
+    def test_child_axis(self, sample_dom):
+        nodes = sample_dom.xpath("div/span")
+        assert [n.inner_content for n in nodes] == ["Inside"]
+
+    def test_slash_inside_attribute_value(self):
+        parser = TreeBuilder()
+        parser.feed('<a href="/home">x</a>')
+        assert len(parser.root.xpath('//a[@href="/home"]')) == 1
+
+
+class TestOuterHtml:
+    def test_emits_end_tag(self, sample_dom):
+        node = sample_dom.get_element_by_id("inner")
+        assert node.outer_html() == '<span id="inner">Inside</span>'
+
+    def test_escapes_text_and_attributes(self):
+        node = Node(tag="a", attrs=[("title", 'x"y')])
+        node.children.append(Node(text="1 < 2", parent=node))
+        assert node.outer_html() == '<a title="x&quot;y">1 &lt; 2</a>'
+
+
+class TestTreeBuilder:
+    def test_void_element_does_not_swallow_siblings(self):
+        parser = TreeBuilder()
+        parser.feed("<p>a<br>b</p><p>c</p>")
+        first, second = parser.root.children
+        assert [c.tag for c in first.children] == [None, "br", None]
+        assert second.inner_content == "c"
+
+    def test_stray_end_tag_is_ignored(self):
+        parser = TreeBuilder()
+        parser.feed("<div></span><p>x</p></div>")
+        (div,) = parser.root.children
+        assert [c.tag for c in div.children] == ["p"]