diff --git a/src/byteb4rb1e/utils/http/parser.py b/src/byteb4rb1e/utils/http/parser.py new file mode 100644 index 0000000..58f082f --- /dev/null +++ b/src/byteb4rb1e/utils/http/parser.py @@ -0,0 +1,364 @@ +from __future__ import annotations +from html.parser import HTMLParser +import re +from typing import Dict, Iterable, List, Optional, Generator, Union + + +class Node: + """ + Represents a node in a simple DOM-like tree. + + :param tag: The HTML tag name (e.g., ``"div"``). ``None`` for text nodes. + :param attrs: Iterable of ``(key, value)`` attribute pairs. + :param parent: Parent :class:`Node` instance. + :param text: Text content for text nodes. + + .. todo:: + + Mutation APIs (append_child, remove, replace_with) + """ + + def __init__( + self, + tag: Optional[str] = None, + attrs: Optional[Iterable[tuple[str, str]]] = None, + parent: Optional["Node"] = None, + text: str = "", + ) -> None: + self.tag: Optional[str] = tag + self.attrs: Dict[str, str] = dict(attrs or []) + self.parent: Optional["Node"] = parent + self.children: List["Node"] = [] + self.text: str = text + + def __repr__(self) -> str: + return f"" + + # ---------------------------------------------------------------------- + # Tree traversal + # ---------------------------------------------------------------------- + def iter(self) -> Generator["Node", None, None]: + """ + Recursively yield all descendant nodes. + + :return: Generator of :class:`Node` objects. + """ + for child in self.children: + yield child + yield from child.iter() + + # ---------------------------------------------------------------------- + # DOM-like lookup helpers + # ---------------------------------------------------------------------- + def get_elements_by_tag_name(self, tag: str) -> List["Node"]: + """ + Return all descendant elements with the given tag name. + + :param tag: Tag name to match. + :return: List of :class:`Node` objects. 
+ """ + return [n for n in self.iter() if n.tag == tag] + + def get_elements_by_class_name(self, class_name: str) -> List["Node"]: + """ + Return all descendant elements that contain the given CSS class. + + :param class_name: Class name to match. + :return: List of :class:`Node` objects. + """ + return [ + n + for n in self.iter() + if "class" in n.attrs and class_name in n.attrs["class"].split() + ] + + def get_element_by_id(self, element_id: str) -> Optional["Node"]: + """ + Return the first descendant element with the given ``id`` attribute. + + :param element_id: ID value to match. + :return: :class:`Node` or ``None``. + """ + for n in self.iter(): + if n.attrs.get("id") == element_id: + return n + return None + + def get_elements_by_attribute( + self, attr: str, value: Optional[str] = None + ) -> List["Node"]: + """ + Return all descendant elements matching an attribute. + + :param attr: Attribute name. + :param value: Optional value to match. If ``None``, only the presence + of the attribute is checked. + :return: List of :class:`Node` objects. + """ + if value is None: + return [n for n in self.iter() if attr in n.attrs] + return [n for n in self.iter() if n.attrs.get(attr) == value] + + # ---------------------------------------------------------------------- + # CSS selector engine (supports chaining) + # ---------------------------------------------------------------------- + def query_selector(self, selector: str) -> Optional["Node"]: + """ + Return the first element matching a CSS-like selector. + + Supports: + - ``tag`` + - ``.class`` + - ``#id`` + + :param selector: CSS selector string. + :return: :class:`Node` or ``None``. 
+ """ + results = self.query_selector_all(selector) + return results[0] if results else None + + def query_selector_all(self, selector: str) -> List["Node"]: + # Tokenize: split on spaces and > while keeping > + tokens = re.findall(r"[^\s>]+|>", selector) + + # Current working set starts with the context node + current = [self] + + # Helper: match a node against a simple selector + def match_simple(node: "Node", token: str) -> bool: + tag = None + id_ = None + classes = [] + attrs = {} + + # [attr=value] + attr_matches = re.findall( + r"\[([a-zA-Z0-9_-]+)=['\"]?([^'\"]+)['\"]?\]", + token + ) + for k, v in attr_matches: + attrs[k] = v + token = re.sub(r"\[[^\]]+\]", "", token) + + # tag + m = re.match(r"^[a-zA-Z0-9_-]+", token) + if m: + tag = m.group(0) + token = token[len(tag):] + + # #id + m = re.search(r"#([a-zA-Z0-9_-]+)", token) + if m: + id_ = m.group(1) + token = token.replace("#" + id_, "") + + # .classes + classes = [c for c in token.split(".") if c] + + # match + if tag and node.tag != tag: + return False + if id_ and node.attrs.get("id") != id_: + return False + for cls in classes: + if "class" not in node.attrs or cls not in node.attrs["class"].split(): + return False + for k, v in attrs.items(): + if node.attrs.get(k) != v: + return False + + return True + + # ------------------------------------------------------------ + # Main selector evaluation + # ------------------------------------------------------------ + first_token = True + i = 0 + + while i < len(tokens): + token = tokens[i] + + # -------------------------------------------------------- + # Direct child selector + # -------------------------------------------------------- + if token == ">": + i += 1 + next_token = tokens[i] + next_nodes = [] + + for node in current: + for child in node.children: + if match_simple(child, next_token): + next_nodes.append(child) + + current = next_nodes + first_token = False + i += 1 + continue + + # -------------------------------------------------------- + 
# Descendant selector + # -------------------------------------------------------- + next_nodes = [] + seen = set() + + for node in current: + # Only include the context node itself if NOT the first token + if not first_token and match_simple(node, token): + if id(node) not in seen: + seen.add(id(node)) + next_nodes.append(node) + + # Always include descendants + for desc in node.iter(): + if match_simple(desc, token): + if id(desc) not in seen: + seen.add(id(desc)) + next_nodes.append(desc) + + current = next_nodes + first_token = False + i += 1 + + return current + + def xpath(self, expr: str) -> List["Node"]: + """ + Very small XPath subset: + + - ``//tag`` + - ``tag/subtag`` + - ``//tag[@attr="value"]`` + + :param expr: XPath-like expression. + :return: List of :class:`Node` objects. + + .. todo:: + + full XPath 1.0 subset + """ + expr = expr.strip() + parts = expr.split("/") + current: List[Node] = [self] + + def match(nodes: List[Node], tag: str, attr: Optional[str], val: Optional[str]) -> List[Node]: + out: List[Node] = [] + for n in nodes: + candidates = n.iter() + for c in candidates: + if tag != "*" and c.tag != tag: + continue + if attr: + if c.attrs.get(attr) == val: + out.append(c) + else: + out.append(c) + return out + + i = 0 + while i < len(parts): + part = parts[i] + if not part: + i += 1 + continue + + # //tag + if part.startswith("//"): + tag = part[2:] + attr = val = None + + if "[" in tag: + tag, rest = tag.split("[", 1) + rest = rest.rstrip("]") + attr, val = rest.split("=") + attr = attr.strip("@") + val = val.strip('"').strip("'") + + current = match(current, tag, attr, val) + i += 1 + continue + + # tag[@attr="value"] + if "[" in part: + tag, rest = part.split("[", 1) + rest = rest.rstrip("]") + attr, val = rest.split("=") + attr = attr.strip("@") + val = val.strip('"').strip("'") + current = match(current, tag, attr, val) + else: + current = match(current, part, None, None) + + i += 1 + + return current + + @property + def 
inner_content(self) -> str: + """ + Return the concatenated text content of this node and all descendants. + + :return: String containing all text content. + """ + parts: List[str] = [] + if self.text: + parts.append(self.text) + for c in self.children: + parts.append(c.inner_content) + return "".join(parts) + + def outer_html(self) -> str: + """ + Reconstruct the HTML for this node and its subtree. + + :return: HTML string. + """ + if self.tag is None: + return self.text + + attrs = "".join(f' {k}="{v}"' for k, v in self.attrs.items()) + inner = "".join(child.outer_html() for child in self.children) + return f"<{self.tag}{attrs}>{inner}" + + def pretty(self, indent: int = 0) -> str: + """ + Return a pretty-printed representation of the DOM tree. + + :param indent: Current indentation level. + :return: Multiline string. + """ + pad = " " * indent + if self.tag is None: + return f"{pad}{self.text!r}" + + attrs = " ".join(f'{k}="{v}"' for k, v in self.attrs.items()) + header = f"{pad}<{self.tag} {attrs}>".rstrip() + + lines = [header] + for child in self.children: + lines.append(child.pretty(indent + 1)) + return "\n".join(lines) + + +class TreeBuilder(HTMLParser): + """ + HTML parser that constructs a simple DOM-like tree of :class:`Node` objects. 
+ """ + + def __init__(self) -> None: + super().__init__() + self.root: Node = Node(tag="__root__") + self.current: Node = self.root + + def handle_starttag(self, tag: str, attrs: List[tuple[str, str]]) -> None: + node = Node(tag=tag, attrs=attrs, parent=self.current) + self.current.children.append(node) + self.current = node + + def handle_endtag(self, tag: str) -> None: + if self.current.parent is not None: + self.current = self.current.parent + + def handle_data(self, data: str) -> None: + if data.strip(): + self.current.children.append(Node(text=data, parent=self.current)) diff --git a/tests/unit/byteb4rb1e/utils/http/parser/__init__.py b/tests/unit/byteb4rb1e/utils/http/parser/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/byteb4rb1e/utils/http/parser/test_node.py b/tests/unit/byteb4rb1e/utils/http/parser/test_node.py new file mode 100644 index 0000000..a75de46 --- /dev/null +++ b/tests/unit/byteb4rb1e/utils/http/parser/test_node.py @@ -0,0 +1,134 @@ +import pytest + +from byteb4rb1e.utils.http.parser import Node, TreeBuilder + + +@pytest.fixture +def sample_dom(): + """ + Build a small DOM tree for testing: + """ + html = """ +
# tests/unit/byteb4rb1e/utils/http/parser/test_node.py
#
# The same patch also adds an empty
# tests/unit/byteb4rb1e/utils/http/parser/__init__.py.
import pytest

from byteb4rb1e.utils.http.parser import Node, TreeBuilder


@pytest.fixture
def sample_dom():
    """
    Build a small DOM tree for testing:

    - outer ``div``
      - ``p.text`` ("Hello")
      - ``span.text.highlight`` ("World")
      - ``div.box``
        - ``span#inner`` ("Inside")
        - ``span`` ("Inside Too")

    NOTE(review): the markup below was reconstructed from what the
    assertions in this module require; the attributes of the outer
    ``div`` are an assumption - confirm against the original fixture.
    """
    html = """
    <div id="root">
      <p class="text">Hello</p>
      <span class="text highlight">World</span>
      <div class="box">
        <span id="inner">Inside</span>
        <span>Inside Too</span>
      </div>
    </div>
    """
    parser = TreeBuilder()
    parser.feed(html)
    return parser.root.children[0]  # the outer <div>


class TestGetElementsByTagName:
    """Lookup by tag name across all descendants."""

    def test_find_all_spans(self, sample_dom):
        spans = sample_dom.get_elements_by_tag_name("span")
        assert len(spans) == 3
        assert spans[0].tag == "span"
        assert spans[1].tag == "span"
        assert spans[2].tag == "span"

    def test_find_no_matches(self, sample_dom):
        assert sample_dom.get_elements_by_tag_name("table") == []


class TestGetElementsByClassName:
    """Lookup by a single class name, including multi-class elements."""

    def test_find_single_class(self, sample_dom):
        items = sample_dom.get_elements_by_class_name("text")
        assert len(items) == 2

    def test_find_multiple_classes(self, sample_dom):
        items = sample_dom.get_elements_by_class_name("highlight")
        assert len(items) == 1
        assert items[0].tag == "span"

    def test_no_such_class(self, sample_dom):
        assert sample_dom.get_elements_by_class_name("missing") == []


class TestGetElementById:
    """Lookup of the first descendant carrying a given ``id``."""

    def test_find_existing_id(self, sample_dom):
        node = sample_dom.get_element_by_id("inner")
        assert node is not None
        assert node.tag == "span"
        assert node.inner_content == "Inside"

    def test_missing_id(self, sample_dom):
        assert sample_dom.get_element_by_id("nope") is None


class TestQuerySelectorAll:
    """CSS-like selectors: simple, compound, descendant and child steps."""

    def test_class_selector(self, sample_dom):
        items = sample_dom.query_selector_all(".text")
        assert len(items) == 2

    def test_id_selector(self, sample_dom):
        items = sample_dom.query_selector_all("#inner")
        assert len(items) == 1
        assert items[0].inner_content == "Inside"

    def test_tag_selector(self, sample_dom):
        items = sample_dom.query_selector_all("p")
        assert len(items) == 1
        assert items[0].inner_content == "Hello"

    def test_chained_selector(self, sample_dom):
        # A later descendant step also tests the nodes matched by the
        # previous step, so the span carrying both classes matches here.
        items = sample_dom.query_selector_all(".text .highlight")
        assert len(items) == 1
        assert items[0].inner_content == "World"

    def test_direct_child(self, sample_dom):
        items = sample_dom.query_selector_all(".box > #inner")
        assert len(items) == 1
        assert items[0].inner_content == "Inside"

    def test_direct_child_no_match(self, sample_dom):
        items = sample_dom.query_selector_all("div > span.highlight")
        # highlight span is NOT a direct child of inner div
        assert len(items) == 0

    def test_attribute_match(self, sample_dom):
        items = sample_dom.query_selector_all('[id="inner"]')
        assert len(items) == 1
        assert items[0].inner_content == "Inside"

    def test_attribute_no_match(self, sample_dom):
        items = sample_dom.query_selector_all('[data-x="nope"]')
        assert items == []

    def test_tag_class(self, sample_dom):
        items = sample_dom.query_selector_all("span.highlight")
        assert len(items) == 1
        assert items[0].inner_content == "World"

    def test_multiple_classes(self, sample_dom):
        items = sample_dom.query_selector_all(".text.highlight")
        assert len(items) == 1
        assert items[0].inner_content == "World"

    def test_tag_id_class(self, sample_dom):
        items = sample_dom.query_selector_all("span#inner")
        assert len(items) == 1
        assert items[0].inner_content == "Inside"

    def test_descendant(self, sample_dom):
        # The context node (outer div) is excluded from the first step,
        # so only the two spans inside div.box are found.
        items = sample_dom.query_selector_all("div span")
        assert len(items) == 2


class TestXPath:
    """XPath subset: descendant tag steps and attribute predicates."""

    def test_simple_tag(self, sample_dom):
        spans = sample_dom.xpath("//span")
        assert len(spans) == 3

    def test_attribute_match(self, sample_dom):
        nodes = sample_dom.xpath('//span[@id="inner"]')
        assert len(nodes) == 1
        assert nodes[0].inner_content == "Inside"

    def test_nested(self, sample_dom):
        nodes = sample_dom.xpath("//div[@class='box']")
        assert len(nodes) == 1