feature(http): init parser

2025-12-31 14:34:28 +01:00 · 2025-12-31 14:34:28 +01:00 · db72017810
commit db72017810
parent cc4b567181
3 changed files with 398 additions and 0 deletions
--- a/src/byteb4rb1e/utils/http/parser.py
+++ b/src/byteb4rb1e/utils/http/parser.py
@@ -0,0 +1,296 @@
 from __future__ import annotations
 from html.parser import HTMLParser
 from typing import Dict, Iterable, List, Optional, Generator, Union
 class Node:
    """
    Represents a node in a simple DOM-like tree.

    Element nodes carry a tag name and attributes; text nodes have ``tag``
    set to ``None`` and keep their content in ``text``.

    :param tag: The HTML tag name (e.g., ``"div"``). ``None`` for text nodes.
    :param attrs: Iterable of ``(key, value)`` attribute pairs. ``value`` may
                  be ``None`` for an attribute written without a value.
    :param parent: Parent :class:`Node` instance.
    :param text: Text content for text nodes.
    """

    # Elements that have neither content nor an end tag in HTML.
    _VOID_TAGS = frozenset(
        {
            "area", "base", "br", "col", "embed", "hr", "img", "input",
            "link", "meta", "param", "source", "track", "wbr",
        }
    )
    # Elements whose text content is raw in HTML source and must not be
    # entity-escaped when serialised.
    _RAW_TEXT_TAGS = frozenset({"script", "style"})

    def __init__(
        self,
        tag: Optional[str] = None,
        attrs: Optional[Iterable[tuple[str, Optional[str]]]] = None,
        parent: Optional["Node"] = None,
        text: str = "",
    ) -> None:
        self.tag: Optional[str] = tag
        self.attrs: Dict[str, Optional[str]] = dict(attrs or [])
        self.parent: Optional["Node"] = parent
        self.children: List["Node"] = []
        self.text: str = text

    def __repr__(self) -> str:
        return f"<Node {self.tag} {self.attrs} children={len(self.children)}>"

    # ----------------------------------------------------------------------
    # Tree traversal
    # ----------------------------------------------------------------------
    def iter(self) -> Generator["Node", None, None]:
        """
        Yield all descendant nodes in document (pre-order) order.

        The node itself is not yielded. An explicit stack is used instead of
        recursion so very deep trees cannot exhaust the interpreter's
        recursion limit.

        :return: Generator of :class:`Node` objects.
        """
        # Children are pushed reversed so the leftmost child is popped first.
        stack: List["Node"] = list(reversed(self.children))
        while stack:
            node = stack.pop()
            yield node
            stack.extend(reversed(node.children))

    # ----------------------------------------------------------------------
    # Internal matching helpers
    # ----------------------------------------------------------------------
    def _has_class(self, class_name: str) -> bool:
        """Return ``True`` if ``class_name`` is one of this node's classes."""
        # ``class`` may be missing or ``None`` (valueless attribute).
        return class_name in (self.attrs.get("class") or "").split()

    def _matches_simple_selector(self, part: str) -> bool:
        """Return ``True`` if this node matches a ``tag``/``.class``/``#id`` token."""
        if part.startswith("."):
            return self._has_class(part[1:])
        if part.startswith("#"):
            return self.attrs.get("id") == part[1:]
        return self.tag == part

    # ----------------------------------------------------------------------
    # DOM-like lookup helpers
    # ----------------------------------------------------------------------
    def get_elements_by_tag_name(self, tag: str) -> List["Node"]:
        """
        Return all descendant elements with the given tag name.

        :param tag: Tag name to match.
        :return: List of :class:`Node` objects.
        """
        return [n for n in self.iter() if n.tag == tag]

    def get_elements_by_class_name(self, class_name: str) -> List["Node"]:
        """
        Return all descendant elements that contain the given CSS class.

        :param class_name: Class name to match.
        :return: List of :class:`Node` objects.
        """
        return [n for n in self.iter() if n._has_class(class_name)]

    def get_element_by_id(self, element_id: str) -> Optional["Node"]:
        """
        Return the first descendant element with the given ``id`` attribute.

        :param element_id: ID value to match.
        :return: :class:`Node` or ``None``.
        """
        return next(
            (n for n in self.iter() if n.attrs.get("id") == element_id), None
        )

    def get_elements_by_attribute(
        self, attr: str, value: Optional[str] = None
    ) -> List["Node"]:
        """
        Return all descendant elements matching an attribute.

        :param attr: Attribute name.
        :param value: Optional value to match. If ``None``, only the presence
                      of the attribute is checked.
        :return: List of :class:`Node` objects.
        """
        if value is None:
            return [n for n in self.iter() if attr in n.attrs]
        return [n for n in self.iter() if n.attrs.get(attr) == value]

    # ----------------------------------------------------------------------
    # CSS selector engine (supports chaining)
    # ----------------------------------------------------------------------
    def query_selector(self, selector: str) -> Optional["Node"]:
        """
        Return the first element matching a CSS-like selector.

        Supports the same syntax as :meth:`query_selector_all`.

        :param selector: CSS selector string.
        :return: :class:`Node` or ``None``.
        """
        results = self.query_selector_all(selector)
        return results[0] if results else None

    def query_selector_all(self, selector: str) -> List["Node"]:
        """
        Return all elements matching a CSS-like selector chain.

        Supports:

        - ``tag``
        - ``.class``
        - ``#id``
        - descendant chaining: ``div .item span``

        The node the method is called on is itself a candidate for the
        *first* token only; every later token matches strict descendants of
        the previous matches. Each node appears at most once in the result.

        :param selector: Whitespace-separated selector tokens.
        :return: List of :class:`Node` objects; empty for an empty selector.
        """
        parts = selector.split()
        if not parts:
            return []
        current: List["Node"] = [self]
        for index, part in enumerate(parts):
            seen: set = set()
            next_nodes: List["Node"] = []
            for scope in current:
                # Only the starting node may match the first token itself; a
                # descendant combinator never matches its own ancestor.
                if (
                    index == 0
                    and id(scope) not in seen
                    and scope._matches_simple_selector(part)
                ):
                    seen.add(id(scope))
                    next_nodes.append(scope)
                for node in scope.iter():
                    # Overlapping scopes (nested matches) would otherwise
                    # report the same descendant more than once.
                    if id(node) in seen:
                        continue
                    if node._matches_simple_selector(part):
                        seen.add(id(node))
                        next_nodes.append(node)
            current = next_nodes
        return current

    # ----------------------------------------------------------------------
    # XPath subset
    # ----------------------------------------------------------------------
    @staticmethod
    def _split_xpath_steps(expr: str) -> List[str]:
        """Split ``expr`` on ``/`` outside of ``[...]``, dropping empty steps."""
        steps: List[str] = []
        buf: List[str] = []
        depth = 0
        for ch in expr:
            if ch == "[":
                depth += 1
            elif ch == "]" and depth:
                depth -= 1
            if ch == "/" and depth == 0:
                if buf:
                    steps.append("".join(buf))
                    buf = []
                continue
            buf.append(ch)
        if buf:
            steps.append("".join(buf))
        return steps

    @staticmethod
    def _parse_xpath_step(step: str) -> tuple:
        """
        Split one step into ``(tag, attr, value)``.

        ``attr`` is ``None`` without a predicate; ``value`` is ``None`` for a
        presence-only predicate such as ``[@href]``.
        """
        tag, bracket, rest = step.partition("[")
        if not bracket:
            return tag, None, None
        rest = rest.rstrip("]")
        # Partition on the first "=" only so values may contain "=".
        name, equals, raw = rest.partition("=")
        attr = name.strip().strip("@")
        if not equals:
            return tag, attr, None
        return tag, attr, raw.strip().strip('"').strip("'")

    def xpath(self, expr: str) -> List["Node"]:
        """
        Very small XPath subset:

        - ``//tag``
        - ``tag/subtag``
        - ``//tag[@attr="value"]`` and ``//tag[@attr]``
        - ``*`` as a tag wildcard (element nodes only)

        NOTE: every step is matched against *all descendants* of the current
        node set, whether it was introduced by ``/`` or ``//``; there is no
        direct-child axis. Each node appears at most once in the result.

        :param expr: XPath-like expression.
        :return: List of :class:`Node` objects (``[self]`` if ``expr``
                 contains no steps).
        """
        current: List["Node"] = [self]
        for step in self._split_xpath_steps(expr.strip()):
            tag, attr, val = self._parse_xpath_step(step)
            seen: set = set()
            matched: List["Node"] = []
            for scope in current:
                for cand in scope.iter():
                    if id(cand) in seen:
                        continue
                    if tag == "*":
                        # The wildcard is an element test: skip text nodes.
                        if cand.tag is None:
                            continue
                    elif cand.tag != tag:
                        continue
                    if attr:
                        if attr not in cand.attrs:
                            continue
                        if val is not None and cand.attrs[attr] != val:
                            continue
                    seen.add(id(cand))
                    matched.append(cand)
            current = matched
        return current

    # ----------------------------------------------------------------------
    # Serialisation
    # ----------------------------------------------------------------------
    @property
    def inner_content(self) -> str:
        """
        Return the concatenated text content of this node and all descendants.

        :return: String containing all text content.
        """
        parts: List[str] = [self.text] if self.text else []
        parts.extend(c.inner_content for c in self.children)
        return "".join(parts)

    def outer_html(self) -> str:
        """
        Reconstruct the HTML for this node and its subtree.

        Text and attribute values are entity-escaped so the output is
        well-formed markup; text directly inside ``<script>``/``<style>`` is
        emitted verbatim. Attributes whose value is ``None`` are written as
        bare names, and childless void elements get no end tag.

        :return: HTML string.
        """
        # Local import keeps this method self-contained; the module itself
        # only imports ``html.parser``.
        from html import escape

        if self.tag is None:
            parent_tag = self.parent.tag if self.parent is not None else None
            if parent_tag in self._RAW_TEXT_TAGS:
                return self.text
            return escape(self.text, quote=False)
        attrs = "".join(
            f" {k}" if v is None else f' {k}="{escape(v, quote=True)}"'
            for k, v in self.attrs.items()
        )
        # Keep the long form if a void element somehow has children so no
        # content is silently dropped.
        if self.tag in self._VOID_TAGS and not self.children:
            return f"<{self.tag}{attrs}>"
        inner = "".join(child.outer_html() for child in self.children)
        return f"<{self.tag}{attrs}>{inner}</{self.tag}>"

    def pretty(self, indent: int = 0) -> str:
        """
        Return a pretty-printed representation of the DOM tree.

        Intended for debugging: values are shown unescaped and no end tags
        are printed.

        :param indent: Current indentation level.
        :return: Multiline string.
        """
        pad = "  " * indent
        if self.tag is None:
            return f"{pad}{self.text!r}"
        attrs = " ".join(
            k if v is None else f'{k}="{v}"' for k, v in self.attrs.items()
        )
        # Only add the separating space when there is something after it.
        header = f"{pad}<{self.tag} {attrs}>" if attrs else f"{pad}<{self.tag}>"
        lines = [header]
        lines.extend(child.pretty(indent + 1) for child in self.children)
        return "\n".join(lines)
 class TreeBuilder(HTMLParser):
    """
    HTML parser that constructs a simple DOM-like tree of :class:`Node` objects.

    The tree hangs off ``root``, a synthetic ``"__root__"`` node. ``current``
    is the open element that newly parsed children are appended to.
    """

    # Void elements never receive an end tag, so they must not become the
    # insertion point; otherwise everything after e.g. ``<br>`` would be
    # nested inside it.
    VOID_ELEMENTS = frozenset(
        {
            "area", "base", "br", "col", "embed", "hr", "img", "input",
            "link", "meta", "param", "source", "track", "wbr",
        }
    )

    def __init__(self) -> None:
        super().__init__()
        self.root: Node = Node(tag="__root__")
        self.current: Node = self.root

    def handle_starttag(
        self, tag: str, attrs: List[tuple[str, Optional[str]]]
    ) -> None:
        """
        Append a new element to the current node and descend into it.

        :param tag: Tag name as reported by the parser.
        :param attrs: ``(name, value)`` pairs as reported by the parser.
        """
        node = Node(tag=tag, attrs=attrs, parent=self.current)
        self.current.children.append(node)
        if tag not in self.VOID_ELEMENTS:
            self.current = node

    def handle_endtag(self, tag: str) -> None:
        """
        Close the nearest open element named ``tag``.

        Elements left open between that element and the current insertion
        point are closed implicitly. An end tag with no matching open element
        (including end tags for void elements) is ignored rather than popping
        an unrelated element.

        :param tag: Tag name as reported by the parser.
        """
        node: Optional[Node] = self.current
        while node is not None and node is not self.root and node.tag != tag:
            node = node.parent
        if node is None or node is self.root:
            return  # stray end tag: nothing to close
        self.current = node.parent if node.parent is not None else self.root

    def handle_data(self, data: str) -> None:
        """
        Append a text node for ``data`` unless it is whitespace-only.

        :param data: Text run as reported by the parser.
        """
        if data.strip():
            self.current.children.append(Node(text=data, parent=self.current))
--- a/tests/unit/byteb4rb1e/utils/http/parser/__init__.py
+++ b/tests/unit/byteb4rb1e/utils/http/parser/__init__.py
--- a/tests/unit/byteb4rb1e/utils/http/parser/test_node.py
+++ b/tests/unit/byteb4rb1e/utils/http/parser/test_node.py
@@ -0,0 +1,102 @@
 import pytest
 from byteb4rb1e.utils.http.parser import Node, TreeBuilder
@pytest.fixture
 def sample_dom():
    """
    Build a small DOM tree for testing:
    <div id="root" class="container">
        <p class="text">Hello</p>
        <span class="text highlight">World</span>
        <div class="box">
            <span id="inner">Inside</span>
        </div>
    </div>
    """
    html = """
    <div id="root" class="container">
        <p class="text">Hello</p>
        <span class="text highlight">World</span>
        <div class="box">
            <span id="inner">Inside</span>
        </div>
    </div>
    """
    parser = TreeBuilder()
    parser.feed(html)
    return parser.root.children[0]   # the <div id="root">
 class TestGetElementsByTagName:
    """Tag-name lookup over the ``sample_dom`` fixture tree."""
    def test_find_all_spans(self, sample_dom):
        # The fixture holds exactly two <span> elements.
        found = sample_dom.get_elements_by_tag_name("span")
        assert len(found) == 2
        first, second = found[0], found[1]
        assert first.tag == "span"
        assert second.tag == "span"
    def test_find_no_matches(self, sample_dom):
        # No <table> exists anywhere in the fixture.
        found = sample_dom.get_elements_by_tag_name("table")
        assert found == []
 class TestGetElementsByClassName:
    """Class-name lookup over the ``sample_dom`` fixture tree."""
    def test_find_single_class(self, sample_dom):
        # Both the <p> and the first <span> carry the "text" class.
        matched = sample_dom.get_elements_by_class_name("text")
        assert len(matched) == 2
    def test_find_multiple_classes(self, sample_dom):
        # "highlight" is the second class of a multi-class attribute.
        matched = sample_dom.get_elements_by_class_name("highlight")
        assert len(matched) == 1
        only = matched[0]
        assert only.tag == "span"
    def test_no_such_class(self, sample_dom):
        matched = sample_dom.get_elements_by_class_name("missing")
        assert matched == []
 class TestGetElementById:
    """ID lookup over the ``sample_dom`` fixture tree."""
    def test_find_existing_id(self, sample_dom):
        # <span id="inner"> sits inside the nested <div class="box">.
        found = sample_dom.get_element_by_id("inner")
        assert found is not None
        assert found.tag == "span"
        text = found.inner_content
        assert text == "Inside"
    def test_missing_id(self, sample_dom):
        found = sample_dom.get_element_by_id("nope")
        assert found is None
 class TestQuerySelectorAll:
    """CSS-like selector queries over the ``sample_dom`` fixture tree."""
    def test_class_selector(self, sample_dom):
        matched = sample_dom.query_selector_all(".text")
        assert len(matched) == 2
    def test_id_selector(self, sample_dom):
        matched = sample_dom.query_selector_all("#inner")
        assert len(matched) == 1
        only = matched[0]
        assert only.inner_content == "Inside"
    def test_tag_selector(self, sample_dom):
        matched = sample_dom.query_selector_all("p")
        assert len(matched) == 1
        only = matched[0]
        assert only.inner_content == "Hello"
    def test_chained_selector(self, sample_dom):
        # Descendant chain: a ".highlight" element somewhere below a <div>.
        matched = sample_dom.query_selector_all("div .highlight")
        assert len(matched) == 1
        only = matched[0]
        assert only.inner_content == "World"
 class TestXPath:
    """XPath-subset queries over the ``sample_dom`` fixture tree."""
    def test_simple_tag(self, sample_dom):
        matched = sample_dom.xpath("//span")
        assert len(matched) == 2
    def test_attribute_match(self, sample_dom):
        # Double-quoted attribute value in the predicate.
        matched = sample_dom.xpath('//span[@id="inner"]')
        assert len(matched) == 1
        only = matched[0]
        assert only.inner_content == "Inside"
    def test_nested(self, sample_dom):
        # Single-quoted attribute value, targeting the nested <div>.
        matched = sample_dom.xpath("//div[@class='box']")
        assert len(matched) == 1