chore: rename package

2025-06-20 20:33:37 +02:00 · 2025-06-20 20:33:37 +02:00 · 1fb1e0d0bf
commit 1fb1e0d0bf
parent dd57ecabb9
11 changed files with 7 additions and 7 deletions
--- a/src/byteb4rb1e/utils/collections.py
+++ b/src/byteb4rb1e/utils/collections.py
@ -0,0 +1,37 @@
+class CircularBuffer:
+    """fixed-size ring buffer for bytes arriving from a stream
+
+    Bytes are written at ``end``; whenever the write index catches up with
+    ``start``, ``start`` is pushed forward by one slot and ``filled`` is
+    raised.
+    """
+    #: pre-allocated backing storage, its length never changes
+    buf: bytearray
+    #: number of slots in ``buf``
+    size: int
+    #: index of the oldest element in the buffer
+    start: int
+    #: index the next appended byte is written to
+    end: int
+    #: set once ``start`` had to give way to incoming data
+    filled: bool
+
+    def __init__(self, size: int):
+        """set up an empty buffer with ``size`` slots
+
+        :param size: number of bytes to allocate for the backing storage
+        """
+        self.buf = bytearray(size)
+        self.size = size
+        self.start = self.end = 0
+        self.filled = False
+
+    def append(self, data: bytes):
+        """write ``data`` byte by byte, wrapping around at the end of storage
+
+        :param data: byte sequence to append to the buffer
+        """
+        slots = self.size
+        for value in data:
+            write_at = self.end
+            self.buf[write_at] = value
+            self.end = (write_at + 1) % slots
+            if self.end != self.start:
+                continue
+            # the write index caught up with the read index: give way
+            self.start = (self.start + 1) % slots
+            self.filled = True
+
--- a/src/byteb4rb1e/utils/http/__init__.py
+++ b/src/byteb4rb1e/utils/http/__init__.py
--- a/src/byteb4rb1e/utils/http/server/__init__.py
+++ b/src/byteb4rb1e/utils/http/server/__init__.py
@ -0,0 +1,129 @@
+from dataclasses import dataclass
+from http.server import SimpleHTTPRequestHandler
+
+from byteb4rb1e.utils.io import ChunksIO
+
+
+@dataclass
+class HandlerOptions:
+    """tunables of the HTTP POST method handler
+    """
+    #: largest chunk accepted, defaults to the limit ``ChunksIO`` declares
+    max_chunk_size: int = ChunksIO.max_chunk_size
+    #: size in bytes (512 KiB by default) of the in-memory sliding buffer that
+    #: reads from the pure (unchunked) client read stream
+    buffer_size: int = 1024 * 512
+
+
+@dataclass
+class ServerOptions:
+    """settings of the HTTP server itself
+    """
+    #: options of the request handler; required, there is no default
+    handler: HandlerOptions
+    #: host name or address to bind to
+    hostname: str = ''
+    #: TCP port to bind to
+    port: int = 8000
+
+class MultipartUploadHandler(SimpleHTTPRequestHandler):
+    """Simple, yet compliant HTTP/1.0 MIME Multipart Upload Handler
+
+        Implementation of a RFC1341 & RFC7578 compliant server for handling
+        multipart uploads.
+
+        This is meant as a utility for debugging MIME Multipart upload clients
+
+        Support for:
+            - client 'Expect' header
+            - chunked transfer-encoding
+    """
+    #: multipart sub-types a request may announce in its 'Content-Type' header
+    media_subtypes = [
+        'mixed',
+        'alternative',
+        'parallel',
+        'digest',
+        'form-data'
+    ]
+
+    def do_POST(self):
+        """validate the headers of a multipart upload request and answer it
+
+        Every validation failure is answered with ``send_error`` and ends the
+        handling of the request right away. A request passing all checks is
+        answered with ``200 OK``; the request body is currently not consumed.
+
+        Checks, in order:
+
+        - 'Content-Type' is present and of the form ``multipart/<subtype>``
+          with ``<subtype>`` being one of :attr:`media_subtypes` (400)
+        - 'Transfer-Encoding', if present, is ``chunked`` (501)
+        - 'Content-Type' carries a ``boundary`` parameter of 1 to 70
+          characters and no other parameter (400)
+        """
+        h_content_type = self.headers.get('Content-Type')
+        h_expect = self.headers.get('Expect')
+        h_transfer_encoding = self.headers.get('Transfer-Encoding')
+
+        if h_content_type is None:
+            self.send_error(400, 'Missing \'Content-Type\' header')
+            return
+
+        content_type_segments = [s.strip() for s in h_content_type.split(';')]
+        if not content_type_segments[0]:
+            self.send_error(
+                400,
+                'no value was supplied for \'Content-Type\' header'
+            )
+            return
+
+        try:
+            media_type, media_subtype = content_type_segments[0].split('/', 1)
+        except ValueError:
+            self.send_error(
+                400,
+                'unable to parse media type and subtype from ' +
+                'first (semicolon-delimited) segment of \'Content-Type\' ' +
+                f'header value: {content_type_segments[0]}'
+            )
+            return
+
+        # type, subtype and parameter names are not case sensitive (RFC 2045)
+        media_type = media_type.strip().lower()
+        media_subtype = media_subtype.strip().lower()
+
+        if media_type != 'multipart':
+            self.send_error(
+                400,
+                'unsupported media type in \'Content-Type\' header value: ' +
+                f'{media_type}'
+            )
+            return
+
+        if media_subtype not in self.media_subtypes:
+            self.send_error(
+                400,
+                'unsupported media sub-type in \'Content-Type\' header value: ' +
+                f'{media_subtype}. Must be one of ' +
+                f'{", ".join(self.media_subtypes)}'
+            )
+            return
+
+        if h_transfer_encoding and \
+           h_transfer_encoding.strip().lower() != 'chunked':
+            self.send_error(
+                501,
+                f'unable to handle transfer-encoding: {h_transfer_encoding}'
+            )
+            return
+
+        content_type_params = {}
+        for segment in content_type_segments[1:]:
+            # tolerate stray or trailing semicolons
+            if not segment:
+                continue
+
+            name, separator, value = segment.partition('=')
+            if not separator:
+                self.send_error(
+                    400,
+                    'unable to parse parameter of \'Content-Type\' header ' +
+                    f'value: {segment}'
+                )
+                return
+
+            content_type_params[name.strip().lower()] = value.strip()
+
+        boundary = content_type_params.pop('boundary', '')
+        # the boundary may be transmitted as a quoted string
+        if len(boundary) >= 2 and boundary[0] == boundary[-1] == '"':
+            boundary = boundary[1:-1]
+
+        boundary_len = len(boundary)
+        if boundary == '':
+            self.send_error(
+                400,
+                'missing \'boundary\' parameter in \'Content-Type\' header field'
+            )
+            return
+
+        if boundary_len > 70:
+            self.send_error(
+                400,
+                '\'boundary\' parameter value in \'Content-Type\' too long. ' +
+                f'Is {boundary_len} characters long, must be at most 70.'
+            )
+            return
+
+        # 'boundary' has been popped above, anything left is unexpected
+        if content_type_params:
+            self.send_error(
+                400,
+                'None other than \'boundary\' parameter in \'Content-Type\' ' +
+                'header expected. Also received ' +
+                f'{", ".join(content_type_params)}'
+            )
+            return
+
+        # only acknowledge with an interim response if the client asked for it
+        if h_expect is not None and h_expect.strip().lower() == '100-continue':
+            if not self.handle_expect_100():
+                return
+
+        # read the first 4-bytes of the body to check if it has a preamble
+        # indication
+
+        # well great... curl is not RFC 1341 compliant. And RFC 1341 is asking
+        # for tolerance towards non-compliant clients...
+
+        self.send_response(200, 'OK')
+        self.end_headers()
+
--- a/src/byteb4rb1e/utils/http/server/__main__.py
+++ b/src/byteb4rb1e/utils/http/server/__main__.py
@ -0,0 +1,88 @@
+from argparse import (
+    ArgumentParser,
+    ArgumentDefaultsHelpFormatter,
+    RawDescriptionHelpFormatter
+)
+from dataclasses import dataclass
+from http.server import HTTPServer
+from io import BytesIO, IOBase
+from typing import Optional, Tuple, List
+
+from byteb4rb1e.utils.http.server import (
+    HandlerOptions,
+    MultipartUploadHandler,
+    ServerOptions,
+)
+from byteb4rb1e.utils.io import ChunksIO
+
+
+# NOTE: this docstring doubles as the ``description`` of the argument parser
+# defined in this module, so its text is what the command line help shows.
+__doc__ = """tsmuds - Tiara's Simple Multipart Upload Debugging Server
+
+This is a simple standalone implementation of a HTTP/1.x multipart upload server
+using the Python 3.9+ standard library - with an interface catered explicitly
+towards debugging misbehaved clients.
+
+Examples:
+
+    python3 tsmuds.py --port 8000
+"""
+__author__ = "Tiara Rodney <tiara.rodney@administratrix.de>"
+
+
+class CustomArgparseFormatter(
+    ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter
+):
+    """help formatter combining two stock argparse formatters: argument
+    defaults are appended to the help texts while the description and epilog
+    are emitted without being re-wrapped.
+    """
+
+
+# command line interface of the debugging server; the module docstring is
+# reused as description and printed as-is by the custom formatter
+argparser = ArgumentParser(
+    prog = 'byteb4rb1e.http.server',
+    formatter_class = CustomArgparseFormatter,
+    description = __doc__,
+    epilog = f"""(c) 2025, {__author__}
+
+This software is licensed under the Creative Commons Attribution 4.0
+International License (CC BY 4.0). For more details, visit:
+https://creativecommons.org/licenses/by/4.0/
+"""
+)
+
+argparser.add_argument(
+    '--port',
+    type=int,
+    default=ServerOptions.port,
+    help="bind to this port"
+)
+
+argparser.add_argument(
+    '-b',
+    '--bind',
+    type=str,
+    default=ServerOptions.hostname,
+    help="bind to this address"
+)
+
+argparser.add_argument(
+    '--max-chunk-size',
+    type=int,
+    metavar='INT',
+    # integer division keeps the default an ``int``, like any parsed value
+    default=(ChunksIO.max_chunk_size // (1024 ** 2)),
+    help="""maximum allowed size of chunk (in MiB) when RFC 9112 chunk
+            transfer encoding is requested by client"""
+)
+
+args = argparser.parse_args()
+
+server_options = ServerOptions(
+    hostname = args.bind,
+    port = args.port,
+    handler = HandlerOptions(
+        # the command line takes MiB whereas the option holds bytes
+        max_chunk_size = args.max_chunk_size * (1024 ** 2)
+    ),
+)
+
+# HTTPServer expects its address as a ``(host, port)`` tuple
+server_address = (server_options.hostname, server_options.port)
+
+with HTTPServer(server_address, MultipartUploadHandler) as httpd:
+    print("serving at port", server_options.port)
+    httpd.serve_forever()
--- a/src/byteb4rb1e/utils/io/__init__.py
+++ b/src/byteb4rb1e/utils/io/__init__.py
@ -0,0 +1,233 @@
+from io import BytesIO, IOBase
+import math
+from typing import Iterator, List, Optional, Tuple
+
+
+class ChunksIO(IOBase):
+    """handler for HTTP/1.1 chunked transfer-encoded (RFC 9112 §7) byte streams
+
+    Compact and predictable implementation of a RFC 9112 stream handler, which
+    exposes a common IOBase interface for treating chunked byte streams as
+    pure, unencoded byte streams.
+
+    .. notice::
+
+        The implementation is currently only concerned with read operations,
+        though the layout is prepared for an easy straightforward implementation
+        of write operations.
+
+    .. notice::
+
+        Chunk extensions are not supported and the trailer section following
+        the terminating chunk is left unread on the underlying stream.
+    """
+    #: maximum allowed size of a chunk
+    # MiB by default, just guessing 10 MiB is a sensible limit
+    max_chunk_size = int(10 * (1024 ** 2))
+    #: optional write-through buffer
+    _buffer: Optional[BytesIO]
+    #: sizes of all chunks seen so far, as tuples of the size of the chunk size
+    #  line (including its CRLF) and the size of the chunk data
+    _chunks_size: List[Tuple[int, int]]
+    #: number of data bytes of the current chunk that have not been read yet
+    _chunk_remaining: int
+    #: index of current chunk
+    _current_chunk: int
+    #: cursor position on the underlying stream, as the stream is not expected
+    #  to implement ``tell()``. Limiting factor of how large the stream may be.
+    #  Look at ``sys.maxsize`` for more information.
+    _cursor: int
+    #: whether the terminating (zero sized) chunk has been reached
+    _eof: bool
+    #: chunk encoded stream
+    _stream: BytesIO
+
+    def __init__(
+        self,
+        stream: BytesIO,
+        buffer: Optional[BytesIO] = None,
+    ):
+        """initialize the instance
+
+        .. notice::
+
+            The write-through buffer is required to be seekable, writable and
+            readable and MUST be considered locked during any operation of the
+            ChunksIO implementation. The buffer's cursor position does not
+            reflect the cursor position of the underlying stream.
+
+        :param stream: a byte-stream to abstract
+        :param buffer: write-through buffer for all read operations on the
+                       underlying stream. This can be useful, if the data needs
+                       to be accessed again later on.
+
+        :raises Exception: if the stream is not readable or the buffer is not
+                           writable
+        """
+        if not stream.readable():
+            raise Exception('expected readable stream')
+
+        if buffer is not None and not buffer.writable():
+            raise Exception('expected writable buffer')
+
+        self._buffer = buffer
+        self._chunks_size = []
+        self._chunk_remaining = 0
+        self._current_chunk = 0
+        self._cursor = 0
+        self._eof = False
+        self._stream = stream
+
+        super().__init__()
+
+    @staticmethod
+    def get_chunk_size(
+        stream: BytesIO,
+        max_size: int,
+    ) -> Tuple[int, int]:
+        """get the size of the next chunk from a RFC 9112 (§7) chunk encoded
+        byte stream
+
+        stream cursor position is assumed to be at the start of the chunk size
+        line, which consists of hexadecimal digits followed by CRLF. The number
+        of bytes read is bounded by the number of hexadecimal digits needed to
+        express max_size, so a misbehaving peer can't make this read
+        arbitrarily long.
+
+        :param stream: the stream to read the chunk size from
+        :param max_size: the maximum allowed size a chunk can be. I wasn't able
+                         to find a definitive limit defined in the RFC so this
+                         is guess working and at least curl has a pretty big
+                         chunk size of more than 6 MiB.
+
+        :returns: tuple of the size of the chunk size line (including CRLF)
+                  and the data bytes size
+
+        :raises ValueError: if the stream ends early, the line holds no or
+                            invalid digits, the size exceeds max_size or no
+                            terminator is found within the allowed length
+        """
+        terminator = b'\r\n'
+        hexdigits = b'0123456789abcdefABCDEF'
+        max_digits = len(f'{max_size:x}')
+        line = b''
+
+        # the iteration could be handled with less system calls by reading a
+        # larger *chunk* of data and iterating over that in-memory cache.
+        # Though, this would come at the expense of unpredictable memory
+        # consumption and would require a write-through buffer by default, in
+        # addition to making the implementation more complex.
+        for _ in range(max_digits + len(terminator)):
+            byte = stream.read(1)
+
+            if not byte:
+                raise ValueError(
+                    'stream ended before the chunk size was terminated'
+                )
+
+            line += byte
+
+            if not line.endswith(terminator):
+                continue
+
+            digits = line[:-len(terminator)]
+
+            if not digits:
+                raise ValueError(
+                    'terminator reached without having parsed ' +
+                    'any byte size'
+                )
+
+            # int() is more lenient than the RFC (sign, underscores, ...)
+            if any(digit not in hexdigits for digit in digits):
+                raise ValueError(f'invalid chunk size: {digits!r}')
+
+            data_size = int(digits, 16)
+
+            if data_size > max_size:
+                raise ValueError(
+                    f'chunk size of {data_size} bytes exceeds max chunk ' +
+                    f'size of {max_size / (1024 ** 2)} MiB'
+                )
+
+            return (len(line), data_size)
+
+        raise ValueError(
+            'unable to reach terminator with a max chunk size of ' +
+            f'{max_size / (1024 ** 2)} MiB'
+        )
+
+    def _open_next_chunk(self) -> None:
+        """parse the size line of the next chunk and make it the current one
+        """
+        index = len(self._chunks_size)
+
+        try:
+            chunk_size = ChunksIO.get_chunk_size(
+                self._stream,
+                self.max_chunk_size
+            )
+        except ValueError as e:
+            raise ValueError(f'chunk #{index}: {e}') from e
+
+        self._chunks_size.append(chunk_size)
+        self._current_chunk = index
+        self._cursor += chunk_size[0]
+        self._chunk_remaining = chunk_size[1]
+
+        # a zero sized chunk terminates the stream
+        if chunk_size[1] == 0:
+            self._eof = True
+
+    def _close_chunk(self) -> None:
+        """consume the CRLF that follows the data of the current chunk
+        """
+        terminator = self._stream.read(2)
+
+        if terminator != b'\r\n':
+            raise ValueError(
+                f'chunk #{self._current_chunk}: chunk data is not ' +
+                'terminated by CRLF'
+            )
+
+        self._cursor += len(terminator)
+
+    def read(self, size = -1) -> bytes:
+        """read an arbitrary amount of decoded data from the underlying stream.
+
+        The read transparently crosses as many chunk boundaries as needed and
+        stops at the terminating chunk. Everything returned is also written to
+        the write-through buffer, if there is one.
+
+        :param size: maximum number of data bytes to return. ``None`` or a
+                     negative value reads until the terminating chunk.
+
+        :returns: the decoded data, shorter than ``size`` only if the
+                  terminating chunk has been reached
+
+        :raises ValueError: if the stream is malformed or ends early
+        """
+        wanted = None if size is None or size < 0 else size
+        parts = []
+
+        while not self._eof and (wanted is None or wanted > 0):
+            if self._chunk_remaining == 0:
+                self._open_next_chunk()
+                continue
+
+            if wanted is None:
+                length = self._chunk_remaining
+            else:
+                length = min(wanted, self._chunk_remaining)
+
+            data = self._stream.read(length)
+
+            if len(data) != length:
+                raise ValueError(
+                    f'chunk #{self._current_chunk}: stream yielded too few bytes'
+                )
+
+            self._cursor += length
+            self._chunk_remaining -= length
+
+            if self._chunk_remaining == 0:
+                self._close_chunk()
+
+            parts.append(data)
+
+            if wanted is not None:
+                wanted -= length
+
+        buffer = b''.join(parts)
+
+        if self._buffer is not None: self._buffer.write(buffer)
+
+        return buffer
+
+    def readable(self) -> bool:
+        """whether the stream can be read from, which is always the case
+        """
+        return True
+
+    def readChunk(self) -> bytes:
+        """read until the end of a chunk
+
+        The cursor must be at the start position of a chunk, otherwise an
+        exception is raised.
+
+        :returns: the data of the chunk, empty once the terminating chunk has
+                  been reached
+
+        :raises Exception: if the current chunk has been read partially
+        """
+        if self._chunk_remaining == 0 and not self._eof:
+            self._open_next_chunk()
+
+        chunk_size = self._chunks_size[self._current_chunk][1]
+
+        if self._chunk_remaining != chunk_size:
+            raise Exception(
+                'cursor not at starting position of a chunk. Mixing ' +
+                'read() and readChunk() calls is currently not supported.'
+            )
+
+        # read() takes care of the cursor and the write-through buffer
+        return self.read(chunk_size)
+
+    def readChunks(self) -> Iterator[bytes]:
+        """yield all chunks until the terminating 0 byte chunk is reached
+        """
+        while True:
+            chunk = self.readChunk()
+
+            if not chunk:
+                return
+
+            yield chunk
+
+    def tell(self) -> int:
+        """return the current position on the underlying (encoded) stream
+        """
+        return self._cursor
--- a/src/byteb4rb1e/utils/string.py
+++ b/src/byteb4rb1e/utils/string.py
@ -0,0 +1,91 @@
+from typing import Optional
+
+
+class RollingHash:
+    """implementation of Rabin-Karp rolling hash
+    """
+    #: default base
+    base: int = 31
+    #: default modulus
+    mod: int = 10**9 + 7
+    #: current computed hash
+    _hash: int
+    #: prime number base (e.g., 31)
+    _base: int
+    #: large prime modulus (to prevent overflow)
+    _mod: int
+    # Precomputation of ``base^(length-1) % mod`` for removing the old byte when
+    # rolling over
+    _hbase_factor: int
+
+    def __init__(
+        self,
+        data: bytes,
+        base: Optional[int] = None,
+        mod: Optional[int] = None
+    ):
+        """Initialize the rolling hash over an initial window of data.
+
+        :param data: initial window, its length is the length of the window
+                     that is rolled over later on
+        :param base: Prime number base (e.g., 31), falls back to
+                     :attr:`RollingHash.base` if omitted
+        :param mod: Large prime modulus to prevent overflow, falls back to
+                    :attr:`RollingHash.mod` if omitted
+        """
+        self._base = base if base else RollingHash.base
+
+        self._mod = mod if mod else RollingHash.mod
+
+        self._hash = RollingHash.compute_initial_hash(
+            data,
+            self._base,
+            self._mod
+        )
+
+        self._hbase_factor = pow(self._base, len(data) - 1, self._mod)
+
+    @staticmethod
+    def compute_initial_hash(
+        data: bytes,
+        base: int,
+        mod: int,
+    ) -> int:
+        """Compute the polynomial hash of data.
+
+        rather use this standalone for computing the hash of the search pattern,
+        to avoid the overhead of instantiating an object.
+
+        :param data: data to build hash for
+        :param base: base of the polynomial
+        :param mod: modulus applied after every step
+
+        :returns: hash of data
+        """
+        hash_ = 0
+        for byte in data:
+            # computing the modulus at each iteration, as to avoid the summed
+            # integer to be chunky, as in HUUUUGEE...
+            hash_ = (hash_ * base + byte) % mod
+        return hash_
+
+    def roll(self, old_byte: int, new_byte: int) -> int:
+        """Efficiently update hash by removing ``old_byte`` and adding
+        ``new_byte``
+
+        The old_byte removal uses a pre-computed value of the highest base used
+        in the polynomial calculation. This speeds things up a bit.
+
+        I was thinking about a way on how to store the old_byte efficiently
+        within the class object, but that would require storing the entire data,
+        basically doubling the memory consumption as the data must definitely
+        also live outside of the class object. A memoryview could solve this
+        problem, but at the cost of making the implementation more complex, so
+        this will have to do.
+
+        :param old_byte: The ordinal of the first byte in buffer to roll over
+        :param new_byte: The ordinal of the byte newly appended to the buffer
+
+        :returns: hash of the window after rolling
+        """
+        # the instance's base and modulus must be used here, the class level
+        # defaults would yield wrong hashes for customized instances
+        # Remove old
+        self._hash = (self._hash - old_byte * self._hbase_factor) % self._mod
+        # Add new
+        self._hash = (self._hash * self._base + new_byte) % self._mod
+
+        return self._hash