From e67a95f15a9d91929d9bed8ed8aac0f3f16717ee Mon Sep 17 00:00:00 2001
From: "Rodney, Tiara"
Date: Mon, 5 May 2025 00:47:17 +0200
Subject: [PATCH] feat(string): init Rabin-Karp rolling hash

Implemented Rabin-Karp rolling hash class abstraction.

After testing multiple algorithms for efficient substring searching in a stream
abstracted by a ring buffer, I've dropped the idea of using KMP in favor of
implementing my own algorithm based on the Rabin-Karp rolling hash algorithm.
---
 src/byteb4rb1e_utils/string.py                | 92 +++++++++++++++++++
 .../unit/byteb4rb1e_utils/string/__init__.py  |  0
 .../string/test_rolling_hash.py               | 73 +++++++++++++++
 3 files changed, 165 insertions(+)
 create mode 100644 src/byteb4rb1e_utils/string.py
 create mode 100644 tests/unit/byteb4rb1e_utils/string/__init__.py
 create mode 100644 tests/unit/byteb4rb1e_utils/string/test_rolling_hash.py

diff --git a/src/byteb4rb1e_utils/string.py b/src/byteb4rb1e_utils/string.py
new file mode 100644
index 0000000..800a5bc
--- /dev/null
+++ b/src/byteb4rb1e_utils/string.py
@@ -0,0 +1,92 @@
+from typing import Optional
+
+
+class RollingHash:
+    """implementation of Rabin-Karp rolling hash
+    """
+    #: default base
+    base: int = 31
+    #: default modulus
+    mod: int = 10**9 + 7
+    #: current computed hash
+    _hash: int
+    #: prime number base (e.g., 31)
+    _base: int
+    #: large prime modulus (to prevent overflow)
+    _mod: int
+    # Precomputation of ``base^(length-1) % mod`` for removing the old byte when
+    # rolling over
+    _hbase_factor: int
+
+    def __init__(
+        self,
+        data: bytes,
+        base: Optional[int] = None,
+        mod: Optional[int] = None
+    ):
+        """Initialize the rolling hash with a given base and modulus.
+
+        :param data: initial window; its length fixes the window size
+        :param base: Prime number base, default ``RollingHash.base``
+        :param mod: Large prime modulus, default ``RollingHash.mod``
+        """
+        self._base = base if base else RollingHash.base
+
+        self._mod = mod if mod else RollingHash.mod
+
+        self._hash = RollingHash.compute_initial_hash(
+            data,
+            self._base,
+            self._mod
+        )
+
+        self._hbase_factor = pow(self._base, len(data) - 1, self._mod)
+
+    @staticmethod
+    def compute_initial_hash(
+        data: bytes,
+        base: int,
+        mod: int,
+    ) -> int:
+        """Compute the hash for the initial window (all bytes of ``data``).
+
+        rather use this standalone for computing the hash of the search pattern,
+        to avoid the overhead of instantiating an object.
+
+        :param data: data to build hash for
+        :param base: base of the polynomial hash
+        :param mod: modulus applied after every step
+
+        :returns: hash of data
+        """
+        hash_ = 0
+        for byte in data:
+            # computing the modulus at each iteration, as to avoid the summed
+            # integer to be chunky, as in HUUUUGEE...
+            hash_ = (hash_ * base + byte) % mod
+        return hash_
+
+    def roll(self, old_byte: int, new_byte: int) -> int:
+        """Efficiently update hash by removing ``old_byte`` and adding
+        ``new_byte``
+
+        The old_byte removal uses a pre-computed value of the highest base used
+        in the polynomial calculation. This speeds things up a bit.
+
+        I was thinking about a way on how to store the old_byte efficiently
+        within the class object, but that would require storing the entire data,
+        basically doubling the memory consumption as the data must definitely
+        also live outside of the class object. A memoryview could solve this
+        problem, but at the cost of making the implementation more complex, so
+        this will have to do.
+
+        :param old_byte: The ordinal of the first byte in buffer to roll over
+        :param new_byte: The ordinal of the byte newly appended to the buffer
+        :returns: The hash of the window after rolling
+        """
+        # Remove old (instance modulus, consistent with _hbase_factor)
+        self._hash = (self._hash - old_byte * self._hbase_factor) % self._mod
+        # Add new
+        self._hash = (self._hash * self._base + new_byte) % self._mod
+
+        return self._hash
diff --git a/tests/unit/byteb4rb1e_utils/string/__init__.py b/tests/unit/byteb4rb1e_utils/string/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/unit/byteb4rb1e_utils/string/test_rolling_hash.py b/tests/unit/byteb4rb1e_utils/string/test_rolling_hash.py
new file mode 100644
index 0000000..52c146a
--- /dev/null
+++ b/tests/unit/byteb4rb1e_utils/string/test_rolling_hash.py
@@ -0,0 +1,73 @@
+import unittest
+
+from byteb4rb1e_utils.string import RollingHash
+
+class test_compute_initial_hash(unittest.TestCase):
+    """RollingHash.compute_initial_hash()
+
+    i calculated the hashes by hand, as a basis for this test case. Hopefully
+    there's no logical flaw...
+    """
+
+    def test_default(self):
+        """computation of hash"""
+        result = RollingHash.compute_initial_hash(
+            b'abcdefg',
+            base = 31,
+            mod = 10**9 + 7
+        )
+
+        self.assertEqual(result, 988021244)
+
+class test___init__(unittest.TestCase):
+    """RollingHash.__init__()
+
+    Make sure the class instance is initialized correctly
+
+    I calculated the hashes by hand, as a basis for this test case. Hopefully
+    there's no logical flaw...
+    """
+
+    def test_default(self):
+        """computation of initial hash and highest base factor"""
+        instance = RollingHash(b'abcdefg')
+
+        self.assertEqual(instance._hash, 988021244)
+        self.assertEqual(instance._hbase_factor, 887503681)
+
+    def test_defaults_override(self):
+        """override of defaults"""
+        instance = RollingHash(
+            b'abcdefg',
+            base = 9,
+            mod = 4
+        )
+
+        self.assertEqual(instance._mod, 4)
+        self.assertEqual(instance._base, 9)
+
+
+class test_roll(unittest.TestCase):
+    """RollingHash.roll()"""
+    def test_rolling_hash(self):
+        base=31
+        mod=10**9 + 7
+
+        rh = RollingHash(b"foobar", base=base, mod=mod)
+        rolled_hash = rh.roll(ord("f"), ord("n"))
+
+        control_hash = RollingHash.compute_initial_hash(b"oobarn", base, mod)
+
+        self.assertEqual(rolled_hash, control_hash)
+
+    def test_rolling_hash_custom_base_and_mod(self):
+        """roll() must use the per-instance base/mod, not the class defaults"""
+        base = 257
+        mod = 10**9 + 9
+
+        rh = RollingHash(b"foobar", base=base, mod=mod)
+        rolled_hash = rh.roll(ord("f"), ord("n"))
+
+        control_hash = RollingHash.compute_initial_hash(b"oobarn", base, mod)
+
+        self.assertEqual(rolled_hash, control_hash)