Merge branch 'feature/4' into dev

ID: 4
Type: feature
Title: implement Rabin-Karp rolling hash algorithm
Status: done
Priority: high
Created: 2025-05-05
Description: After testing a couple of string search algorithms, I've ditched
             the idea of using KMP, as in my use case it gives no advantage
             over naive searching. In addition, I've come upon the challenge
             that many string search algorithms are optimized for searching a
             linear buffer, which is not applicable in my case, as the
             implementation the search algorithm is for uses a circular
             buffer. Rabin-Karp seemed promising. I've come up with a
             different approach though, which is still based on rolling
             hashes; therefore, as a base, I need an implementation of the
             original Rabin-Karp rolling hash algorithm.
This commit is contained in:
Rodney, Tiara 2025-05-05 01:33:53 +02:00
commit b1ca25bc52
No known key found for this signature in database
GPG key ID: 5CD8EC1D46106723
4 changed files with 153 additions and 1 deletions

2
TODO
View file

@ -83,7 +83,7 @@ Description: move the unit test suites to a unit/ subdirectory so that
ID: 4
Type: feature
Title: implement Rabin-Karp rolling hash algorithm
Status: in-progress
Status: done
Priority: high
Created: 2025-05-05
Description: After testing a couple of string search algorithms, I've ditched

View file

@ -0,0 +1,91 @@
from typing import Optional
class RollingHash:
    """Implementation of the Rabin-Karp rolling hash.

    The hash of a window ``d[0] .. d[n-1]`` is the polynomial
    ``d[0] * base^(n-1) + d[1] * base^(n-2) + ... + d[n-1]`` reduced modulo
    ``mod``. :meth:`roll` slides the window forward by one byte by updating
    the stored hash instead of re-hashing the whole window.
    """

    #: default base
    base: int = 31

    #: default modulus
    mod: int = 10**9 + 7

    #: current computed hash
    _hash: int

    #: base used by this instance (e.g., 31)
    _base: int

    #: modulus used by this instance; applied at every step to keep the
    #: hash value small
    _mod: int

    #: Precomputation of ``base^(length-1) % mod`` for removing the old byte
    #: when rolling over
    _hbase_factor: int

    def __init__(
        self,
        data: bytes,
        base: Optional[int] = None,
        mod: Optional[int] = None,
    ):
        """Initialize the rolling hash over an initial window of data.

        :param data: initial window; its length fixes the window length used
                     by :meth:`roll`
        :param base: base of the hash polynomial (e.g., 31); a falsy value
                     (``None`` or ``0``) selects :attr:`RollingHash.base`
        :param mod: modulus of the hash; a falsy value (``None`` or ``0``)
                    selects :attr:`RollingHash.mod`
        """
        self._base = base or RollingHash.base
        self._mod = mod or RollingHash.mod

        self._hash = RollingHash.compute_initial_hash(
            data,
            self._base,
            self._mod,
        )

        # Weight of the oldest byte in the window.
        # NOTE(review): for empty ``data`` the exponent is -1 -- presumably
        # callers never pass an empty window; confirm.
        self._hbase_factor = pow(self._base, len(data) - 1, self._mod)

    @staticmethod
    def compute_initial_hash(
        data: bytes,
        base: int,
        mod: int,
    ) -> int:
        """Compute the hash of ``data`` from scratch.

        Rather use this standalone for computing the hash of the search
        pattern, to avoid the overhead of instantiating an object.

        :param data: data to build hash for
        :param base: base of the hash polynomial
        :param mod: modulus of the hash
        :returns: hash of data (``0`` for empty data)
        """
        hash_ = 0

        for byte in data:
            # Reduce at each iteration so the running value never grows
            # beyond ``mod * base``.
            hash_ = (hash_ * base + byte) % mod

        return hash_

    def roll(self, old_byte: int, new_byte: int) -> int:
        """Efficiently update hash by removing ``old_byte`` and adding
        ``new_byte``

        The old_byte removal uses a pre-computed value of the highest base
        power used in the polynomial calculation. This speeds things up a bit.

        The caller has to supply ``old_byte``: storing it inside the object
        would require keeping the entire data, basically doubling the memory
        consumption, as the data must definitely also live outside of the
        object. A memoryview could solve this, but at the cost of a more
        complex implementation.

        :param old_byte: The ordinal of the first byte in buffer to roll over
        :param new_byte: The ordinal of the byte newly appended to the buffer
        :returns: the updated hash
        """
        # Use the instance's base/modulus (not the class defaults) so the
        # update stays consistent with the hash computed in ``__init__``.
        base = self._base
        mod = self._mod

        # Remove old
        without_old = (self._hash - old_byte * self._hbase_factor) % mod

        # Add new
        self._hash = (without_old * base + new_byte) % mod

        return self._hash

View file

@ -0,0 +1,61 @@
import unittest
from byteb4rb1e_utils.string import RollingHash
class test_compute_initial_hash(unittest.TestCase):
    """RollingHash.compute_initial_hash()

    The expected hash was calculated by hand as the basis for this test
    case, so a logical flaw in that calculation would go unnoticed here.
    """

    def test_default(self):
        """computation of hash"""
        pattern = b'abcdefg'
        modulus = 10**9 + 7

        computed = RollingHash.compute_initial_hash(
            pattern,
            base=31,
            mod=modulus,
        )

        self.assertEqual(computed, 988021244)
class test___init__(unittest.TestCase):
    """RollingHash.__init__()

    Verifies that a freshly constructed instance carries the expected state.
    The expected hash values were calculated by hand, so a logical flaw in
    that calculation would go unnoticed here.
    """

    def test_default(self):
        """computation of initial hash and highest base factor"""
        rolling = RollingHash(b'abcdefg')

        # hash of the full window under the default base/modulus
        self.assertEqual(rolling._hash, 988021244)
        # 31 ** 6, the weight of the oldest byte in a 7-byte window
        self.assertEqual(rolling._hbase_factor, 887503681)

    def test_defaults_override(self):
        """override of defaults"""
        overrides = {"base": 9, "mod": 4}

        rolling = RollingHash(b'abcdefg', **overrides)

        self.assertEqual(rolling._mod, 4)
        self.assertEqual(rolling._base, 9)
class test_roll(unittest.TestCase):
    """RollingHash.roll()"""

    def test_rolling_hash(self):
        # Rolling "f" out of "foobar" and "n" in must give the same hash as
        # hashing the shifted window "oobarn" from scratch.
        params = {"base": 31, "mod": 10**9 + 7}

        roller = RollingHash(b"foobar", **params)
        after_roll = roller.roll(ord("f"), ord("n"))

        from_scratch = RollingHash.compute_initial_hash(b"oobarn", **params)

        self.assertEqual(after_roll, from_scratch)