Merge branch 'feature/4' into dev
ID: 4
Type: feature
Title: implement Rabin-Karp rolling hash algorithm
Status: done
Priority: high
Created: 2025-05-05
Description: After testing a couple of string search algorithms, I've ditched
the idea of using KMP, as in my use case it gives no advantage over
naive searching. In addition, I've come upon the challenge that many
string search algorithms are optimized for searching a linear
buffer, which is not applicable in my case, as the implementation
the search algorithm is intended for uses a circular buffer. Rabin-Karp
seemed promising. I've come up with a different approach though,
which is still based on rolling hashes; therefore, as a base, I
need an implementation of the original Rabin-Karp rolling hash
algorithm.
This commit is contained in:
commit
b1ca25bc52
4 changed files with 153 additions and 1 deletions
2
TODO
2
TODO
|
|
@ -83,7 +83,7 @@ Description: move the unit test suites to a unit/ subdirectory so that
|
||||||
ID: 4
|
ID: 4
|
||||||
Type: feature
|
Type: feature
|
||||||
Title: implement Rabin-Karp rolling hash algorithm
|
Title: implement Rabin-Karp rolling hash algorithm
|
||||||
Status: in-progress
|
Status: done
|
||||||
Priority: high
|
Priority: high
|
||||||
Created: 2025-05-05
|
Created: 2025-05-05
|
||||||
Description: After testing a couple of string search algorithms, I've ditched
|
Description: After testing a couple of string search algorithms, I've ditched
|
||||||
|
|
|
||||||
91
src/byteb4rb1e_utils/string.py
Normal file
91
src/byteb4rb1e_utils/string.py
Normal file
|
|
@ -0,0 +1,91 @@
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
class RollingHash:
    """Rabin-Karp polynomial rolling hash over a fixed-length window of bytes.

    For a window ``b[0], ..., b[n-1]`` the hash is::

        (b[0] * base^(n-1) + b[1] * base^(n-2) + ... + b[n-1]) % mod

    The instance only stores the current hash and the pre-computed factor
    ``base^(n-1) % mod``; the window contents themselves live with the caller,
    who passes the outgoing and incoming byte to :meth:`roll`.
    """

    #: default base
    base: int = 31

    #: default modulus
    mod: int = 10**9 + 7

    #: current computed hash
    _hash: int

    #: base in effect for this instance (a prime number, e.g. 31)
    _base: int

    #: modulus in effect for this instance (a large prime keeps the hash
    #: bounded while spreading values well)
    _mod: int

    # Precomputation of ``base^(length-1) % mod`` for removing the old byte
    # when rolling over
    _hbase_factor: int

    def __init__(
        self,
        data: bytes,
        base: Optional[int] = None,
        mod: Optional[int] = None,
    ):
        """Initialize the rolling hash from the first window.

        :param data: content of the initial window; its length fixes the
            window length for all subsequent :meth:`roll` calls
        :param base: polynomial base (e.g. a prime such as 31); a falsy value
            (``None`` or ``0``) selects :attr:`RollingHash.base`
        :param mod: modulus (e.g. a large prime); a falsy value (``None`` or
            ``0``) selects :attr:`RollingHash.mod`
        """
        self._base = base if base else RollingHash.base
        self._mod = mod if mod else RollingHash.mod

        self._hash = RollingHash.compute_initial_hash(
            data,
            self._base,
            self._mod,
        )

        # Weight of the oldest byte in the window. The exponent is clamped at
        # zero so that an empty window does not ask ``pow`` for a modular
        # inverse (exponent -1), which fails when base and mod are not
        # coprime. Rolling an empty window is not meaningful either way.
        self._hbase_factor = pow(
            self._base,
            max(len(data) - 1, 0),
            self._mod,
        )

    @staticmethod
    def compute_initial_hash(
        data: bytes,
        base: int,
        mod: int,
    ) -> int:
        """Compute the polynomial hash of ``data`` from scratch.

        Prefer calling this standalone for computing the hash of the search
        pattern, to avoid the overhead of instantiating an object.

        :param data: data to build the hash for
        :param base: polynomial base
        :param mod: modulus applied after every step

        :returns: hash of data (``0`` for empty data)
        """
        hash_ = 0
        for byte in data:
            # Reduce at every step so the intermediate integer never grows
            # beyond roughly ``mod * base``.
            hash_ = (hash_ * base + byte) % mod
        return hash_

    def roll(self, old_byte: int, new_byte: int) -> int:
        """Update the hash by removing ``old_byte`` and adding ``new_byte``.

        The removal of ``old_byte`` uses the pre-computed highest power of the
        base used in the polynomial, so a roll costs a constant number of
        arithmetic operations regardless of the window length.

        The window contents are deliberately not stored in the instance: the
        data definitely also lives outside of it, so keeping a copy would
        double the memory consumption. A memoryview could avoid the copy, but
        at the cost of a more complex implementation; therefore the caller
        supplies the outgoing byte.

        :param old_byte: The ordinal of the first byte in buffer to roll over
        :param new_byte: The ordinal of the byte newly appended to the buffer

        :returns: the hash of the window after the roll
        """
        # Use the per-instance base and modulus; the class-level ``base`` and
        # ``mod`` are only defaults and differ when overrides were passed to
        # the constructor.
        # Remove old (Python's ``%`` keeps the result non-negative).
        self._hash = (self._hash - old_byte * self._hbase_factor) % self._mod
        # Shift the remaining bytes up by one power and add new
        self._hash = (self._hash * self._base + new_byte) % self._mod

        return self._hash
|
||||||
0
tests/unit/byteb4rb1e_utils/string/__init__.py
Normal file
0
tests/unit/byteb4rb1e_utils/string/__init__.py
Normal file
61
tests/unit/byteb4rb1e_utils/string/test_rolling_hash.py
Normal file
61
tests/unit/byteb4rb1e_utils/string/test_rolling_hash.py
Normal file
|
|
@ -0,0 +1,61 @@
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from byteb4rb1e_utils.string import RollingHash
|
||||||
|
|
||||||
|
class test_compute_initial_hash(unittest.TestCase):
    """RollingHash.compute_initial_hash()

    The expected hash below was worked out by hand and serves as the
    reference value for this test case.
    """

    def test_default(self):
        """computation of hash"""
        payload = b'abcdefg'
        computed = RollingHash.compute_initial_hash(
            payload,
            base=31,
            mod=10**9 + 7,
        )

        self.assertEqual(computed, 988021244)
|
||||||
|
|
||||||
|
class test___init__(unittest.TestCase):
    """RollingHash.__init__()

    Checks that a freshly constructed instance carries the expected state.
    The reference hash and highest base factor were worked out by hand.
    """

    def test_default(self):
        """computation of initial hash and highest base factor"""
        rolling = RollingHash(b'abcdefg')

        self.assertEqual(rolling._hash, 988021244)
        self.assertEqual(rolling._hbase_factor, 887503681)

    def test_defaults_override(self):
        """override of defaults"""
        overrides = {'base': 9, 'mod': 4}
        rolling = RollingHash(b'abcdefg', **overrides)

        self.assertEqual(rolling._mod, 4)
        self.assertEqual(rolling._base, 9)
|
||||||
|
|
||||||
|
|
||||||
|
class test_roll(unittest.TestCase):
    """RollingHash.roll()"""

    def test_rolling_hash(self):
        # Rolling "f" out of b"foobar" and "n" in must give the same hash as
        # hashing the resulting window b"oobarn" from scratch.
        params = (31, 10**9 + 7)

        window = RollingHash(b"foobar", base=params[0], mod=params[1])
        after_roll = window.roll(ord("f"), ord("n"))

        from_scratch = RollingHash.compute_initial_hash(b"oobarn", *params)

        self.assertEqual(after_roll, from_scratch)
|
||||||
Loading…
Add table
Add a link
Reference in a new issue