Skip to content

Commit 6720d8d

Browse files
author
rodrigo.nogueira
committed
feat: add xxHash32 algorithm implementation
- Implement xxHash32 with 4-accumulator parallel processing
- Add 15 comprehensive doctests with seed support
- Full type hints and English documentation
- Passes ruff and mypy checks

xxHash is extremely fast and used in Zstandard, LZ4, and rsync. Features optimized processing with carefully selected prime constants.
1 parent 2c15b8c commit 6720d8d

File tree

1 file changed

+192
-0
lines changed

1 file changed

+192
-0
lines changed

hashes/xxhash.py

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
"""
2+
xxHash Algorithm
3+
4+
This module implements the xxHash32 algorithm, an extremely fast non-cryptographic
5+
hash function designed to operate at RAM speed limits.
6+
7+
xxHash is known for:
8+
- Extreme speed: One of the fastest hash algorithms available
9+
- Excellent distribution: Passes SMHasher test suite
10+
- Portability: Identical output across all platforms
11+
- Simplicity: Compact implementation with good performance
12+
13+
Common uses:
14+
- Zstandard compression (Facebook's compression algorithm)
15+
- LZ4 compression
16+
- rsync file synchronization
17+
- Database indexing and deduplication
18+
- Hash tables requiring high performance
19+
20+
The algorithm uses carefully selected prime numbers and processes data in 16-byte
21+
stripes using four parallel accumulators for maximum speed.
22+
23+
Note: xxHash is NOT cryptographically secure. Use SHA-256 or similar for security.
24+
25+
References:
26+
- https://github.com/Cyan4973/xxHash
27+
- https://xxhash.com/
28+
- https://github.com/Cyan4973/xxHash/blob/dev/doc/xxhash_spec.md
29+
"""
30+
31+
32+
def _rotl32(value: int, amount: int) -> int:
33+
"""
34+
Rotate a 32-bit integer left by the specified amount.
35+
36+
Args:
37+
value: 32-bit integer to rotate
38+
amount: Number of bits to rotate (0-31)
39+
40+
Returns:
41+
Rotated 32-bit integer
42+
43+
>>> _rotl32(0x12345678, 8)
44+
878082066
45+
>>> _rotl32(0xFFFFFFFF, 1)
46+
4294967295
47+
"""
48+
return ((value << amount) | (value >> (32 - amount))) & 0xFFFFFFFF
49+
50+
51+
def xxhash32(data: bytes, seed: int = 0) -> int:
    """
    Compute the 32-bit xxHash (XXH32) digest of a bytes object.

    Inputs of 16 bytes or more are first consumed in 16-byte stripes, each
    stripe feeding four little-endian 32-bit words into four accumulators.
    Whatever remains is mixed in 4 bytes at a time, then 1 byte at a time,
    and the result is passed through a final avalanche step.

    Args:
        data: Byte data to hash
        seed: Optional seed value (default: 0) for hash initialization

    Returns:
        32-bit hash value (0 to 4,294,967,295)

    Raises:
        TypeError: If data is not bytes
        ValueError: If seed is negative or > 32-bit max

    >>> xxhash32(b"")
    46947589

    >>> xxhash32(b"hello")
    4211111929

    >>> xxhash32(b"Hello")
    4060533391

    >>> xxhash32(b"world")
    413819571

    >>> xxhash32(b"The quick brown fox jumps over the lazy dog")
    3898516702

    >>> xxhash32(b"a")
    1426945110

    >>> xxhash32(b"abc")
    852579327

    >>> xxhash32(b"Python")
    1196663540

    >>> xxhash32(b"xxHash")
    2929943677

    >>> xxhash32(b"\\x00\\x00\\x00\\x00")
    148298089

    >>> xxhash32(b"test" * 100) != xxhash32(b"test" * 101)
    True

    >>> xxhash32(b"hello", seed=42)
    1292028262

    >>> xxhash32(b"hello", seed=0) != xxhash32(b"hello", seed=1)
    True
    """
    if not isinstance(data, bytes):
        msg = f"data must be bytes, not {type(data).__name__}"
        raise TypeError(msg)

    if seed < 0 or seed > 0xFFFFFFFF:
        msg = f"seed must be between 0 and {0xFFFFFFFF}"
        raise ValueError(msg)

    mask = 0xFFFFFFFF
    p1, p2, p3 = 0x9E3779B1, 0x85EBCA77, 0xC2B2AE3D
    p4, p5 = 0x27D4EB2F, 0x165667B1

    size = len(data)

    if size < 16:
        # Short input: no stripe phase, start directly from the seed.
        tail_start = 0
        digest = (seed + p5) & mask
    else:
        # Offset of the first byte that is not part of a full 16-byte stripe.
        tail_start = size - size % 16
        accs = [
            (seed + p1 + p2) & mask,
            (seed + p2) & mask,
            seed & mask,
            (seed - p1) & mask,
        ]

        for stripe in range(0, tail_start, 16):
            # Word k of the stripe only ever touches accumulator k.
            for slot in range(4):
                begin = stripe + 4 * slot
                word = int.from_bytes(data[begin : begin + 4], byteorder="little")
                mixed = _rotl32((accs[slot] + word * p2) & mask, 13)
                accs[slot] = (mixed * p1) & mask

        # Fold the four accumulators into one value.
        turns = (1, 7, 12, 18)
        digest = sum(_rotl32(acc, turn) for acc, turn in zip(accs, turns))
        digest &= mask

    digest = (digest + size) & mask

    # Remaining whole 4-byte words.
    words_end = tail_start + ((size - tail_start) // 4) * 4
    for begin in range(tail_start, words_end, 4):
        word = int.from_bytes(data[begin : begin + 4], byteorder="little")
        digest = _rotl32((digest + word * p3) & mask, 17)
        digest = (digest * p4) & mask

    # Remaining single bytes (at most three).
    for byte in data[words_end:]:
        digest = _rotl32((digest + byte * p5) & mask, 11)
        digest = (digest * p1) & mask

    # Final avalanche: xor-shift and multiply rounds, then a last xor-shift.
    for shift, multiplier in ((15, p2), (13, p3)):
        digest ^= digest >> shift
        digest = (digest * multiplier) & mask
    digest ^= digest >> 16

    return digest
183+
184+
185+
if __name__ == "__main__":
    import doctest

    # Run the examples embedded in this module's docstrings.
    doctest.testmod()

    # Small demonstration: hash one sample with the default seed and with 42.
    test_data = b"Hello, xxHash!"

    line = f"xxHash32 of '{test_data.decode()}': {xxhash32(test_data)}"
    print(line)

    line = f"xxHash32 with seed=42: {xxhash32(test_data, seed=42)}"
    print(line)

0 commit comments

Comments
 (0)