|
| 1 | +""" |
| 2 | +xxHash Algorithm |
| 3 | +
|
| 4 | +This module implements the xxHash32 algorithm, an extremely fast non-cryptographic |
| 5 | +hash function designed to operate at RAM speed limits. |
| 6 | +
|
| 7 | +xxHash is known for: |
| 8 | +- Extreme speed: One of the fastest hash algorithms available |
| 9 | +- Excellent distribution: Passes SMHasher test suite |
| 10 | +- Portability: Identical output across all platforms |
| 11 | +- Simplicity: Compact implementation with good performance |
| 12 | +
|
| 13 | +Common uses: |
| 14 | +- Zstandard compression (Facebook's compression algorithm) |
| 15 | +- LZ4 compression |
| 16 | +- rsync file synchronization |
| 17 | +- Database indexing and deduplication |
| 18 | +- Hash tables requiring high performance |
| 19 | +
|
| 20 | +The algorithm uses carefully selected prime numbers and processes data in 16-byte |
| 21 | +stripes using four parallel accumulators for maximum speed. |
| 22 | +
|
| 23 | +Note: xxHash is NOT cryptographically secure. Use SHA-256 or similar for security. |
| 24 | +
|
| 25 | +References: |
| 26 | +- https://github.com/Cyan4973/xxHash |
| 27 | +- https://xxhash.com/ |
| 28 | +- https://github.com/Cyan4973/xxHash/blob/dev/doc/xxhash_spec.md |
| 29 | +""" |
| 30 | + |
| 31 | + |
| 32 | +def _rotl32(value: int, amount: int) -> int: |
| 33 | + """ |
| 34 | + Rotate a 32-bit integer left by the specified amount. |
| 35 | +
|
| 36 | + Args: |
| 37 | + value: 32-bit integer to rotate |
| 38 | + amount: Number of bits to rotate (0-31) |
| 39 | +
|
| 40 | + Returns: |
| 41 | + Rotated 32-bit integer |
| 42 | +
|
| 43 | + >>> _rotl32(0x12345678, 8) |
| 44 | + 878082066 |
| 45 | + >>> _rotl32(0xFFFFFFFF, 1) |
| 46 | + 4294967295 |
| 47 | + """ |
| 48 | + return ((value << amount) | (value >> (32 - amount))) & 0xFFFFFFFF |
| 49 | + |
| 50 | + |
def xxhash32(data: bytes, seed: int = 0) -> int:
    """
    Compute the 32-bit xxHash (xxHash32) digest of a bytes object.

    Inputs of 16 bytes or more are first consumed in 16-byte stripes, each
    stripe feeding four little-endian 32-bit words into four accumulators that
    are then merged. Whatever is left (or the whole input when it is shorter
    than 16 bytes) is folded in four bytes at a time, then one byte at a time,
    before a final avalanche step mixes the bits.

    Args:
        data: Byte data to hash
        seed: Optional seed value (default: 0) for hash initialization

    Returns:
        32-bit hash value (0 to 4,294,967,295)

    Raises:
        TypeError: If data is not bytes
        ValueError: If seed is negative or > 32-bit max

    >>> xxhash32(b"")
    46947589

    >>> xxhash32(b"hello")
    4211111929

    >>> xxhash32(b"Hello")
    4060533391

    >>> xxhash32(b"world")
    413819571

    >>> xxhash32(b"The quick brown fox jumps over the lazy dog")
    3898516702

    >>> xxhash32(b"a")
    1426945110

    >>> xxhash32(b"abc")
    852579327

    >>> xxhash32(b"Python")
    1196663540

    >>> xxhash32(b"xxHash")
    2929943677

    >>> xxhash32(b"\\x00\\x00\\x00\\x00")
    148298089

    >>> xxhash32(b"test" * 100) != xxhash32(b"test" * 101)
    True

    >>> xxhash32(b"hello", seed=42)
    1292028262

    >>> xxhash32(b"hello", seed=0) != xxhash32(b"hello", seed=1)
    True
    """
    if not isinstance(data, bytes):
        msg = f"data must be bytes, not {type(data).__name__}"
        raise TypeError(msg)

    if seed < 0 or seed > 0xFFFFFFFF:
        msg = f"seed must be between 0 and {0xFFFFFFFF}"
        raise ValueError(msg)

    mask = 0xFFFFFFFF
    p1, p2, p3, p4, p5 = (
        0x9E3779B1,
        0x85EBCA77,
        0xC2B2AE3D,
        0x27D4EB2F,
        0x165667B1,
    )

    def rotate(word: int, bits: int) -> int:
        # 32-bit left rotation; callers always pass an already-masked word.
        return ((word << bits) | (word >> (32 - bits))) & mask

    def word_at(offset: int) -> int:
        # Little-endian 32-bit word starting at the given byte offset.
        return int.from_bytes(data[offset : offset + 4], byteorder="little")

    size = len(data)

    if size >= 16:
        # Four accumulators, one per 32-bit lane of each 16-byte stripe.
        lanes = [
            (seed + p1 + p2) & mask,
            (seed + p2) & mask,
            seed & mask,
            (seed - p1) & mask,
        ]

        # Offset just past the last complete 16-byte stripe.
        pos = size - size % 16
        for base in range(0, pos, 16):
            for k in range(4):
                mixed = rotate((lanes[k] + word_at(base + 4 * k) * p2) & mask, 13)
                lanes[k] = (mixed * p1) & mask

        # Merge the accumulators, each rotated by its own amount.
        digest = (
            sum(rotate(lane, bits) for lane, bits in zip(lanes, (1, 7, 12, 18)))
            & mask
        )
    else:
        # Short input: no stripes, start from the seed alone.
        pos = 0
        digest = (seed + p5) & mask

    digest = (digest + size) & mask

    # Remaining complete 4-byte words.
    words_end = pos + ((size - pos) // 4) * 4
    for offset in range(pos, words_end, 4):
        digest = rotate((digest + word_at(offset) * p3) & mask, 17)
        digest = (digest * p4) & mask

    # Remaining single bytes (at most three).
    for byte in data[words_end:]:
        digest = rotate((digest + byte * p5) & mask, 11)
        digest = (digest * p1) & mask

    # Final avalanche: xor-shift / multiply rounds, then a last xor-shift.
    for shift, prime in ((15, p2), (13, p3)):
        digest ^= digest >> shift
        digest = (digest * prime) & mask
    digest ^= digest >> 16

    return digest
| 183 | + |
| 184 | + |
if __name__ == "__main__":
    import doctest

    # Run the examples embedded in the docstrings before the demonstration.
    doctest.testmod()

    # Hash one sample message, first unseeded and then with a fixed seed.
    sample = b"Hello, xxHash!"
    unseeded_digest = xxhash32(sample)
    seeded_digest = xxhash32(sample, seed=42)
    print(f"xxHash32 of '{sample.decode()}': {unseeded_digest}")
    print(f"xxHash32 with seed=42: {seeded_digest}")
0 commit comments