Commit aee6f7d7ee

Francesco Alemanno <50984334+francescoalemanno@users.noreply.github.com>
2024-10-30 14:14:12
std.hash: improve simple hashing of unsigned integers
Before, the default bit mixer was very biased, and after a lot of searching it turns out that selecting a better solution is hard. I wrote a custom statistical analysis taylored for bit mixers in order to select the best one at each size (u64/u32/u16), compared a lot of mixers, and packaged the best ones in this commit.
1 parent e2f24a2
Changed files (1)
lib
lib/std/hash.zig
@@ -37,20 +37,79 @@ pub const XxHash3 = xxhash.XxHash3;
 pub const XxHash64 = xxhash.XxHash64;
 pub const XxHash32 = xxhash.XxHash32;
 
+/// Deprecated: use std.hash.int(comptime T, input T) T where T is an unsigned integer type.
 /// This is handy if you have a u32 and want a u32 and don't want to take a
 /// detour through many layers of abstraction elsewhere in the std.hash
 /// namespace.
-/// Copied from https://nullprogram.com/blog/2018/07/31/
 pub fn uint32(input: u32) u32 {
-    var x: u32 = input;
-    x ^= x >> 16;
-    x *%= 0x7feb352d;
-    x ^= x >> 15;
-    x *%= 0x846ca68b;
-    x ^= x >> 16;
+    return int(u32, input);
+}
+
+/// Applies a bit-mangling transformation to an unsigned integer type `T`.
+/// Optimized per type: for `u16` and `u32`, Skeeto's xorshift-multiply; for `u64`, Maiga's mx3.
+/// Falls back on an avalanche pattern for other unsigned types, ensuring high entropy.
+/// Only unsigned types are accepted; signed types will raise a compile-time error.
+pub fn int(comptime T: type, input: T) T {
+    const tInfo = @typeInfo(T).int;
+    if (tInfo.signedness != .unsigned) @compileError("type has to be unsigned integer");
+    var x = input;
+    switch (T) {
+        u16 => {
+            //https://github.com/skeeto/hash-prospector
+            // 3-round xorshift-multiply (-Xn3)
+            // bias = 0.0045976709018820602
+            x = (x ^ (x >> 7)) *% 0x2993;
+            x = (x ^ (x >> 5)) *% 0xe877;
+            x = (x ^ (x >> 9)) *% 0x0235;
+            x = x ^ (x >> 10);
+        },
+        u32 => {
+            // https://github.com/skeeto/hash-prospector
+            x = (x ^ (x >> 17)) *% 0xed5ad4bb;
+            x = (x ^ (x >> 11)) *% 0xac4c1b51;
+            x = (x ^ (x >> 15)) *% 0x31848bab;
+            x = x ^ (x >> 14);
+        },
+        u64 => {
+            // https://github.com/jonmaiga/mx3
+            // https://github.com/jonmaiga/mx3/blob/48924ee743d724aea2cafd2b4249ef8df57fa8b9/mx3.h#L17
+            const C = 0xbea225f9eb34556d;
+            x = (x ^ (x >> 32)) *% C;
+            x = (x ^ (x >> 29)) *% C;
+            x = (x ^ (x >> 32)) *% C;
+            x = x ^ (x >> 29);
+        },
+        else => {
+            // this construction provides robust avalanche properties, but it is not optimal for any given size.
+            const Tsize = @bitSizeOf(T);
+            if (Tsize < 4) @compileError("not implemented.");
+            const hsize = Tsize >> 1;
+            const C = comptime blk: {
+                const max = (1 << Tsize) - 1;
+                var mul = 1;
+                while (mul * 3 < max) mul *= 3;
+                break :blk ((mul ^ (mul >> hsize)) | 1);
+            };
+            inline for (0..2) |_| {
+                x = (x ^ (x >> hsize + 1)) *% C;
+                x = (x ^ (x >> hsize - 1)) *% C;
+            }
+            x ^= (x >> hsize);
+        },
+    }
     return x;
 }
 
+test "bit manglers" {
+    const expect = @import("std").testing.expect;
+    try expect(int(u4, 1) == 0xC);
+    try expect(int(u8, 1) == 0x4F);
+    try expect(int(u16, 1) == 0x2880);
+    try expect(int(u32, 1) == 0x42741D6);
+    try expect(int(u64, 1) == 0x71894DE00D9981F);
+    try expect(int(u128, 1) == 0x50BC2BB18910C3DE0BAA2CE0D0C5B83E);
+}
+
 test {
     _ = adler;
     _ = auto_hash;