Commit 559150e844

Dominic <4678790+dweiller@users.noreply.github.com>
2023-07-24 19:47:45
Xxhash perf (#15947)
Improvements for xxHash performance, both on small keys and on large slices.

* std.hash: make benchmarks consistent for xxhash
  There is some odd behaviour in the timings for the XxHash benchmarks introduced in 879f0b9, specifically the changes to the allocation in benchmarkHash. The problem is somewhere in the difference between 9628243 and 9362d61 (these are commits that were force-pushed over but can be found in PR #15917).
* std.hash: correctly calculate throughput in benchmark
* std.hash: add hashes per sec to small key output
* std.hash: add exact and small xxhash routines
* std.hash: add --small-only flag to benchmark
* std.hash.xxhash: extract stateless Accumulator type
* std.hash.xxhash: cleanup hash() and improve small key perf
* std.hash.xxhash: port xxhash64 changes to xxhash32
* std.hash: change benchmark --small-only flag to --include-array
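
As a usage sketch (not part of the commit), the reworked hash() now accepts slices, arrays, and pointers to arrays of u8, so a caller whose key size is comptime-known gets the specialised finalize path. The values below are illustrative only:

    const std = @import("std");
    const XxHash64 = std.hash.XxHash64;

    pub fn main() void {
        const seed: u64 = 0;

        // Runtime-length slice: goes through the generic finalize switch.
        const slice: []const u8 = "message digest";
        const a = XxHash64.hash(seed, slice);

        // Pointer to array: the key length is comptime-known, so the inline
        // switch prongs in finalize() reduce to straight-line code.
        const key = [_]u8{ 1, 2, 3, 4, 5, 6, 7, 8 };
        const b = XxHash64.hash(seed, &key);

        // Passing the array by value is accepted as well.
        const c = XxHash64.hash(seed, key);

        std.debug.print("{x} {x} {x}\n", .{ a, b, c });
    }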
1 parent d82b359
Changed files (2)
lib/std/hash/benchmark.zig
@@ -18,6 +18,7 @@ const Hash = struct {
     name: []const u8,
     has_iterative_api: bool = true,
     has_crypto_api: bool = false,
+    has_anytype_api: ?[]const comptime_int = null,
     init_u8s: ?[]const u8 = null,
     init_u64: ?u64 = null,
 };
@@ -27,11 +28,13 @@ const hashes = [_]Hash{
         .ty = hash.XxHash64,
         .name = "xxhash64",
         .init_u64 = 0,
+        .has_anytype_api = @as([]const comptime_int, &[_]comptime_int{ 8, 16, 32, 48, 64, 80, 96, 112, 128 }),
     },
     Hash{
         .ty = hash.XxHash32,
         .name = "xxhash32",
         .init_u64 = 0,
+        .has_anytype_api = @as([]const comptime_int, &[_]comptime_int{ 8, 16, 32, 48, 64, 80, 96, 112, 128 }),
     },
     Hash{
         .ty = hash.Wyhash,
@@ -99,14 +102,14 @@ const Result = struct {
 };
 
 const block_size: usize = 8 * 8192;
-const alignment: usize = 64;
 
 pub fn benchmarkHash(comptime H: anytype, bytes: usize, allocator: std.mem.Allocator) !Result {
-    const blocks_count = bytes / block_size;
-    var blocks = try allocator.alloc(u8, block_size + alignment * (blocks_count - 1));
+    var blocks = try allocator.alloc(u8, bytes);
     defer allocator.free(blocks);
     random.bytes(blocks);
 
+    const block_count = bytes / block_size;
+
     var h = blk: {
         if (H.init_u8s) |init| {
             break :blk H.ty.init(init[0..H.ty.key_length]);
@@ -118,17 +121,17 @@ pub fn benchmarkHash(comptime H: anytype, bytes: usize, allocator: std.mem.Alloc
     };
 
     var timer = try Timer.start();
-    const start = timer.lap();
-    for (0..blocks_count) |i| {
-        h.update(blocks[i * alignment ..][0..block_size]);
+    for (0..block_count) |i| {
+        h.update(blocks[i * block_size ..][0..block_size]);
     }
     const final = if (H.has_crypto_api) @as(u64, @truncate(h.finalInt())) else h.final();
     std.mem.doNotOptimizeAway(final);
 
-    const end = timer.read();
+    const elapsed_ns = timer.read();
 
-    const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
-    const throughput = @as(u64, @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s));
+    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
+    const size_float: f64 = @floatFromInt(block_size * block_count);
+    const throughput: u64 = @intFromFloat(size_float / elapsed_s);
 
     return Result{
         .hash = final,
@@ -144,7 +147,6 @@ pub fn benchmarkHashSmallKeys(comptime H: anytype, key_size: usize, bytes: usize
     const key_count = bytes / key_size;
 
     var timer = try Timer.start();
-    const start = timer.lap();
 
     var sum: u64 = 0;
     for (0..key_count) |i| {
@@ -164,10 +166,59 @@ pub fn benchmarkHashSmallKeys(comptime H: anytype, key_size: usize, bytes: usize
         };
         sum +%= final;
     }
-    const end = timer.read();
+    const elapsed_ns = timer.read();
+
+    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
+    const size_float: f64 = @floatFromInt(key_count * key_size);
+    const throughput: u64 = @intFromFloat(size_float / elapsed_s);
+
+    std.mem.doNotOptimizeAway(sum);
+
+    return Result{
+        .hash = sum,
+        .throughput = throughput,
+    };
+}
+
+// the array and array pointer benchmarks for xxhash are very sensitive to in-lining,
+// if you see strange performance changes consider using `.never_inline` or `.always_inline`
+// to ensure the changes are not only due to the optimiser inlining the benchmark differently
+pub fn benchmarkHashSmallKeysArrayPtr(
+    comptime H: anytype,
+    comptime key_size: usize,
+    bytes: usize,
+    allocator: std.mem.Allocator,
+) !Result {
+    var blocks = try allocator.alloc(u8, bytes);
+    defer allocator.free(blocks);
+    random.bytes(blocks);
+
+    const key_count = bytes / key_size;
+
+    var timer = try Timer.start();
+
+    var sum: u64 = 0;
+    for (0..key_count) |i| {
+        const small_key = blocks[i * key_size ..][0..key_size];
+        const final: u64 = blk: {
+            if (H.init_u8s) |init| {
+                if (H.has_crypto_api) {
+                    break :blk @truncate(H.ty.toInt(small_key, init[0..H.ty.key_length]));
+                } else {
+                    break :blk H.ty.hash(init, small_key);
+                }
+            }
+            if (H.init_u64) |init| {
+                break :blk H.ty.hash(init, small_key);
+            }
+            break :blk H.ty.hash(small_key);
+        };
+        sum +%= final;
+    }
+    const elapsed_ns = timer.read();
 
-    const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
-    const throughput = @as(u64, @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s));
+    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
+    const throughput: u64 = @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s);
 
     std.mem.doNotOptimizeAway(sum);
 
@@ -177,6 +228,95 @@ pub fn benchmarkHashSmallKeys(comptime H: anytype, key_size: usize, bytes: usize
     };
 }
 
+// the array and array pointer benchmarks for xxhash are very sensitive to in-lining,
+// if you see strange performance changes consider using `.never_inline` or `.always_inline`
+// to ensure the changes are not only due to the optimiser inlining the benchmark differently
+pub fn benchmarkHashSmallKeysArray(
+    comptime H: anytype,
+    comptime key_size: usize,
+    bytes: usize,
+    allocator: std.mem.Allocator,
+) !Result {
+    var blocks = try allocator.alloc(u8, bytes);
+    defer allocator.free(blocks);
+    random.bytes(blocks);
+
+    const key_count = bytes / key_size;
+
+    var i: usize = 0;
+    var timer = try Timer.start();
+
+    var sum: u64 = 0;
+    while (i < key_count) : (i += 1) {
+        const small_key = blocks[i * key_size ..][0..key_size];
+        const final: u64 = blk: {
+            if (H.init_u8s) |init| {
+                if (H.has_crypto_api) {
+                    break :blk @truncate(H.ty.toInt(small_key, init[0..H.ty.key_length]));
+                } else {
+                    break :blk H.ty.hash(init, small_key.*);
+                }
+            }
+            if (H.init_u64) |init| {
+                break :blk H.ty.hash(init, small_key.*);
+            }
+            break :blk H.ty.hash(small_key.*);
+        };
+        sum +%= final;
+    }
+    const elapsed_ns = timer.read();
+
+    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
+    const throughput: u64 = @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s);
+
+    std.mem.doNotOptimizeAway(sum);
+
+    return Result{
+        .hash = sum,
+        .throughput = throughput,
+    };
+}
+
+pub fn benchmarkHashSmallApi(comptime H: anytype, key_size: usize, bytes: usize, allocator: std.mem.Allocator) !Result {
+    var blocks = try allocator.alloc(u8, bytes);
+    defer allocator.free(blocks);
+    random.bytes(blocks);
+
+    const key_count = bytes / key_size;
+
+    var timer = try Timer.start();
+
+    var sum: u64 = 0;
+    for (0..key_count) |i| {
+        const small_key = blocks[i * key_size ..][0..key_size];
+        const final: u64 = blk: {
+            if (H.init_u8s) |init| {
+                if (H.has_crypto_api) {
+                    break :blk @truncate(H.ty.toInt(small_key, init[0..H.ty.key_length]));
+                } else {
+                    break :blk H.ty.hashSmall(init, small_key);
+                }
+            }
+            if (H.init_u64) |init| {
+                break :blk H.ty.hashSmall(init, small_key);
+            }
+            break :blk H.ty.hashSmall(small_key);
+        };
+        sum +%= final;
+    }
+    const elapsed_ns = timer.read();
+
+    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
+    const throughput: u64 = @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s);
+
+    std.mem.doNotOptimizeAway(sum);
+
+    return Result{
+        .throughput = throughput,
+        .hash = sum,
+    };
+}
+
 fn usage() void {
     std.debug.print(
         \\throughput_test [options]
@@ -205,9 +345,12 @@ pub fn main() !void {
 
     var filter: ?[]u8 = "";
     var count: usize = mode(128 * MiB);
-    var key_size: usize = 32;
+    var key_size: ?usize = null;
     var seed: u32 = 0;
     var test_iterative_only = false;
+    var test_arrays = false;
+
+    const default_small_key_size = 32;
 
     var i: usize = 1;
     while (i < args.len) : (i += 1) {
@@ -248,12 +391,14 @@ pub fn main() !void {
             }
 
             key_size = try std.fmt.parseUnsigned(usize, args[i], 10);
-            if (key_size > block_size) {
+            if (key_size.? > block_size) {
                 try stdout.print("key_size cannot exceed block size of {}\n", .{block_size});
                 std.os.exit(1);
             }
         } else if (std.mem.eql(u8, args[i], "--iterative-only")) {
             test_iterative_only = true;
+        } else if (std.mem.eql(u8, args[i], "--include-array")) {
+            test_arrays = true;
         } else if (std.mem.eql(u8, args[i], "--help")) {
             usage();
             return;
@@ -268,7 +413,7 @@ pub fn main() !void {
     const allocator = gpa.allocator();
 
     inline for (hashes) |H| {
-        if (filter == null or std.mem.indexOf(u8, H.name, filter.?) != null) {
+        if (filter == null or std.mem.indexOf(u8, H.name, filter.?) != null) hash: {
             if (!test_iterative_only or H.has_iterative_api) {
                 try stdout.print("{s}\n", .{H.name});
 
@@ -281,9 +426,69 @@ pub fn main() !void {
                 }
 
                 if (!test_iterative_only) {
-                    prng.seed(seed);
-                    const result_small = try benchmarkHashSmallKeys(H, key_size, count, allocator);
-                    try stdout.print("  small keys: {:5} MiB/s [{x:0<16}]\n", .{ result_small.throughput / (1 * MiB), result_small.hash });
+                    if (key_size) |size| {
+                        prng.seed(seed);
+                        const result_small = try benchmarkHashSmallKeys(H, size, count, allocator);
+                        try stdout.print("  small keys: {:3}B {:5} MiB/s {} Hashes/s [{x:0<16}]\n", .{
+                            size,
+                            result_small.throughput / (1 * MiB),
+                            result_small.throughput / size,
+                            result_small.hash,
+                        });
+
+                        if (!test_arrays) break :hash;
+                        if (H.has_anytype_api) |sizes| {
+                            inline for (sizes) |exact_size| {
+                                if (size == exact_size) {
+                                    prng.seed(seed);
+                                    const result_array = try benchmarkHashSmallKeysArray(H, exact_size, count, allocator);
+                                    prng.seed(seed);
+                                    const result_ptr = try benchmarkHashSmallKeysArrayPtr(H, exact_size, count, allocator);
+                                    try stdout.print("       array: {:5} MiB/s [{x:0<16}]\n", .{
+                                        result_array.throughput / (1 * MiB),
+                                        result_array.hash,
+                                    });
+                                    try stdout.print("   array ptr: {:5} MiB/s [{x:0<16}]\n", .{
+                                        result_ptr.throughput / (1 * MiB),
+                                        result_ptr.hash,
+                                    });
+                                }
+                            }
+                        }
+                    } else {
+                        prng.seed(seed);
+                        const result_small = try benchmarkHashSmallKeys(H, default_small_key_size, count, allocator);
+                        try stdout.print("  small keys: {:3}B {:5} MiB/s {} Hashes/s [{x:0<16}]\n", .{
+                            default_small_key_size,
+                            result_small.throughput / (1 * MiB),
+                            result_small.throughput / default_small_key_size,
+                            result_small.hash,
+                        });
+
+                        if (!test_arrays) break :hash;
+                        if (H.has_anytype_api) |sizes| {
+                            try stdout.print("       array:\n", .{});
+                            inline for (sizes) |exact_size| {
+                                prng.seed(seed);
+                                const result = try benchmarkHashSmallKeysArray(H, exact_size, count, allocator);
+                                try stdout.print("       {d: >3}B {:5} MiB/s [{x:0<16}]\n", .{
+                                    exact_size,
+                                    result.throughput / (1 * MiB),
+                                    result.hash,
+                                });
+                            }
+                            try stdout.print("   array ptr: \n", .{});
+                            inline for (sizes) |exact_size| {
+                                prng.seed(seed);
+                                const result = try benchmarkHashSmallKeysArrayPtr(H, exact_size, count, allocator);
+                                try stdout.print("       {d: >3}B {:5} MiB/s [{x:0<16}]\n", .{
+                                    exact_size,
+                                    result.throughput / (1 * MiB),
+                                    result.hash,
+                                });
+                            }
+                        }
+                    }
                 }
             }
         }
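
The comments above benchmarkHashSmallKeysArray and benchmarkHashSmallKeysArrayPtr advise pinning inlining when results look suspicious. Below is a minimal sketch of what that could look like, assuming Zig's @call builtin with the .never_inline modifier; the helper name and loop shape are illustrative, not part of the commit:

    const std = @import("std");

    fn sumHashesNeverInline(seed: u64, blocks: []const u8, comptime key_size: usize) u64 {
        var sum: u64 = 0;
        var i: usize = 0;
        while ((i + 1) * key_size <= blocks.len) : (i += 1) {
            // blocks[i * key_size ..][0..key_size] is a pointer to a
            // comptime-sized array, matching the array-ptr benchmark above.
            const key = blocks[i * key_size ..][0..key_size];
            // .never_inline keeps the hash as an out-of-line call so any
            // measured difference is not just the optimiser inlining the
            // benchmark differently; .always_inline is the opposite pin.
            sum +%= @call(.never_inline, std.hash.XxHash64.hash, .{ seed, key });
        }
        return sum;
    }

    test "never_inline hashing sketch" {
        const data = [_]u8{0} ** 64;
        _ = sumHashesNeverInline(0, &data, 16);
    }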
lib/std/hash/xxhash.zig
@@ -5,11 +5,7 @@ const expectEqual = std.testing.expectEqual;
 const rotl = std.math.rotl;
 
 pub const XxHash64 = struct {
-    acc1: u64,
-    acc2: u64,
-    acc3: u64,
-    acc4: u64,
-
+    accumulator: Accumulator,
     seed: u64,
     buf: [32]u8,
     buf_len: usize,
@@ -21,20 +17,174 @@ pub const XxHash64 = struct {
     const prime_4 = 0x85EBCA77C2B2AE63; // 0b1000010111101011110010100111011111000010101100101010111001100011
     const prime_5 = 0x27D4EB2F165667C5; // 0b0010011111010100111010110010111100010110010101100110011111000101
 
+    const Accumulator = struct {
+        acc1: u64,
+        acc2: u64,
+        acc3: u64,
+        acc4: u64,
+
+        fn init(seed: u64) Accumulator {
+            return .{
+                .acc1 = seed +% prime_1 +% prime_2,
+                .acc2 = seed +% prime_2,
+                .acc3 = seed,
+                .acc4 = seed -% prime_1,
+            };
+        }
+
+        fn updateEmpty(self: *Accumulator, input: anytype, comptime unroll_count: usize) usize {
+            var i: usize = 0;
+
+            if (unroll_count > 0) {
+                const unrolled_bytes = unroll_count * 32;
+                while (i + unrolled_bytes <= input.len) : (i += unrolled_bytes) {
+                    inline for (0..unroll_count) |j| {
+                        self.processStripe(input[i + j * 32 ..][0..32]);
+                    }
+                }
+            }
+
+            while (i + 32 <= input.len) : (i += 32) {
+                self.processStripe(input[i..][0..32]);
+            }
+
+            return i;
+        }
+
+        fn processStripe(self: *Accumulator, buf: *const [32]u8) void {
+            self.acc1 = round(self.acc1, mem.readIntLittle(u64, buf[0..8]));
+            self.acc2 = round(self.acc2, mem.readIntLittle(u64, buf[8..16]));
+            self.acc3 = round(self.acc3, mem.readIntLittle(u64, buf[16..24]));
+            self.acc4 = round(self.acc4, mem.readIntLittle(u64, buf[24..32]));
+        }
+
+        fn merge(self: Accumulator) u64 {
+            var acc = rotl(u64, self.acc1, 1) +% rotl(u64, self.acc2, 7) +%
+                rotl(u64, self.acc3, 12) +% rotl(u64, self.acc4, 18);
+            acc = mergeAccumulator(acc, self.acc1);
+            acc = mergeAccumulator(acc, self.acc2);
+            acc = mergeAccumulator(acc, self.acc3);
+            acc = mergeAccumulator(acc, self.acc4);
+            return acc;
+        }
+
+        fn mergeAccumulator(acc: u64, other: u64) u64 {
+            const a = acc ^ round(0, other);
+            const b = a *% prime_1;
+            return b +% prime_4;
+        }
+    };
+
+    fn finalize(
+        unfinished: u64,
+        byte_count: usize,
+        partial: anytype,
+    ) u64 {
+        std.debug.assert(partial.len < 32);
+        var acc = unfinished +% @as(u64, byte_count) +% @as(u64, partial.len);
+
+        switch (partial.len) {
+            inline 0, 1, 2, 3 => |count| {
+                inline for (0..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 4, 5, 6, 7 => |count| {
+                acc = finalize4(acc, partial[0..4]);
+                inline for (4..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 8, 9, 10, 11 => |count| {
+                acc = finalize8(acc, partial[0..8]);
+                inline for (8..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 12, 13, 14, 15 => |count| {
+                acc = finalize8(acc, partial[0..8]);
+                acc = finalize4(acc, partial[8..12]);
+                inline for (12..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 16, 17, 18, 19 => |count| {
+                acc = finalize8(acc, partial[0..8]);
+                acc = finalize8(acc, partial[8..16]);
+                inline for (16..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 20, 21, 22, 23 => |count| {
+                acc = finalize8(acc, partial[0..8]);
+                acc = finalize8(acc, partial[8..16]);
+                acc = finalize4(acc, partial[16..20]);
+                inline for (20..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 24, 25, 26, 27 => |count| {
+                acc = finalize8(acc, partial[0..8]);
+                acc = finalize8(acc, partial[8..16]);
+                acc = finalize8(acc, partial[16..24]);
+                inline for (24..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 28, 29, 30, 31 => |count| {
+                acc = finalize8(acc, partial[0..8]);
+                acc = finalize8(acc, partial[8..16]);
+                acc = finalize8(acc, partial[16..24]);
+                acc = finalize4(acc, partial[24..28]);
+                inline for (28..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            else => unreachable,
+        }
+    }
+
+    fn finalize8(v: u64, bytes: *const [8]u8) u64 {
+        var acc = v;
+        const lane = mem.readIntLittle(u64, bytes);
+        acc ^= round(0, lane);
+        acc = rotl(u64, acc, 27) *% prime_1;
+        acc +%= prime_4;
+        return acc;
+    }
+
+    fn finalize4(v: u64, bytes: *const [4]u8) u64 {
+        var acc = v;
+        const lane = @as(u64, mem.readIntLittle(u32, bytes));
+        acc ^= lane *% prime_1;
+        acc = rotl(u64, acc, 23) *% prime_2;
+        acc +%= prime_3;
+        return acc;
+    }
+
+    fn finalize1(v: u64, byte: u8) u64 {
+        var acc = v;
+        const lane = @as(u64, byte);
+        acc ^= lane *% prime_5;
+        acc = rotl(u64, acc, 11) *% prime_1;
+        return acc;
+    }
+
+    fn avalanche(value: u64) u64 {
+        var result = value ^ (value >> 33);
+        result *%= prime_2;
+        result ^= result >> 29;
+        result *%= prime_3;
+        result ^= result >> 32;
+
+        return result;
+    }
+
     pub fn init(seed: u64) XxHash64 {
         return XxHash64{
+            .accumulator = Accumulator.init(seed),
             .seed = seed,
-            .acc1 = seed +% prime_1 +% prime_2,
-            .acc2 = seed +% prime_2,
-            .acc3 = seed,
-            .acc4 = seed -% prime_1,
             .buf = undefined,
             .buf_len = 0,
             .byte_count = 0,
         };
     }
 
-    pub fn update(self: *XxHash64, input: []const u8) void {
+    pub fn update(self: *XxHash64, input: anytype) void {
+        validateType(@TypeOf(input));
+
         if (input.len < 32 - self.buf_len) {
             @memcpy(self.buf[self.buf_len..][0..input.len], input);
             self.buf_len += input.len;
@@ -46,99 +196,54 @@ pub const XxHash64 = struct {
         if (self.buf_len > 0) {
             i = 32 - self.buf_len;
             @memcpy(self.buf[self.buf_len..][0..i], input[0..i]);
-            self.processStripe(&self.buf);
-            self.buf_len = 0;
+            self.accumulator.processStripe(&self.buf);
+            self.byte_count += self.buf_len;
         }
 
-        while (i + 32 <= input.len) : (i += 32) {
-            self.processStripe(input[i..][0..32]);
-        }
+        i += self.accumulator.updateEmpty(input[i..], 32);
+        self.byte_count += i;
 
         const remaining_bytes = input[i..];
         @memcpy(self.buf[0..remaining_bytes.len], remaining_bytes);
         self.buf_len = remaining_bytes.len;
     }
 
-    inline fn processStripe(self: *XxHash64, buf: *const [32]u8) void {
-        self.acc1 = round(self.acc1, mem.readIntLittle(u64, buf[0..8]));
-        self.acc2 = round(self.acc2, mem.readIntLittle(u64, buf[8..16]));
-        self.acc3 = round(self.acc3, mem.readIntLittle(u64, buf[16..24]));
-        self.acc4 = round(self.acc4, mem.readIntLittle(u64, buf[24..32]));
-        self.byte_count += 32;
-    }
-
-    inline fn round(acc: u64, lane: u64) u64 {
+    fn round(acc: u64, lane: u64) u64 {
         const a = acc +% (lane *% prime_2);
         const b = rotl(u64, a, 31);
         return b *% prime_1;
     }
 
     pub fn final(self: *XxHash64) u64 {
-        var acc: u64 = undefined;
-
-        if (self.byte_count < 32) {
-            acc = self.seed +% prime_5;
-        } else {
-            acc = rotl(u64, self.acc1, 1) +% rotl(u64, self.acc2, 7) +%
-                rotl(u64, self.acc3, 12) +% rotl(u64, self.acc4, 18);
-            acc = mergeAccumulator(acc, self.acc1);
-            acc = mergeAccumulator(acc, self.acc2);
-            acc = mergeAccumulator(acc, self.acc3);
-            acc = mergeAccumulator(acc, self.acc4);
-        }
+        const unfinished = if (self.byte_count < 32)
+            self.seed +% prime_5
+        else
+            self.accumulator.merge();
 
-        acc = acc +% @as(u64, self.byte_count) +% @as(u64, self.buf_len);
+        return finalize(unfinished, self.byte_count, self.buf[0..self.buf_len]);
+    }
 
-        var pos: usize = 0;
-        while (pos + 8 <= self.buf_len) : (pos += 8) {
-            const lane = mem.readIntLittle(u64, self.buf[pos..][0..8]);
-            acc ^= round(0, lane);
-            acc = rotl(u64, acc, 27) *% prime_1;
-            acc +%= prime_4;
-        }
+    const Size = enum {
+        small,
+        large,
+        unknown,
+    };
 
-        if (pos + 4 <= self.buf_len) {
-            const lane = @as(u64, mem.readIntLittle(u32, self.buf[pos..][0..4]));
-            acc ^= lane *% prime_1;
-            acc = rotl(u64, acc, 23) *% prime_2;
-            acc +%= prime_3;
-            pos += 4;
-        }
+    pub fn hash(seed: u64, input: anytype) u64 {
+        validateType(@TypeOf(input));
 
-        while (pos < self.buf_len) : (pos += 1) {
-            const lane = @as(u64, self.buf[pos]);
-            acc ^= lane *% prime_5;
-            acc = rotl(u64, acc, 11) *% prime_1;
+        if (input.len < 32) {
+            return finalize(seed +% prime_5, 0, input);
+        } else {
+            var hasher = Accumulator.init(seed);
+            const i = hasher.updateEmpty(input, 0);
+            return finalize(hasher.merge(), i, input[i..]);
         }
-
-        acc ^= acc >> 33;
-        acc *%= prime_2;
-        acc ^= acc >> 29;
-        acc *%= prime_3;
-        acc ^= acc >> 32;
-
-        return acc;
-    }
-
-    inline fn mergeAccumulator(acc: u64, other: u64) u64 {
-        const a = acc ^ round(0, other);
-        const b = a *% prime_1;
-        return b +% prime_4;
-    }
-
-    pub fn hash(seed: u64, input: []const u8) u64 {
-        var hasher = XxHash64.init(seed);
-        hasher.update(input);
-        return hasher.final();
     }
 };
 
 pub const XxHash32 = struct {
-    acc1: u32,
-    acc2: u32,
-    acc3: u32,
-    acc4: u32,
-
+    accumulator: Accumulator,
     seed: u32,
     buf: [16]u8,
     buf_len: usize,
@@ -150,13 +255,57 @@ pub const XxHash32 = struct {
     const prime_4 = 0x27D4EB2F; // 0b00100111110101001110101100101111
     const prime_5 = 0x165667B1; // 0b00010110010101100110011110110001
 
+    const Accumulator = struct {
+        acc1: u32,
+        acc2: u32,
+        acc3: u32,
+        acc4: u32,
+
+        fn init(seed: u32) Accumulator {
+            return .{
+                .acc1 = seed +% prime_1 +% prime_2,
+                .acc2 = seed +% prime_2,
+                .acc3 = seed,
+                .acc4 = seed -% prime_1,
+            };
+        }
+
+        fn updateEmpty(self: *Accumulator, input: anytype, comptime unroll_count: usize) usize {
+            var i: usize = 0;
+
+            if (unroll_count > 0) {
+                const unrolled_bytes = unroll_count * 16;
+                while (i + unrolled_bytes <= input.len) : (i += unrolled_bytes) {
+                    inline for (0..unroll_count) |j| {
+                        self.processStripe(input[i + j * 16 ..][0..16]);
+                    }
+                }
+            }
+
+            while (i + 16 <= input.len) : (i += 16) {
+                self.processStripe(input[i..][0..16]);
+            }
+
+            return i;
+        }
+
+        fn processStripe(self: *Accumulator, buf: *const [16]u8) void {
+            self.acc1 = round(self.acc1, mem.readIntLittle(u32, buf[0..4]));
+            self.acc2 = round(self.acc2, mem.readIntLittle(u32, buf[4..8]));
+            self.acc3 = round(self.acc3, mem.readIntLittle(u32, buf[8..12]));
+            self.acc4 = round(self.acc4, mem.readIntLittle(u32, buf[12..16]));
+        }
+
+        fn merge(self: Accumulator) u32 {
+            return rotl(u32, self.acc1, 1) +% rotl(u32, self.acc2, 7) +%
+                rotl(u32, self.acc3, 12) +% rotl(u32, self.acc4, 18);
+        }
+    };
+
     pub fn init(seed: u32) XxHash32 {
         return XxHash32{
+            .accumulator = Accumulator.init(seed),
             .seed = seed,
-            .acc1 = seed +% prime_1 +% prime_2,
-            .acc2 = seed +% prime_2,
-            .acc3 = seed,
-            .acc4 = seed -% prime_1,
             .buf = undefined,
             .buf_len = 0,
             .byte_count = 0,
@@ -164,6 +313,8 @@ pub const XxHash32 = struct {
     }
 
     pub fn update(self: *XxHash32, input: []const u8) void {
+        validateType(@TypeOf(input));
+
         if (input.len < 16 - self.buf_len) {
             @memcpy(self.buf[self.buf_len..][0..input.len], input);
             self.buf_len += input.len;
@@ -175,59 +326,85 @@ pub const XxHash32 = struct {
         if (self.buf_len > 0) {
             i = 16 - self.buf_len;
             @memcpy(self.buf[self.buf_len..][0..i], input[0..i]);
-            self.processStripe(&self.buf);
+            self.accumulator.processStripe(&self.buf);
+            self.byte_count += self.buf_len;
             self.buf_len = 0;
         }
 
-        while (i + 16 <= input.len) : (i += 16) {
-            self.processStripe(input[i..][0..16]);
-        }
+        i += self.accumulator.updateEmpty(input[i..], 16);
+        self.byte_count += i;
 
         const remaining_bytes = input[i..];
         @memcpy(self.buf[0..remaining_bytes.len], remaining_bytes);
         self.buf_len = remaining_bytes.len;
     }
 
-    inline fn processStripe(self: *XxHash32, buf: *const [16]u8) void {
-        self.acc1 = round(self.acc1, mem.readIntLittle(u32, buf[0..4]));
-        self.acc2 = round(self.acc2, mem.readIntLittle(u32, buf[4..8]));
-        self.acc3 = round(self.acc3, mem.readIntLittle(u32, buf[8..12]));
-        self.acc4 = round(self.acc4, mem.readIntLittle(u32, buf[12..16]));
-        self.byte_count += 16;
-    }
-
-    inline fn round(acc: u32, lane: u32) u32 {
+    fn round(acc: u32, lane: u32) u32 {
         const a = acc +% (lane *% prime_2);
         const b = rotl(u32, a, 13);
         return b *% prime_1;
     }
 
     pub fn final(self: *XxHash32) u32 {
-        var acc: u32 = undefined;
+        const unfinished = if (self.byte_count < 16)
+            self.seed +% prime_5
+        else
+            self.accumulator.merge();
 
-        if (self.byte_count < 16) {
-            acc = self.seed +% prime_5;
-        } else {
-            acc = rotl(u32, self.acc1, 1) +% rotl(u32, self.acc2, 7) +%
-                rotl(u32, self.acc3, 12) +% rotl(u32, self.acc4, 18);
+        return finalize(unfinished, self.byte_count, self.buf[0..self.buf_len]);
+    }
+
+    fn finalize(unfinished: u32, byte_count: usize, partial: anytype) u32 {
+        std.debug.assert(partial.len < 16);
+        var acc = unfinished +% @as(u32, @intCast(byte_count)) +% @as(u32, @intCast(partial.len));
+
+        switch (partial.len) {
+            inline 0, 1, 2, 3 => |count| {
+                inline for (0..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 4, 5, 6, 7 => |count| {
+                acc = finalize4(acc, partial[0..4]);
+                inline for (4..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 8, 9, 10, 11 => |count| {
+                acc = finalize4(acc, partial[0..4]);
+                acc = finalize4(acc, partial[4..8]);
+                inline for (8..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 12, 13, 14, 15 => |count| {
+                acc = finalize4(acc, partial[0..4]);
+                acc = finalize4(acc, partial[4..8]);
+                acc = finalize4(acc, partial[8..12]);
+                inline for (12..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            else => unreachable,
         }
 
-        acc = acc +% @as(u32, @intCast(self.byte_count)) +% @as(u32, @intCast(self.buf_len));
+        return avalanche(acc);
+    }
 
-        var pos: usize = 0;
-        while (pos + 4 <= self.buf_len) : (pos += 4) {
-            const lane = mem.readIntLittle(u32, self.buf[pos..][0..4]);
-            acc +%= lane *% prime_3;
-            acc = rotl(u32, acc, 17) *% prime_4;
-        }
+    fn finalize4(v: u32, bytes: *const [4]u8) u32 {
+        var acc = v;
+        const lane = mem.readIntLittle(u32, bytes);
+        acc +%= lane *% prime_3;
+        acc = rotl(u32, acc, 17) *% prime_4;
+        return acc;
+    }
 
-        while (pos < self.buf_len) : (pos += 1) {
-            const lane = @as(u32, self.buf[pos]);
-            acc +%= lane *% prime_5;
-            acc = rotl(u32, acc, 11) *% prime_1;
-        }
+    fn finalize1(v: u32, byte: u8) u32 {
+        var acc = v;
+        const lane = @as(u32, byte);
+        acc +%= lane *% prime_5;
+        acc = rotl(u32, acc, 11) *% prime_1;
+        return acc;
+    }
 
-        acc ^= acc >> 15;
+    fn avalanche(value: u32) u32 {
+        var acc = value ^ value >> 15;
         acc *%= prime_2;
         acc ^= acc >> 13;
         acc *%= prime_3;
@@ -236,33 +413,58 @@ pub const XxHash32 = struct {
         return acc;
     }
 
-    pub fn hash(seed: u32, input: []const u8) u32 {
-        var hasher = XxHash32.init(seed);
-        hasher.update(input);
-        return hasher.final();
+    pub fn hash(seed: u32, input: anytype) u32 {
+        validateType(@TypeOf(input));
+
+        if (input.len < 16) {
+            return finalize(seed +% prime_5, 0, input);
+        } else {
+            var hasher = Accumulator.init(seed);
+            const i = hasher.updateEmpty(input, 0);
+            return finalize(hasher.merge(), i, input[i..]);
+        }
     }
 };
 
+fn validateType(comptime T: type) void {
+    comptime {
+        if (!((std.meta.trait.isSlice(T) or
+            std.meta.trait.is(.Array)(T) or
+            std.meta.trait.isPtrTo(.Array)(T)) and
+            std.meta.Elem(T) == u8))
+        {
+            @compileError("expect a slice, array or pointer to array of u8, got " ++ @typeName(T));
+        }
+    }
+}
+
+fn testExpect(comptime H: type, seed: anytype, input: []const u8, expected: u64) !void {
+    try expectEqual(expected, H.hash(0, input));
+
+    var hasher = H.init(seed);
+    hasher.update(input);
+    try expectEqual(expected, hasher.final());
+}
+
 test "xxhash64" {
-    const hash = XxHash64.hash;
-
-    try expectEqual(hash(0, ""), 0xef46db3751d8e999);
-    try expectEqual(hash(0, "a"), 0xd24ec4f1a98c6e5b);
-    try expectEqual(hash(0, "abc"), 0x44bc2cf5ad770999);
-    try expectEqual(hash(0, "message digest"), 0x066ed728fceeb3be);
-    try expectEqual(hash(0, "abcdefghijklmnopqrstuvwxyz"), 0xcfe1f278fa89835c);
-    try expectEqual(hash(0, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"), 0xaaa46907d3047814);
-    try expectEqual(hash(0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890"), 0xe04a477f19ee145d);
+    const H = XxHash64;
+    try testExpect(H, 0, "", 0xef46db3751d8e999);
+    try testExpect(H, 0, "a", 0xd24ec4f1a98c6e5b);
+    try testExpect(H, 0, "abc", 0x44bc2cf5ad770999);
+    try testExpect(H, 0, "message digest", 0x066ed728fceeb3be);
+    try testExpect(H, 0, "abcdefghijklmnopqrstuvwxyz", 0xcfe1f278fa89835c);
+    try testExpect(H, 0, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", 0xaaa46907d3047814);
+    try testExpect(H, 0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890", 0xe04a477f19ee145d);
 }
 
 test "xxhash32" {
-    const hash = XxHash32.hash;
-
-    try expectEqual(hash(0, ""), 0x02cc5d05);
-    try expectEqual(hash(0, "a"), 0x550d7456);
-    try expectEqual(hash(0, "abc"), 0x32d153ff);
-    try expectEqual(hash(0, "message digest"), 0x7c948494);
-    try expectEqual(hash(0, "abcdefghijklmnopqrstuvwxyz"), 0x63a14d5f);
-    try expectEqual(hash(0, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"), 0x9c285e64);
-    try expectEqual(hash(0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890"), 0x9c05f475);
+    const H = XxHash32;
+
+    try testExpect(H, 0, "", 0x02cc5d05);
+    try testExpect(H, 0, "a", 0x550d7456);
+    try testExpect(H, 0, "abc", 0x32d153ff);
+    try testExpect(H, 0, "message digest", 0x7c948494);
+    try testExpect(H, 0, "abcdefghijklmnopqrstuvwxyz", 0x63a14d5f);
+    try testExpect(H, 0, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", 0x9c285e64);
+    try testExpect(H, 0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890", 0x9c05f475);
 }