const std = @import("std");
const builtin = @import("builtin");
const crypto = std.crypto;
const Allocator = std.mem.Allocator;
const Io = std.Io;
const Thread = std.Thread;

const TurboSHAKE128State = crypto.hash.sha3.TurboShake128(0x06);
const TurboSHAKE256State = crypto.hash.sha3.TurboShake256(0x06);

const chunk_size: usize = 8192; // Chunk size for tree hashing (8 KiB)
const cache_line_size = std.atomic.cache_line;

// Optimal SIMD vector length for u64 on this target platform
const optimal_vector_len = std.simd.suggestVectorLength(u64) orelse 1;

// Number of bytes processed per SIMD batch in multi-threaded mode
const bytes_per_batch = 256 * 1024;

// Multi-threading threshold: inputs of at least this size use parallel processing.
// Benchmarked optimal value for ReleaseFast mode.
const large_file_threshold: usize = 2 * 1024 * 1024; // 2 MiB

// Round constants for Keccak-p[1600,12]
const RC = [12]u64{
    0x000000008000808B,
    0x800000000000008B,
    0x8000000000008089,
    0x8000000000008003,
    0x8000000000008002,
    0x8000000000000080,
    0x000000000000800A,
    0x800000008000000A,
    0x8000000080008081,
    0x8000000000008080,
    0x0000000080000001,
    0x8000000080008008,
};

/// Generic KangarooTwelve variant builder.
/// Creates a variant type with specific cryptographic parameters.
fn KangarooVariant(
    comptime security_level_bits: comptime_int,
    comptime rate_bytes: usize,
    comptime cv_size_bytes: usize,
    comptime StateTypeParam: type,
    comptime sep_x: usize,
    comptime sep_y: usize,
    comptime pad_x: usize,
    comptime pad_y: usize,
    comptime toBufferFn: fn (*const MultiSliceView, u8, []u8) void,
    comptime allocFn: fn (Allocator, *const MultiSliceView, u8, usize) anyerror![]u8,
) type {
    return struct {
        const security_level = security_level_bits;
        const rate = rate_bytes;
        const rate_in_lanes = rate_bytes / 8;
        const cv_size = cv_size_bytes;
        const StateType = StateTypeParam;
        const separation_byte_pos = .{ .x = sep_x, .y = sep_y };
        const padding_pos = .{ .x = pad_x, .y = pad_y };

        inline fn turboShakeToBuffer(view: *const MultiSliceView, separation_byte: u8, output: []u8) void {
            toBufferFn(view, separation_byte, output);
        }

        inline fn turboShakeMultiSliceAlloc(
            allocator: Allocator,
            view: *const MultiSliceView,
            separation_byte: u8,
            output_len: usize,
        ) ![]u8 {
            return allocFn(allocator, view, separation_byte, output_len);
        }
    };
}

/// KangarooTwelve with 128-bit security parameters
const KT128Variant = KangarooVariant(
    128, // Security level in bits
    168, // TurboSHAKE128 rate in bytes
    32, // Chaining value size in bytes
    TurboSHAKE128State,
    1, // separation_byte_pos.x (lane 16: 128 bytes into 168-byte rate, since 8192 = 48 * 168 + 128)
    3, // separation_byte_pos.y
    0, // padding_pos.x (lane 20: last lane of 168-byte rate)
    4, // padding_pos.y
    turboShake128MultiSliceToBuffer,
    turboShake128MultiSlice,
);

/// KangarooTwelve with 256-bit security parameters
const KT256Variant = KangarooVariant(
    256, // Security level in bits
    136, // TurboSHAKE256 rate in bytes
    64, // Chaining value size in bytes
    TurboSHAKE256State,
    4, // separation_byte_pos.x (lane 4: 32 bytes into 136-byte rate)
    0, // separation_byte_pos.y
    1, // padding_pos.x (lane 16: last lane of 136-byte rate)
    3, // padding_pos.y
    turboShake256MultiSliceToBuffer,
    turboShake256MultiSlice,
);

/// Rotate left for u64 vector
inline fn rol64Vec(comptime N: usize, v: @Vector(N, u64), comptime n: u6) @Vector(N, u64) {
    if (n == 0) return v;
    const left: @Vector(N, u64) = @splat(n);
    const right_shift: u64 = 64 - @as(u64, n);
    const right: @Vector(N, u64) = @splat(right_shift);
    return (v << left) | (v >> right);
}
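
// Sanity-check sketch for the vector rotate: rotating by 1 must carry the top
// bit of each lane around to bit 0, independently per lane, which the
// (v << n) | (v >> (64 - n)) form above has to get right.
test "rol64Vec rotates each lane independently" {
    const v: @Vector(2, u64) = .{ 1, 0x8000000000000000 };
    const r = rol64Vec(2, v, 1);
    try std.testing.expectEqual(@as(u64, 2), r[0]);
    try std.testing.expectEqual(@as(u64, 1), r[1]);
}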

/// Load a 64-bit little-endian value
inline fn load64(bytes: []const u8) u64 {
    return std.mem.readInt(u64, bytes[0..8], .little);
}

/// Store a 64-bit little-endian value
inline fn store64(value: u64, bytes: []u8) void {
    std.mem.writeInt(u64, bytes[0..8], value, .little);
}

/// Right-encode result type (max 9 bytes for 64-bit usize)
const RightEncoded = struct {
    bytes: [9]u8,
    len: u8,

    fn slice(self: *const RightEncoded) []const u8 {
        return self.bytes[0..self.len];
    }
};

/// Right-encode: encodes a number as bytes with length suffix (no allocation)
fn rightEncode(x: usize) RightEncoded {
    var result: RightEncoded = undefined;

    if (x == 0) {
        result.bytes[0] = 0;
        result.len = 1;
        return result;
    }

    var temp: [9]u8 = undefined;
    var len: usize = 0;
    var val = x;

    while (val > 0) : (val /= 256) {
        temp[len] = @intCast(val % 256);
        len += 1;
    }

    // Reverse bytes (MSB first)
    for (0..len) |i| {
        result.bytes[i] = temp[len - 1 - i];
    }
    result.bytes[len] = @intCast(len);
    result.len = @intCast(len + 1);

    return result;
}
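
// Worked examples of the encoding above (KangarooTwelve's length_encode):
// the value bytes MSB-first, followed by one byte giving how many value bytes
// there were; zero encodes as the single length byte 0x00.
test "rightEncode known encodings" {
    const zero = rightEncode(0);
    try std.testing.expectEqualSlices(u8, &[_]u8{0x00}, zero.slice());

    const one_byte = rightEncode(0xAB);
    try std.testing.expectEqualSlices(u8, &[_]u8{ 0xAB, 0x01 }, one_byte.slice());

    const two_bytes = rightEncode(256);
    try std.testing.expectEqualSlices(u8, &[_]u8{ 0x01, 0x00, 0x02 }, two_bytes.slice());
}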

/// Virtual contiguous view over multiple slices (zero-copy)
const MultiSliceView = struct {
    slices: [3][]const u8,
    offsets: [4]usize,

    fn init(s1: []const u8, s2: []const u8, s3: []const u8) MultiSliceView {
        return .{
            .slices = .{ s1, s2, s3 },
            .offsets = .{
                0,
                s1.len,
                s1.len + s2.len,
                s1.len + s2.len + s3.len,
            },
        };
    }

    fn totalLen(self: *const MultiSliceView) usize {
        return self.offsets[3];
    }

    /// Get byte at position (zero-copy)
    fn getByte(self: *const MultiSliceView, pos: usize) u8 {
        for (0..3) |i| {
            if (pos >= self.offsets[i] and pos < self.offsets[i + 1]) {
                return self.slices[i][pos - self.offsets[i]];
            }
        }
        unreachable;
    }

    /// Try to get a contiguous slice [start..end); returns null if the range spans slice boundaries
    fn tryGetSlice(self: *const MultiSliceView, start: usize, end: usize) ?[]const u8 {
        for (0..3) |i| {
            if (start >= self.offsets[i] and end <= self.offsets[i + 1]) {
                const local_start = start - self.offsets[i];
                const local_end = end - self.offsets[i];
                return self.slices[i][local_start..local_end];
            }
        }
        return null;
    }

    /// Copy range [start..end) to buffer (used when slice spans boundaries)
    fn copyRange(self: *const MultiSliceView, start: usize, end: usize, buffer: []u8) void {
        var pos: usize = 0;
        for (start..end) |i| {
            buffer[pos] = self.getByte(i);
            pos += 1;
        }
    }
};
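
// Behavioral sketch of the view: ranges that sit inside one backing slice
// come back zero-copy, while ranges that cross a slice boundary return null
// from tryGetSlice and must go through copyRange.
test "MultiSliceView handles slice boundaries" {
    const view = MultiSliceView.init("ab", "cd", "e");
    try std.testing.expectEqual(@as(usize, 5), view.totalLen());
    try std.testing.expectEqual(@as(u8, 'c'), view.getByte(2));

    // Fully inside the second slice: returned directly.
    try std.testing.expectEqualSlices(u8, "cd", view.tryGetSlice(2, 4).?);

    // Spans the first/second boundary: not representable as one slice.
    try std.testing.expect(view.tryGetSlice(1, 3) == null);
    var buf: [2]u8 = undefined;
    view.copyRange(1, 3, &buf);
    try std.testing.expectEqualSlices(u8, "bc", &buf);
}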

/// Apply Keccak-p[1600,12] to N states using SIMD
fn keccakP1600timesN(comptime N: usize, states: *[5][5]@Vector(N, u64)) void {
    @setEvalBranchQuota(10000);

    // Pre-computed rotation offsets for rho-pi step
    const rho_offsets = comptime blk: {
        var offsets: [24]u6 = undefined;
        var px: usize = 1;
        var py: usize = 0;
        for (0..24) |t| {
            const rot_amount = ((t + 1) * (t + 2) / 2) % 64;
            offsets[t] = @intCast(rot_amount);
            const temp_x = py;
            py = (2 * px + 3 * py) % 5;
            px = temp_x;
        }
        break :blk offsets;
    };

    var round: usize = 0;
    while (round < 12) : (round += 2) {
        inline for (0..2) |i| {
            // θ (theta)
            var C: [5]@Vector(N, u64) = undefined;
            inline for (0..5) |x| {
                C[x] = states[x][0] ^ states[x][1] ^ states[x][2] ^ states[x][3] ^ states[x][4];
            }

            var D: [5]@Vector(N, u64) = undefined;
            inline for (0..5) |x| {
                D[x] = C[(x + 4) % 5] ^ rol64Vec(N, C[(x + 1) % 5], 1);
            }

            // Apply D to all lanes
            inline for (0..5) |x| {
                states[x][0] ^= D[x];
                states[x][1] ^= D[x];
                states[x][2] ^= D[x];
                states[x][3] ^= D[x];
                states[x][4] ^= D[x];
            }

            // ρ (rho) and π (pi) - optimized with pre-computed offsets
            var current = states[1][0];
            var px: usize = 1;
            var py: usize = 0;
            inline for (rho_offsets) |rot| {
                const next_y = (2 * px + 3 * py) % 5;
                const next = states[py][next_y];
                states[py][next_y] = rol64Vec(N, current, rot);
                current = next;
                px = py;
                py = next_y;
            }

            // χ (chi) - optimized with better register usage
            inline for (0..5) |y| {
                const t0 = states[0][y];
                const t1 = states[1][y];
                const t2 = states[2][y];
                const t3 = states[3][y];
                const t4 = states[4][y];

                states[0][y] = t0 ^ (~t1 & t2);
                states[1][y] = t1 ^ (~t2 & t3);
                states[2][y] = t2 ^ (~t3 & t4);
                states[3][y] = t3 ^ (~t4 & t0);
                states[4][y] = t4 ^ (~t0 & t1);
            }

            // ι (iota)
            const rc_splat: @Vector(N, u64) = @splat(RC[round + i]);
            states[0][0] ^= rc_splat;
        }
    }
}
/// XOR lanes from `data` into each of the N states using SIMD.
/// `lane_offset` is the stride, in lanes, between consecutive states' input
/// (chunk_size / 8 when each state absorbs its own 8 KiB leaf).
fn addLanesAll(
    comptime N: usize,
    states: *[5][5]@Vector(N, u64),
    data: []const u8,
    lane_count: usize,
    lane_offset: usize,
) void {
    // Process lanes (at most 25 lanes in Keccak state)
    inline for (0..25) |xy| {
        if (xy < lane_count) {
            const x = xy % 5;
            const y = xy / 5;

            var loaded_data: @Vector(N, u64) = undefined;
            inline for (0..N) |i| {
                loaded_data[i] = load64(data[8 * (i * lane_offset + xy) ..]);
            }
            states[x][y] ^= loaded_data;
        }
    }
}

/// Apply Keccak-p[1600,12] to a single state (byte representation)
fn keccakP(state: *[200]u8) void {
    @setEvalBranchQuota(10000);
    var lanes: [5][5]u64 = undefined;

    // Load state into lanes
    inline for (0..5) |x| {
        inline for (0..5) |y| {
            lanes[x][y] = load64(state[8 * (x + 5 * y) ..]);
        }
    }

    // Apply 12 rounds
    var round: usize = 0;
    while (round < 12) : (round += 2) {
        inline for (0..2) |i| {
            // θ
            var C: [5]u64 = undefined;
            inline for (0..5) |x| {
                C[x] = lanes[x][0] ^ lanes[x][1] ^ lanes[x][2] ^ lanes[x][3] ^ lanes[x][4];
            }
            var D: [5]u64 = undefined;
            inline for (0..5) |x| {
                D[x] = C[(x + 4) % 5] ^ std.math.rotl(u64, C[(x + 1) % 5], 1);
            }
            inline for (0..5) |x| {
                inline for (0..5) |y| {
                    lanes[x][y] ^= D[x];
                }
            }

            // ρ and π
            var current = lanes[1][0];
            var px: usize = 1;
            var py: usize = 0;
            inline for (0..24) |t| {
                const temp = lanes[py][(2 * px + 3 * py) % 5];
                const rot_amount = ((t + 1) * (t + 2) / 2) % 64;
                lanes[py][(2 * px + 3 * py) % 5] = std.math.rotl(u64, current, @as(u6, @intCast(rot_amount)));
                current = temp;
                const temp_x = py;
                py = (2 * px + 3 * py) % 5;
                px = temp_x;
            }

            // χ
            inline for (0..5) |y| {
                const T = [5]u64{ lanes[0][y], lanes[1][y], lanes[2][y], lanes[3][y], lanes[4][y] };
                inline for (0..5) |x| {
                    lanes[x][y] = T[x] ^ (~T[(x + 1) % 5] & T[(x + 2) % 5]);
                }
            }

            // ι
            lanes[0][0] ^= RC[round + i];
        }
    }

    // Store lanes back to state
    inline for (0..5) |x| {
        inline for (0..5) |y| {
            store64(lanes[x][y], state[8 * (x + 5 * y) ..]);
        }
    }
}

/// Apply Keccak-p[1600,12] to a single state (u64 lane representation)
fn keccakPLanes(lanes: *[25]u64) void {
    @setEvalBranchQuota(10000);

    // Apply 12 rounds
    inline for (RC) |rc| {
        // θ
        var C: [5]u64 = undefined;
        inline for (0..5) |x| {
            C[x] = lanes[x] ^ lanes[x + 5] ^ lanes[x + 10] ^ lanes[x + 15] ^ lanes[x + 20];
        }
        var D: [5]u64 = undefined;
        inline for (0..5) |x| {
            D[x] = C[(x + 4) % 5] ^ std.math.rotl(u64, C[(x + 1) % 5], 1);
        }
        inline for (0..5) |x| {
            inline for (0..5) |y| {
                lanes[x + 5 * y] ^= D[x];
            }
        }

        // ρ and π
        var current = lanes[1];
        var px: usize = 1;
        var py: usize = 0;
        inline for (0..24) |t| {
            const next_y = (2 * px + 3 * py) % 5;
            const next_idx = py + 5 * next_y;
            const temp = lanes[next_idx];
            const rot_amount = ((t + 1) * (t + 2) / 2) % 64;
            lanes[next_idx] = std.math.rotl(u64, current, @as(u6, @intCast(rot_amount)));
            current = temp;
            px = py;
            py = next_y;
        }

        // χ
        inline for (0..5) |y| {
            const idx = 5 * y;
            const T = [5]u64{ lanes[idx], lanes[idx + 1], lanes[idx + 2], lanes[idx + 3], lanes[idx + 4] };
            inline for (0..5) |x| {
                lanes[idx + x] = T[x] ^ (~T[(x + 1) % 5] & T[(x + 2) % 5]);
            }
        }

        // ι
        lanes[0] ^= rc;
    }
}
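
// Internal consistency sketch: the byte-oriented and lane-oriented
// implementations of Keccak-p[1600,12] above must agree on every state.
// This checks them against each other only, not against external vectors.
test "keccakP and keccakPLanes agree" {
    var state_bytes: [200]u8 = undefined;
    var prng = std.Random.DefaultPrng.init(0);
    prng.random().bytes(&state_bytes);

    // Flat lane i covers state bytes [8*i .. 8*i+8), little-endian,
    // matching the x + 5*y indexing used by both implementations.
    var lanes: [25]u64 = undefined;
    for (&lanes, 0..) |*lane, i| {
        lane.* = load64(state_bytes[8 * i ..]);
    }

    keccakP(&state_bytes);
    keccakPLanes(&lanes);

    for (lanes, 0..) |lane, i| {
        try std.testing.expectEqual(lane, load64(state_bytes[8 * i ..]));
    }
}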

/// Generic non-allocating TurboSHAKE: write output to provided buffer
fn turboShakeMultiSliceToBuffer(
    comptime rate: usize,
    view: *const MultiSliceView,
    separation_byte: u8,
    output: []u8,
) void {
    var state: [200]u8 = @splat(0);
    var state_pos: usize = 0;

    // Absorb all bytes from the multi-slice view
    const total = view.totalLen();
    var pos: usize = 0;
    while (pos < total) {
        state[state_pos] ^= view.getByte(pos);
        state_pos += 1;
        pos += 1;

        if (state_pos == rate) {
            keccakP(&state);
            state_pos = 0;
        }
    }

    // Add separation byte and padding
    state[state_pos] ^= separation_byte;
    state[rate - 1] ^= 0x80;
    keccakP(&state);

    // Squeeze
    var out_offset: usize = 0;
    while (out_offset < output.len) {
        const chunk = @min(rate, output.len - out_offset);
        @memcpy(output[out_offset..][0..chunk], state[0..chunk]);
        out_offset += chunk;
        if (out_offset < output.len) {
            keccakP(&state);
        }
    }
}

/// Generic allocating TurboSHAKE
fn turboShakeMultiSlice(
    comptime rate: usize,
    allocator: Allocator,
    view: *const MultiSliceView,
    separation_byte: u8,
    output_len: usize,
) ![]u8 {
    const output = try allocator.alloc(u8, output_len);
    turboShakeMultiSliceToBuffer(rate, view, separation_byte, output);
    return output;
}

/// Non-allocating TurboSHAKE128: write output to provided buffer
fn turboShake128MultiSliceToBuffer(
    view: *const MultiSliceView,
    separation_byte: u8,
    output: []u8,
) void {
    turboShakeMultiSliceToBuffer(168, view, separation_byte, output);
}

/// Allocating TurboSHAKE128
fn turboShake128MultiSlice(
    allocator: Allocator,
    view: *const MultiSliceView,
    separation_byte: u8,
    output_len: usize,
) ![]u8 {
    return turboShakeMultiSlice(168, allocator, view, separation_byte, output_len);
}

/// Non-allocating TurboSHAKE256: write output to provided buffer
fn turboShake256MultiSliceToBuffer(
    view: *const MultiSliceView,
    separation_byte: u8,
    output: []u8,
) void {
    turboShakeMultiSliceToBuffer(136, view, separation_byte, output);
}

/// Allocating TurboSHAKE256
fn turboShake256MultiSlice(
    allocator: Allocator,
    view: *const MultiSliceView,
    separation_byte: u8,
    output_len: usize,
) ![]u8 {
    return turboShakeMultiSlice(136, allocator, view, separation_byte, output_len);
}
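
// Consistency sketch (an assumption worth checking in CI): the byte-at-a-time
// TurboSHAKE implementation above is expected to match std.crypto's streaming
// TurboSHAKE128 when given the same separation byte (0x06 is baked into
// TurboSHAKE128State at the top of this file). The tree mode below mixes the
// two implementations, so they must produce identical output.
test "turboShake128MultiSliceToBuffer matches std TurboShake128" {
    const message = "a multi-lane test input for the TurboSHAKE comparison";
    const view = MultiSliceView.init(message, &[_]u8{}, &[_]u8{});

    var expected: [32]u8 = undefined;
    var st = TurboSHAKE128State.init(.{});
    st.update(message);
    st.final(&expected);

    var actual: [32]u8 = undefined;
    turboShake128MultiSliceToBuffer(&view, 0x06, &actual);

    try std.testing.expectEqualSlices(u8, &expected, &actual);
}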

/// Process N leaves (8KiB chunks) in parallel - generic version
fn processLeaves(
    comptime Variant: type,
    comptime N: usize,
    data: []const u8,
    result: *[N * Variant.cv_size]u8,
) void {
    const rate_in_lanes: usize = Variant.rate_in_lanes;
    const rate_in_bytes: usize = rate_in_lanes * 8;
    const cv_size: usize = Variant.cv_size;

    // Initialize N all-zero states with cache alignment
    var states: [5][5]@Vector(N, u64) align(cache_line_size) = undefined;
    inline for (0..5) |x| {
        inline for (0..5) |y| {
            states[x][y] = @splat(0);
        }
    }

    // Process complete blocks
    var j: usize = 0;
    while (j + rate_in_bytes <= chunk_size) : (j += rate_in_bytes) {
        addLanesAll(N, &states, data[j..], rate_in_lanes, chunk_size / 8);
        keccakP1600timesN(N, &states);
    }

    // Process last incomplete block
    const remaining_lanes = (chunk_size - j) / 8;
    if (remaining_lanes > 0) {
        addLanesAll(N, &states, data[j..], remaining_lanes, chunk_size / 8);
    }

    // Add suffix 0x0B and padding
    const suffix_pos = Variant.separation_byte_pos;
    const padding_pos = Variant.padding_pos;

    const suffix_splat: @Vector(N, u64) = @splat(0x0B);
    states[suffix_pos.x][suffix_pos.y] ^= suffix_splat;
    const padding_splat: @Vector(N, u64) = @splat(0x8000000000000000);
    states[padding_pos.x][padding_pos.y] ^= padding_splat;

    keccakP1600timesN(N, &states);

    // Extract chaining values from each state
    const lanes_to_extract = cv_size / 8;
    comptime var lane_idx: usize = 0;
    inline while (lane_idx < lanes_to_extract) : (lane_idx += 1) {
        const x = lane_idx % 5;
        const y = lane_idx / 5;
        inline for (0..N) |i| {
            store64(states[x][y][i], result[i * cv_size + lane_idx * 8 ..]);
        }
    }
}
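
// Cross-check sketch: hashing identical leaves through the SIMD path must
// yield the same chaining values as the scalar single-leaf TurboSHAKE path
// (separation byte 0x0B), since the tree-mode code below mixes both paths
// freely depending on how many leaves remain.
test "processLeaves matches the scalar leaf path" {
    var chunk: [chunk_size]u8 = undefined;
    var prng = std.Random.DefaultPrng.init(42);
    prng.random().bytes(&chunk);

    // Two identical leaves back to back.
    var data: [2 * chunk_size]u8 = undefined;
    @memcpy(data[0..chunk_size], &chunk);
    @memcpy(data[chunk_size..], &chunk);

    var simd_cvs: [2 * KT128Variant.cv_size]u8 = undefined;
    processLeaves(KT128Variant, 2, &data, &simd_cvs);

    var scalar_cv: [KT128Variant.cv_size]u8 = undefined;
    const view = MultiSliceView.init(&chunk, &[_]u8{}, &[_]u8{});
    KT128Variant.turboShakeToBuffer(&view, 0x0B, &scalar_cv);

    try std.testing.expectEqualSlices(u8, &scalar_cv, simd_cvs[0..KT128Variant.cv_size]);
    try std.testing.expectEqualSlices(u8, &scalar_cv, simd_cvs[KT128Variant.cv_size..]);
}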

/// Context for processing a batch of leaves in a thread
const LeafBatchContext = struct {
    output_cvs: []align(@alignOf(u64)) u8,
    batch_start: usize,
    batch_count: usize,
    view: *const MultiSliceView,
    scratch_buffer: []u8, // Pre-allocated scratch space (no allocations in worker)
    total_len: usize, // Total length of input data (for boundary checking)
};

/// Helper function to process N leaves in parallel, reducing code duplication
inline fn processNLeaves(
    comptime Variant: type,
    comptime N: usize,
    view: *const MultiSliceView,
    j: usize,
    leaf_buffer: []u8,
    output: []align(@alignOf(u64)) u8,
) void {
    const cv_size = Variant.cv_size;
    comptime std.debug.assert(cv_size % @sizeOf(u64) == 0);

    if (view.tryGetSlice(j, j + N * chunk_size)) |leaf_data| {
        var leaf_cvs: [N * cv_size]u8 = undefined;
        processLeaves(Variant, N, leaf_data, &leaf_cvs);
        @memcpy(output[0..leaf_cvs.len], &leaf_cvs);
    } else {
        view.copyRange(j, j + N * chunk_size, leaf_buffer[0 .. N * chunk_size]);
        var leaf_cvs: [N * cv_size]u8 = undefined;
        processLeaves(Variant, N, leaf_buffer[0 .. N * chunk_size], &leaf_cvs);
        @memcpy(output[0..leaf_cvs.len], &leaf_cvs);
    }
}

/// Process a batch of leaves in a single thread using SIMD
fn processLeafBatch(comptime Variant: type, ctx: LeafBatchContext) void {
    const cv_size = Variant.cv_size;
    const leaf_buffer = ctx.scratch_buffer[0 .. 8 * chunk_size];

    var cvs_offset: usize = 0;
    var j: usize = ctx.batch_start;
    const batch_end = @min(ctx.batch_start + ctx.batch_count * chunk_size, ctx.total_len);

    // Process leaves using SIMD (8x, 4x, 2x) based on optimal vector length
    inline for ([_]usize{ 8, 4, 2 }) |batch_size| {
        while (optimal_vector_len >= batch_size and j + batch_size * chunk_size <= batch_end) {
            processNLeaves(Variant, batch_size, ctx.view, j, leaf_buffer, @alignCast(ctx.output_cvs[cvs_offset..]));
            cvs_offset += batch_size * cv_size;
            j += batch_size * chunk_size;
        }
    }

    // Process remaining single leaves
    while (j < batch_end) {
        const chunk_len = @min(chunk_size, batch_end - j);
        if (ctx.view.tryGetSlice(j, j + chunk_len)) |leaf_data| {
            const cv_slice = MultiSliceView.init(leaf_data, &[_]u8{}, &[_]u8{});
            Variant.turboShakeToBuffer(&cv_slice, 0x0B, ctx.output_cvs[cvs_offset..][0..cv_size]);
        } else {
            ctx.view.copyRange(j, j + chunk_len, leaf_buffer[0..chunk_len]);
            const cv_slice = MultiSliceView.init(leaf_buffer[0..chunk_len], &[_]u8{}, &[_]u8{});
            Variant.turboShakeToBuffer(&cv_slice, 0x0B, ctx.output_cvs[cvs_offset..][0..cv_size]);
        }
        cvs_offset += cv_size;
        j += chunk_len;
    }

    std.debug.assert(cvs_offset == ctx.output_cvs.len);
}

/// Helper to process N leaves in SIMD and absorb CVs into state
inline fn processAndAbsorbNLeaves(
    comptime Variant: type,
    comptime N: usize,
    view: *const MultiSliceView,
    j: usize,
    leaf_buffer: []u8,
    final_state: anytype,
) void {
    const cv_size = Variant.cv_size;
    if (view.tryGetSlice(j, j + N * chunk_size)) |leaf_data| {
        var leaf_cvs: [N * cv_size]u8 align(cache_line_size) = undefined;
        processLeaves(Variant, N, leaf_data, &leaf_cvs);
        final_state.update(&leaf_cvs);
    } else {
        view.copyRange(j, j + N * chunk_size, leaf_buffer[0 .. N * chunk_size]);
        var leaf_cvs: [N * cv_size]u8 align(cache_line_size) = undefined;
        processLeaves(Variant, N, leaf_buffer[0 .. N * chunk_size], &leaf_cvs);
        final_state.update(&leaf_cvs);
    }
}

/// Generic single-threaded implementation
fn ktSingleThreaded(comptime Variant: type, view: *const MultiSliceView, total_len: usize, output: []u8) void {
    const cv_size = Variant.cv_size;
    const StateType = Variant.StateType;

    // Initialize streaming TurboSHAKE state for final node (delimiter 0x06 is set in the type)
    var final_state = StateType.init(.{});

    // Absorb the first chunk (B = chunk_size bytes) of the input
    var first_b_buffer: [chunk_size]u8 = undefined;
    if (view.tryGetSlice(0, chunk_size)) |first_chunk| {
        final_state.update(first_chunk);
    } else {
        view.copyRange(0, chunk_size, &first_b_buffer);
        final_state.update(&first_b_buffer);
    }

    // Absorb padding bytes (8 bytes: 0x03 followed by 7 zeros)
    const padding = [_]u8{ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
    final_state.update(&padding);

    var j: usize = chunk_size;
    var n: usize = 0;

    // Temporary buffers for boundary-spanning leaves and CV computation
    var leaf_buffer: [chunk_size * 8]u8 align(cache_line_size) = undefined;
    var cv_buffer: [64]u8 = undefined; // Max CV size is 64 bytes

    // Process leaves in SIMD batches (8x, 4x, 2x)
    inline for ([_]usize{ 8, 4, 2 }) |batch_size| {
        while (optimal_vector_len >= batch_size and j + batch_size * chunk_size <= total_len) {
            processAndAbsorbNLeaves(Variant, batch_size, view, j, &leaf_buffer, &final_state);
            j += batch_size * chunk_size;
            n += batch_size;
        }
    }

    // Process remaining leaves one at a time
    while (j < total_len) {
        const chunk_len = @min(chunk_size, total_len - j);
        if (view.tryGetSlice(j, j + chunk_len)) |leaf_data| {
            const cv_slice = MultiSliceView.init(leaf_data, &[_]u8{}, &[_]u8{});
            Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
            final_state.update(cv_buffer[0..cv_size]); // Absorb CV immediately
        } else {
            view.copyRange(j, j + chunk_len, leaf_buffer[0..chunk_len]);
            const cv_slice = MultiSliceView.init(leaf_buffer[0..chunk_len], &[_]u8{}, &[_]u8{});
            Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
            final_state.update(cv_buffer[0..cv_size]);
        }
        j += chunk_size;
        n += 1;
    }

    // Absorb right_encode(n) and terminator
    const n_enc = rightEncode(n);
    final_state.update(n_enc.slice());
    const terminator = [_]u8{ 0xFF, 0xFF };
    final_state.update(&terminator);

    // Finalize and squeeze output
    final_state.final(output);
}

fn BatchResult(comptime Variant: type) type {
    const cv_size = Variant.cv_size;
    const leaves_per_batch = bytes_per_batch / chunk_size;
    const max_cvs_size = leaves_per_batch * cv_size;

    return struct {
        batch_idx: usize,
        cv_len: usize,
        cvs: [max_cvs_size]u8,
    };
}

fn SelectLeafContext(comptime Variant: type) type {
    const cv_size = Variant.cv_size;
    const Result = BatchResult(Variant);

    return struct {
        view: *const MultiSliceView,
        batch_idx: usize,
        start_offset: usize,
        num_leaves: usize,

        fn process(ctx: @This()) Result {
            var result: Result = .{
                .batch_idx = ctx.batch_idx,
                .cv_len = ctx.num_leaves * cv_size,
                .cvs = undefined,
            };

            var leaf_buffer: [bytes_per_batch]u8 align(cache_line_size) = undefined;
            var leaves_processed: usize = 0;
            var byte_offset = ctx.start_offset;
            var cv_offset: usize = 0;
            const simd_batch_bytes = optimal_vector_len * chunk_size;
            while (leaves_processed + optimal_vector_len <= ctx.num_leaves) {
                if (ctx.view.tryGetSlice(byte_offset, byte_offset + simd_batch_bytes)) |leaf_data| {
                    var leaf_cvs: [optimal_vector_len * Variant.cv_size]u8 = undefined;
                    processLeaves(Variant, optimal_vector_len, leaf_data, &leaf_cvs);
                    @memcpy(result.cvs[cv_offset..][0..leaf_cvs.len], &leaf_cvs);
                } else {
                    ctx.view.copyRange(byte_offset, byte_offset + simd_batch_bytes, leaf_buffer[0..simd_batch_bytes]);
                    var leaf_cvs: [optimal_vector_len * Variant.cv_size]u8 = undefined;
                    processLeaves(Variant, optimal_vector_len, leaf_buffer[0..simd_batch_bytes], &leaf_cvs);
                    @memcpy(result.cvs[cv_offset..][0..leaf_cvs.len], &leaf_cvs);
                }
                leaves_processed += optimal_vector_len;
                byte_offset += optimal_vector_len * chunk_size;
                cv_offset += optimal_vector_len * cv_size;
            }

            while (leaves_processed < ctx.num_leaves) {
                const leaf_end = byte_offset + chunk_size;
                var cv_buffer: [64]u8 = undefined;

                if (ctx.view.tryGetSlice(byte_offset, leaf_end)) |leaf_data| {
                    const cv_slice = MultiSliceView.init(leaf_data, &[_]u8{}, &[_]u8{});
                    Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
                } else {
                    ctx.view.copyRange(byte_offset, leaf_end, leaf_buffer[0..chunk_size]);
                    const cv_slice = MultiSliceView.init(leaf_buffer[0..chunk_size], &[_]u8{}, &[_]u8{});
                    Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
                }
                @memcpy(result.cvs[cv_offset..][0..cv_size], cv_buffer[0..cv_size]);

                leaves_processed += 1;
                byte_offset += chunk_size;
                cv_offset += cv_size;
            }

            return result;
        }
    };
}

fn FinalLeafContext(comptime Variant: type) type {
    return struct {
        view: *const MultiSliceView,
        start_offset: usize,
        leaf_len: usize,
        output_cv: []align(@alignOf(u64)) u8,

        fn process(ctx: @This()) void {
            const cv_size = Variant.cv_size;
            var leaf_buffer: [chunk_size]u8 = undefined;
            var cv_buffer: [64]u8 = undefined;

            if (ctx.view.tryGetSlice(ctx.start_offset, ctx.start_offset + ctx.leaf_len)) |leaf_data| {
                const cv_slice = MultiSliceView.init(leaf_data, &[_]u8{}, &[_]u8{});
                Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
            } else {
                ctx.view.copyRange(ctx.start_offset, ctx.start_offset + ctx.leaf_len, leaf_buffer[0..ctx.leaf_len]);
                const cv_slice = MultiSliceView.init(leaf_buffer[0..ctx.leaf_len], &[_]u8{}, &[_]u8{});
                Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
            }
            @memcpy(ctx.output_cv[0..cv_size], cv_buffer[0..cv_size]);
        }
    };
}

fn ktMultiThreaded(
    comptime Variant: type,
    allocator: Allocator,
    io: Io,
    view: *const MultiSliceView,
    total_len: usize,
    output: []u8,
) !void {
    comptime std.debug.assert(bytes_per_batch % (optimal_vector_len * chunk_size) == 0);

    const cv_size = Variant.cv_size;
    const StateType = Variant.StateType;
    const leaves_per_batch = bytes_per_batch / chunk_size;
    const remaining_bytes = total_len - chunk_size;
    const total_leaves = std.math.divCeil(usize, remaining_bytes, chunk_size) catch unreachable;

    var final_state = StateType.init(.{});

    var first_chunk_buffer: [chunk_size]u8 = undefined;
    if (view.tryGetSlice(0, chunk_size)) |first_chunk| {
        final_state.update(first_chunk);
    } else {
        view.copyRange(0, chunk_size, &first_chunk_buffer);
        final_state.update(&first_chunk_buffer);
    }

    const padding = [_]u8{ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
    final_state.update(&padding);

    const full_leaves = remaining_bytes / chunk_size;
    const has_partial_leaf = (remaining_bytes % chunk_size) != 0;
    const partial_leaf_size = if (has_partial_leaf) remaining_bytes % chunk_size else 0;

    if (full_leaves > 0) {
        const total_batches = std.math.divCeil(usize, full_leaves, leaves_per_batch) catch unreachable;
        const max_concurrent: usize = @min(256, total_batches);

        const Result = BatchResult(Variant);
        const SelectResult = union(enum) { batch: Result };
        const Select = Io.Select(SelectResult);

        const select_buf = try allocator.alloc(SelectResult, max_concurrent);
        defer allocator.free(select_buf);

        // Buffer for out-of-order results (select_buf slots get reused)
        const pending_cv_buf = try allocator.alloc([leaves_per_batch * cv_size]u8, max_concurrent);
        defer allocator.free(pending_cv_buf);
        var pending_cv_lens: [256]usize = .{0} ** 256;

        var select: Select = .init(io, select_buf);
        var batches_spawned: usize = 0;
        var next_to_process: usize = 0;

        while (next_to_process < total_batches) {
            while (batches_spawned < total_batches and batches_spawned - next_to_process < max_concurrent) {
                const batch_start_leaf = batches_spawned * leaves_per_batch;
                const batch_leaves = @min(leaves_per_batch, full_leaves - batch_start_leaf);
                const start_offset = chunk_size + batch_start_leaf * chunk_size;

                select.async(.batch, SelectLeafContext(Variant).process, .{SelectLeafContext(Variant){
                    .view = view,
                    .batch_idx = batches_spawned,
                    .start_offset = start_offset,
                    .num_leaves = batch_leaves,
                }});
                batches_spawned += 1;
            }

            const result = select.wait() catch unreachable;
            const batch = result.batch;
            const slot = batch.batch_idx % max_concurrent;

            if (batch.batch_idx == next_to_process) {
                final_state.update(batch.cvs[0..batch.cv_len]);
                next_to_process += 1;

                // Drain pending batches that are now ready
                while (next_to_process < total_batches) {
                    const pending_slot = next_to_process % max_concurrent;
                    const pending_len = pending_cv_lens[pending_slot];
                    if (pending_len == 0) break;

                    final_state.update(pending_cv_buf[pending_slot][0..pending_len]);
                    pending_cv_lens[pending_slot] = 0;
                    next_to_process += 1;
                }
            } else {
                @memcpy(pending_cv_buf[slot][0..batch.cv_len], batch.cvs[0..batch.cv_len]);
                pending_cv_lens[slot] = batch.cv_len;
            }
        }

        select.group.wait(io);
    }

    if (has_partial_leaf) {
        var cv_buffer: [64]u8 = undefined;
        var leaf_buffer: [chunk_size]u8 = undefined;

        const start_offset = chunk_size + full_leaves * chunk_size;
        if (view.tryGetSlice(start_offset, start_offset + partial_leaf_size)) |leaf_data| {
            const cv_slice = MultiSliceView.init(leaf_data, &[_]u8{}, &[_]u8{});
            Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
        } else {
            view.copyRange(start_offset, start_offset + partial_leaf_size, leaf_buffer[0..partial_leaf_size]);
            const cv_slice = MultiSliceView.init(leaf_buffer[0..partial_leaf_size], &[_]u8{}, &[_]u8{});
            Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
        }
        final_state.update(cv_buffer[0..cv_size]);
    }

    const n_enc = rightEncode(total_leaves);
    final_state.update(n_enc.slice());
    const terminator = [_]u8{ 0xFF, 0xFF };
    final_state.update(&terminator);

    final_state.final(output);
}

/// Generic KangarooTwelve hash function builder.
/// Creates a public API type with one-shot (`hash`, `hashParallel`) and streaming (`init`/`update`/`final`) methods for a specific variant.
fn KTHash(
    comptime Variant: type,
    comptime singleChunkFn: fn (*const MultiSliceView, u8, []u8) void,
) type {
    return struct {
        const Self = @This();
        const StateType = Variant.StateType;

        /// The recommended output length, in bytes.
        pub const digest_length = Variant.security_level / 8 * 2;
        /// The block length, or rate, in bytes.
        pub const block_length = Variant.rate;

        /// Configuration options for KangarooTwelve hashing.
        ///
        /// Options include an optional customization string that provides domain separation,
        /// ensuring that identical inputs with different customization strings
        /// produce completely distinct hash outputs.
        ///
        /// This prevents hash collisions when the same data is hashed in different contexts.
        ///
        /// Customization strings can be of any length.
        ///
        /// Common options for customization:
        ///
        /// - Key derivation or MAC: 16-byte secret for KT128, 32-byte secret for KT256
        /// - Context Separation: domain-specific strings (e.g., "email", "password", "session")
        /// - Composite Keys: concatenation of secret key + context string
        pub const Options = struct {
            customization: ?[]const u8 = null,
        };

        // Message buffer (accumulates message data only, not customization)
        buffer: [chunk_size]u8,
        buffer_len: usize,
        message_len: usize,

        // Customization string (fixed at init)
        customization: []const u8,
        custom_len_enc: RightEncoded,

        // Tree mode state (lazy initialization when buffer overflows first time)
        first_chunk: ?[chunk_size]u8, // Saved first chunk for tree mode
        final_state: ?StateType, // Running TurboSHAKE state for final node
        num_leaves: usize, // Count of leaves processed (after first chunk)

        // SIMD chunk batching
        pending_chunks: [8 * chunk_size]u8 align(cache_line_size), // Buffer for up to 8 chunks
        pending_count: usize, // Number of complete chunks in pending_chunks

        /// Initialize a KangarooTwelve hashing context.
        ///
        /// Options include an optional customization string that provides domain separation,
        /// ensuring that identical inputs with different customization strings
        /// produce completely distinct hash outputs.
        ///
        /// This prevents hash collisions when the same data is hashed in different contexts.
        ///
        /// Customization strings can be of any length.
        ///
        /// Common options for customization:
        ///
        /// - Key derivation or MAC: 16-byte secret for KT128, 32-byte secret for KT256
        /// - Context Separation: domain-specific strings (e.g., "email", "password", "session")
        /// - Composite Keys: concatenation of secret key + context string
        pub fn init(options: Options) Self {
            const custom = options.customization orelse &[_]u8{};
            return .{
                .buffer = undefined,
                .buffer_len = 0,
                .message_len = 0,
                .customization = custom,
                .custom_len_enc = rightEncode(custom.len),
                .first_chunk = null,
                .final_state = null,
                .num_leaves = 0,
                .pending_chunks = undefined,
                .pending_count = 0,
            };
        }

        /// Flush all pending chunks using SIMD when possible
        fn flushPendingChunks(self: *Self) void {
            const cv_size = Variant.cv_size;

            // Process all pending chunks using the largest SIMD batch sizes possible
            while (self.pending_count > 0) {
                // Try SIMD batches in decreasing size order
                inline for ([_]usize{ 8, 4, 2 }) |batch_size| {
                    if (optimal_vector_len >= batch_size and self.pending_count >= batch_size) {
                        var leaf_cvs: [batch_size * cv_size]u8 align(cache_line_size) = undefined;
                        processLeaves(Variant, batch_size, self.pending_chunks[0 .. batch_size * chunk_size], &leaf_cvs);
                        self.final_state.?.update(&leaf_cvs);
                        self.num_leaves += batch_size;
                        self.pending_count -= batch_size;

                        // Shift remaining chunks to the front
                        if (self.pending_count > 0) {
                            const remaining_bytes = self.pending_count * chunk_size;
                            @memcpy(self.pending_chunks[0..remaining_bytes], self.pending_chunks[batch_size * chunk_size ..][0..remaining_bytes]);
                        }
                        break; // Continue outer loop to try next batch
                    }
                }

                // If no SIMD batch was possible, process one chunk with scalar code
                if (self.pending_count > 0 and self.pending_count < 2) {
                    var cv_buffer: [64]u8 = undefined;
                    const cv_slice = MultiSliceView.init(self.pending_chunks[0..chunk_size], &[_]u8{}, &[_]u8{});
                    Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
                    self.final_state.?.update(cv_buffer[0..cv_size]);
                    self.num_leaves += 1;
                    self.pending_count -= 1;
                    break; // No more chunks to process
                }
            }
        }

        /// Absorb data into the hash state.
        /// Can be called multiple times to incrementally add data.
        pub fn update(self: *Self, data: []const u8) void {
            if (data.len == 0) return;

            var remaining = data;

            while (remaining.len > 0) {
                const space_in_buffer = chunk_size - self.buffer_len;
                const to_copy = @min(space_in_buffer, remaining.len);

                // Copy data into buffer
                @memcpy(self.buffer[self.buffer_len..][0..to_copy], remaining[0..to_copy]);
                self.buffer_len += to_copy;
                self.message_len += to_copy;
                remaining = remaining[to_copy..];

                // If buffer is full, process it
                if (self.buffer_len == chunk_size) {
                    if (self.first_chunk == null) {
                        // First time buffer fills - initialize tree mode
                        self.first_chunk = self.buffer;
                        self.final_state = StateType.init(.{});

                        // Absorb first chunk into final state
                        self.final_state.?.update(&self.buffer);

                        // Absorb padding (8 bytes: 0x03 followed by 7 zeros)
                        const padding = [_]u8{ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
                        self.final_state.?.update(&padding);
                    } else {
                        // Add chunk to pending buffer for SIMD batch processing
                        @memcpy(self.pending_chunks[self.pending_count * chunk_size ..][0..chunk_size], &self.buffer);
                        self.pending_count += 1;

                        // Flush when we have enough chunks for optimal SIMD batch
                        // Determine best batch size for this architecture
                        const optimal_batch_size = comptime blk: {
                            if (optimal_vector_len >= 8) break :blk 8;
                            if (optimal_vector_len >= 4) break :blk 4;
                            if (optimal_vector_len >= 2) break :blk 2;
                            break :blk 1;
                        };
                        if (self.pending_count >= optimal_batch_size) {
                            self.flushPendingChunks();
                        }
                    }
                    self.buffer_len = 0;
                }
            }
        }

        /// Finalize the hash and produce output.
        ///
        /// Unlike traditional hash functions, the output can be of any length.
        ///
        /// When using as a regular hash function, use the recommended `digest_length` value (32 bytes for KT128, 64 bytes for KT256).
        ///
        /// After calling this method, the context should not be reused. However, the structure can be cloned before finalizing
        /// to compute multiple hashes with the same prefix.
        pub fn final(self: *Self, out: []u8) void {
            const cv_size = Variant.cv_size;

            // Calculate total length: message + customization + right_encode(customization.len)
            const total_len = self.message_len + self.customization.len + self.custom_len_enc.len;

            // Single chunk mode: total data fits in one chunk
            if (total_len <= chunk_size) {
                // Build the complete input: buffer + customization + encoded length
                var single_chunk: [chunk_size]u8 = undefined;
                @memcpy(single_chunk[0..self.buffer_len], self.buffer[0..self.buffer_len]);
                @memcpy(single_chunk[self.buffer_len..][0..self.customization.len], self.customization);
                @memcpy(single_chunk[self.buffer_len + self.customization.len ..][0..self.custom_len_enc.len], self.custom_len_enc.slice());

                const view = MultiSliceView.init(single_chunk[0..total_len], &[_]u8{}, &[_]u8{});
                singleChunkFn(&view, 0x07, out);
                return;
            }

            // Flush any pending chunks with SIMD
            self.flushPendingChunks();

            // Build view over remaining data (buffer + customization + encoding)
            const remaining_view = MultiSliceView.init(
                self.buffer[0..self.buffer_len],
                self.customization,
                self.custom_len_enc.slice(),
            );
            const remaining_len = remaining_view.totalLen();

            var final_leaves = self.num_leaves;
            var leaf_start: usize = 0;

            // Tree mode: initialize if not already done (lazy initialization)
            if (self.final_state == null and remaining_len > 0) {
                self.final_state = StateType.init(.{});

                // Absorb first chunk (up to chunk_size bytes from remaining data)
                const first_chunk_len = @min(chunk_size, remaining_len);
                if (remaining_view.tryGetSlice(0, first_chunk_len)) |first_chunk| {
                    // Data is contiguous, use it directly
                    self.final_state.?.update(first_chunk);
                } else {
                    // Data spans boundaries, copy to buffer
                    var first_chunk_buf: [chunk_size]u8 = undefined;
                    remaining_view.copyRange(0, first_chunk_len, first_chunk_buf[0..first_chunk_len]);
                    self.final_state.?.update(first_chunk_buf[0..first_chunk_len]);
                }

                // Absorb padding (8 bytes: 0x03 followed by 7 zeros)
                const padding = [_]u8{ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
                self.final_state.?.update(&padding);

                // Process remaining data as leaves
                leaf_start = first_chunk_len;
            }

            // Process all remaining data as leaves (starting from leaf_start)
            var offset = leaf_start;
            while (offset < remaining_len) {
                const leaf_end = @min(offset + chunk_size, remaining_len);
                const leaf_size = leaf_end - offset;

                var cv_buffer: [64]u8 = undefined;
                if (remaining_view.tryGetSlice(offset, leaf_end)) |leaf_data| {
                    // Data is contiguous, use it directly
                    const cv_slice = MultiSliceView.init(leaf_data, &[_]u8{}, &[_]u8{});
                    Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
                } else {
                    // Data spans boundaries, copy to buffer
                    var leaf_buf: [chunk_size]u8 = undefined;
                    remaining_view.copyRange(offset, leaf_end, leaf_buf[0..leaf_size]);
                    const cv_slice = MultiSliceView.init(leaf_buf[0..leaf_size], &[_]u8{}, &[_]u8{});
                    Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
                }
                self.final_state.?.update(cv_buffer[0..cv_size]);
                final_leaves += 1;
                offset = leaf_end;
            }

            // Absorb right_encode(num_leaves) and terminator
            const n_enc = rightEncode(final_leaves);
            self.final_state.?.update(n_enc.slice());
            const terminator = [_]u8{ 0xFF, 0xFF };
            self.final_state.?.update(&terminator);

            // Squeeze output
            self.final_state.?.final(out);
        }

        /// Hash a message using sequential processing with SIMD acceleration.
        ///
        /// Parameters:
        ///   - message: Input data to hash (any length)
        ///   - out: Output buffer (any length, arbitrary output sizes supported, `digest_length` recommended for standard use)
        ///   - options: Optional settings to include a secret key or a context separation string
        pub fn hash(message: []const u8, out: []u8, options: Options) !void {
            const custom = options.customization orelse &[_]u8{};

            // Right-encode customization length
            const custom_len_enc = rightEncode(custom.len);

            // Create zero-copy multi-slice view (no concatenation)
            const view = MultiSliceView.init(message, custom, custom_len_enc.slice());
            const total_len = view.totalLen();

            // Single chunk case - zero-copy absorption!
            if (total_len <= chunk_size) {
                singleChunkFn(&view, 0x07, out);
                return;
            }

            // Tree mode - single-threaded SIMD processing
            ktSingleThreaded(Variant, &view, total_len, out);
        }

        /// Hash with automatic parallelization for large inputs (at least 2 MiB).
        /// Automatically uses sequential processing for smaller inputs to avoid thread overhead.
        /// Allocator required for temporary buffers. IO object required for thread management.
        pub fn hashParallel(message: []const u8, out: []u8, options: Options, allocator: Allocator, io: Io) !void {
            const custom = options.customization orelse &[_]u8{};

            const custom_len_enc = rightEncode(custom.len);
            const view = MultiSliceView.init(message, custom, custom_len_enc.slice());
            const total_len = view.totalLen();

            // Single chunk case
            if (total_len <= chunk_size) {
                singleChunkFn(&view, 0x07, out);
                return;
            }

            // Use single-threaded processing if below threshold
            if (total_len < large_file_threshold) {
                ktSingleThreaded(Variant, &view, total_len, out);
                return;
            }

            // Tree mode - multi-threaded processing
            try ktMultiThreaded(Variant, allocator, io, &view, total_len, out);
        }
    };
}

/// KangarooTwelve is a fast, secure cryptographic hash function that uses tree-hashing
/// on top of TurboSHAKE. It is built on the Keccak permutation, the same primitive
/// underlying SHA-3, which has undergone over 15 years of intensive cryptanalysis
/// since the SHA-3 competition (2008-2012) and remains secure.
///
/// K12 uses Keccak-p[1600,12] with 12 rounds (half of SHA-3's 24 rounds), providing
/// 128-bit security strength equivalent to AES-128 and SHAKE128. While this offers
/// less conservative margin than SHA-3, current cryptanalysis reaches only 6 rounds,
/// leaving a substantial security margin. This deliberate trade-off delivers
/// significantly better performance while maintaining strong practical security.
///
/// Standardized as RFC 9861 after 8 years of public scrutiny. Supports arbitrary-length
/// output and optional customization strings for domain separation.
pub const KT128 = KTHash(KT128Variant, turboShake128MultiSliceToBuffer);

/// KangarooTwelve is a fast, secure cryptographic hash function that uses tree-hashing
/// on top of TurboSHAKE. It is built on the Keccak permutation, the same primitive
/// underlying SHA-3, which has undergone over 15 years of intensive cryptanalysis
/// since the SHA-3 competition (2008-2012) and remains secure.
///
/// KT256 provides 256-bit security strength and achieves NIST post-quantum security
/// level 2 when using at least 256-bit outputs. Like KT128, it uses Keccak-p[1600,12]
/// with 12 rounds, offering a deliberate trade-off between conservative margin and
/// significantly better performance while maintaining strong practical security.
///
/// Use KT256 when you need extra conservative margins.
/// For most applications, KT128 offers better performance with adequate security.
pub const KT256 = KTHash(KT256Variant, turboShake256MultiSliceToBuffer);
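
// Usage sketch: the streaming API (init/update/final) must agree with the
// one-shot hash, and a customization string acts as a domain separator.
// Message contents and split points here are arbitrary.
test "KT128 streaming API matches one-shot hash" {
    const message = "The quick brown fox jumps over the lazy dog";

    var expected: [KT128.digest_length]u8 = undefined;
    try KT128.hash(message, &expected, .{});

    var st = KT128.init(.{});
    st.update(message[0..10]);
    st.update(message[10..]);
    var actual: [KT128.digest_length]u8 = undefined;
    st.final(&actual);

    try std.testing.expectEqualSlices(u8, &expected, &actual);
}

test "KT128 customization provides domain separation" {
    const message = "identical input";
    var a: [32]u8 = undefined;
    var b: [32]u8 = undefined;
    try KT128.hash(message, &a, .{ .customization = "context-a" });
    try KT128.hash(message, &b, .{ .customization = "context-b" });
    try std.testing.expect(!std.mem.eql(u8, &a, &b));
}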
1307
1308test "KT128 sequential and parallel produce same output for small inputs" {
1309    const allocator = std.testing.allocator;
1310    const io = std.testing.io;
1311
1312    var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
1313    const random = prng.random();
1314
1315    // Test with different small input sizes
1316    const test_sizes = [_]usize{ 100, 1024, 4096, 8192 }; // 100B, 1KB, 4KB, 8KB
1317
1318    for (test_sizes) |size| {
1319        const input = try allocator.alloc(u8, size);
1320        defer allocator.free(input);
1321
1322        // Fill with random data
1323        random.bytes(input);
1324
1325        var output_seq: [32]u8 = undefined;
1326        var output_par: [32]u8 = undefined;
1327
1328        // Hash with sequential method
1329        try KT128.hash(input, &output_seq, .{});
1330
1331        // Hash with parallel method
1332        try KT128.hashParallel(input, &output_par, .{}, allocator, io);
1333
1334        // Verify outputs match
1335        try std.testing.expectEqualSlices(u8, &output_seq, &output_par);
1336    }
1337}
1338
1339test "KT128 sequential and parallel produce same output for large inputs" {
1340    const allocator = std.testing.allocator;
1341    const io = std.testing.io;
1342
1343    var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
1344    const random = prng.random();
1345
1346    // Test with input sizes above the 2MB threshold to trigger parallel processing.
1347    // Include a size with partial final leaf to stress boundary handling.
1348    const test_sizes = [_]usize{
1349        5 * 512 * 1024, // 2.5 MB
1350        5 * 512 * 1024 + 8191, // 2.5 MB + 8191B (partial leaf)
1351    };
1352
1353    for (test_sizes) |size| {
1354        const input = try allocator.alloc(u8, size);
1355        defer allocator.free(input);
1356
1357        // Fill with random data
1358        random.bytes(input);
1359
1360        var output_seq: [64]u8 = undefined;
1361        var output_par: [64]u8 = undefined;
1362
1363        // Hash with sequential method
1364        try KT128.hash(input, &output_seq, .{});
1365
1366        // Hash with parallel method
1367        try KT128.hashParallel(input, &output_par, .{}, allocator, io);
1368
1369        // Verify outputs match
1370        try std.testing.expectEqualSlices(u8, &output_seq, &output_par);
1371    }
1372}
1373
1374test "KT128 sequential and parallel produce same output for many random lengths" {
1375    const allocator = std.testing.allocator;
1376    const io = std.testing.io;
1377
1378    var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
1379    const random = prng.random();
1380
1381    const num_tests = if (builtin.mode == .Debug) 10 else 1000;
1382    const max_length = 250000;
1383
1384    for (0..num_tests) |_| {
1385        const length = random.intRangeAtMost(usize, 0, max_length);
1386
1387        const input = try allocator.alloc(u8, length);
1388        defer allocator.free(input);
1389
1390        random.bytes(input);
1391
1392        var output_seq: [32]u8 = undefined;
1393        var output_par: [32]u8 = undefined;
1394
1395        try KT128.hash(input, &output_seq, .{});
1396        try KT128.hashParallel(input, &output_par, .{}, allocator, io);
1397
1398        try std.testing.expectEqualSlices(u8, &output_seq, &output_par);
1399    }
1400}
1401
1402test "KT128 sequential and parallel produce same output with customization" {
1403    const allocator = std.testing.allocator;
1404    const io = std.testing.io;
1405
1406    var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
1407    const random = prng.random();
1408
    const input_size = 5 * 512 * 1024; // 2.5 MB
1410    const input = try allocator.alloc(u8, input_size);
1411    defer allocator.free(input);
1412
1413    // Fill with random data
1414    random.bytes(input);
1415
1416    const customization = "test domain";
1417    var output_seq: [48]u8 = undefined;
1418    var output_par: [48]u8 = undefined;
1419
1420    // Hash with sequential method
1421    try KT128.hash(input, &output_seq, .{ .customization = customization });
1422
1423    // Hash with parallel method
1424    try KT128.hashParallel(input, &output_par, .{ .customization = customization }, allocator, io);
1425
1426    // Verify outputs match
1427    try std.testing.expectEqualSlices(u8, &output_seq, &output_par);
1428}
1429
1430test "KT256 sequential and parallel produce same output for small inputs" {
1431    const allocator = std.testing.allocator;
1432    const io = std.testing.io;
1433
1434    var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
1435    const random = prng.random();
1436
1437    // Test with different small input sizes
1438    const test_sizes = [_]usize{ 100, 1024, 4096, 8192 }; // 100B, 1KB, 4KB, 8KB
1439
1440    for (test_sizes) |size| {
1441        const input = try allocator.alloc(u8, size);
1442        defer allocator.free(input);
1443
1444        // Fill with random data
1445        random.bytes(input);
1446
1447        var output_seq: [64]u8 = undefined;
1448        var output_par: [64]u8 = undefined;
1449
1450        // Hash with sequential method
1451        try KT256.hash(input, &output_seq, .{});
1452
1453        // Hash with parallel method
1454        try KT256.hashParallel(input, &output_par, .{}, allocator, io);
1455
1456        // Verify outputs match
1457        try std.testing.expectEqualSlices(u8, &output_seq, &output_par);
1458    }
1459}
1460
1461test "KT256 sequential and parallel produce same output for large inputs" {
1462    const allocator = std.testing.allocator;
1463    const io = std.testing.io;
1464
1465    var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
1466    const random = prng.random();
1467
1468    // Test with input sizes above the 2MB threshold to trigger parallel processing.
1469    // Include a size with partial final leaf to stress boundary handling.
1470    const test_sizes = [_]usize{
1471        5 * 512 * 1024, // 2.5 MB
1472        5 * 512 * 1024 + 8191, // 2.5 MB + 8191B (partial leaf)
1473    };
1474
1475    for (test_sizes) |size| {
1476        const input = try allocator.alloc(u8, size);
1477        defer allocator.free(input);
1478
1479        // Fill with random data
1480        random.bytes(input);
1481
1482        var output_seq: [64]u8 = undefined;
1483        var output_par: [64]u8 = undefined;
1484
1485        // Hash with sequential method
1486        try KT256.hash(input, &output_seq, .{});
1487
1488        // Hash with parallel method
1489        try KT256.hashParallel(input, &output_par, .{}, allocator, io);
1490
1491        // Verify outputs match
1492        try std.testing.expectEqualSlices(u8, &output_seq, &output_par);
1493    }
1494}
1495
1496test "KT256 sequential and parallel produce same output with customization" {
1497    const allocator = std.testing.allocator;
1498    const io = std.testing.io;
1499
1500    var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
1501    const random = prng.random();
1502
    const input_size = 5 * 512 * 1024; // 2.5 MB
1504    const input = try allocator.alloc(u8, input_size);
1505    defer allocator.free(input);
1506
1507    // Fill with random data
1508    random.bytes(input);
1509
1510    const customization = "test domain";
1511    var output_seq: [80]u8 = undefined;
1512    var output_par: [80]u8 = undefined;
1513
1514    // Hash with sequential method
1515    try KT256.hash(input, &output_seq, .{ .customization = customization });
1516
1517    // Hash with parallel method
1518    try KT256.hashParallel(input, &output_par, .{ .customization = customization }, allocator, io);
1519
1520    // Verify outputs match
1521    try std.testing.expectEqualSlices(u8, &output_seq, &output_par);
1522}
1523
/// Helper: generate the repeating byte pattern used by the RFC 9861 test vectors,
/// where data[i] = i % 251
1525fn generatePattern(allocator: Allocator, len: usize) ![]u8 {
1526    const data = try allocator.alloc(u8, len);
1527    for (data, 0..) |*byte, i| {
1528        byte.* = @intCast(i % 251);
1529    }
1530    return data;
1531}
1532
1533test "KT128: empty message, empty customization, 32 bytes" {
1534    var output: [32]u8 = undefined;
1535    try KT128.hash(&[_]u8{}, &output, .{});
1536
1537    var expected: [32]u8 = undefined;
1538    _ = try std.fmt.hexToBytes(&expected, "1AC2D450FC3B4205D19DA7BFCA1B37513C0803577AC7167F06FE2CE1F0EF39E5");
1539    try std.testing.expectEqualSlices(u8, &expected, &output);
1540}
1541
1542test "KT128: empty message, empty customization, 64 bytes" {
1543    var output: [64]u8 = undefined;
1544    try KT128.hash(&[_]u8{}, &output, .{});
1545
1546    var expected: [64]u8 = undefined;
1547    _ = try std.fmt.hexToBytes(&expected, "1AC2D450FC3B4205D19DA7BFCA1B37513C0803577AC7167F06FE2CE1F0EF39E54269C056B8C82E48276038B6D292966CC07A3D4645272E31FF38508139EB0A71");
1548    try std.testing.expectEqualSlices(u8, &expected, &output);
1549}
1550
1551test "KT128: empty message, empty customization, 10032 bytes (last 32)" {
1552    const allocator = std.testing.allocator;
1553    const output = try allocator.alloc(u8, 10032);
1554    defer allocator.free(output);
1555
1556    try KT128.hash(&[_]u8{}, output, .{});
1557
1558    var expected: [32]u8 = undefined;
1559    _ = try std.fmt.hexToBytes(&expected, "E8DC563642F7228C84684C898405D3A834799158C079B12880277A1D28E2FF6D");
1560    try std.testing.expectEqualSlices(u8, &expected, output[10000..]);
1561}
1562
1563test "KT128: pattern message (1 byte), empty customization, 32 bytes" {
1564    const allocator = std.testing.allocator;
1565    const message = try generatePattern(allocator, 1);
1566    defer allocator.free(message);
1567
1568    var output: [32]u8 = undefined;
1569    try KT128.hash(message, &output, .{});
1570
1571    var expected: [32]u8 = undefined;
1572    _ = try std.fmt.hexToBytes(&expected, "2BDA92450E8B147F8A7CB629E784A058EFCA7CF7D8218E02D345DFAA65244A1F");
1573    try std.testing.expectEqualSlices(u8, &expected, &output);
1574}
1575
1576test "KT128: pattern message (17 bytes), empty customization, 32 bytes" {
1577    const allocator = std.testing.allocator;
1578    const message = try generatePattern(allocator, 17);
1579    defer allocator.free(message);
1580
1581    var output: [32]u8 = undefined;
1582    try KT128.hash(message, &output, .{});
1583
1584    var expected: [32]u8 = undefined;
1585    _ = try std.fmt.hexToBytes(&expected, "6BF75FA2239198DB4772E36478F8E19B0F371205F6A9A93A273F51DF37122888");
1586    try std.testing.expectEqualSlices(u8, &expected, &output);
1587}
1588
1589test "KT128: pattern message (289 bytes), empty customization, 32 bytes" {
1590    const allocator = std.testing.allocator;
1591    const message = try generatePattern(allocator, 289);
1592    defer allocator.free(message);
1593
1594    var output: [32]u8 = undefined;
1595    try KT128.hash(message, &output, .{});
1596
1597    var expected: [32]u8 = undefined;
1598    _ = try std.fmt.hexToBytes(&expected, "0C315EBCDEDBF61426DE7DCF8FB725D1E74675D7F5327A5067F367B108ECB67C");
1599    try std.testing.expectEqualSlices(u8, &expected, &output);
1600}
1601
1602test "KT128: 0xFF message (1 byte), pattern customization (1 byte), 32 bytes" {
1603    const allocator = std.testing.allocator;
1604    const customization = try generatePattern(allocator, 1);
1605    defer allocator.free(customization);
1606
1607    const message = [_]u8{0xFF};
1608    var output: [32]u8 = undefined;
1609    try KT128.hash(&message, &output, .{ .customization = customization });
1610
1611    var expected: [32]u8 = undefined;
1612    _ = try std.fmt.hexToBytes(&expected, "A20B92B251E3D62443EC286E4B9B470A4E8315C156EEB24878B038ABE20650BE");
1613    try std.testing.expectEqualSlices(u8, &expected, &output);
1614}
1615
1616test "KT128: pattern message (8191 bytes), empty customization, 32 bytes" {
1617    const allocator = std.testing.allocator;
1618    const message = try generatePattern(allocator, 8191);
1619    defer allocator.free(message);
1620
1621    var output: [32]u8 = undefined;
1622    try KT128.hash(message, &output, .{});
1623
1624    var expected: [32]u8 = undefined;
1625    _ = try std.fmt.hexToBytes(&expected, "1B577636F723643E990CC7D6A659837436FD6A103626600EB8301CD1DBE553D6");
1626    try std.testing.expectEqualSlices(u8, &expected, &output);
1627}
1628
1629test "KT128: pattern message (8192 bytes), empty customization, 32 bytes" {
1630    const allocator = std.testing.allocator;
1631    const message = try generatePattern(allocator, 8192);
1632    defer allocator.free(message);
1633
1634    var output: [32]u8 = undefined;
1635    try KT128.hash(message, &output, .{});
1636
1637    var expected: [32]u8 = undefined;
1638    _ = try std.fmt.hexToBytes(&expected, "48F256F6772F9EDFB6A8B661EC92DC93B95EBD05A08A17B39AE3490870C926C3");
1639    try std.testing.expectEqualSlices(u8, &expected, &output);
1640}
1641
1642test "KT256: empty message, empty customization, 64 bytes" {
1643    var output: [64]u8 = undefined;
1644    try KT256.hash(&[_]u8{}, &output, .{});
1645
1646    var expected: [64]u8 = undefined;
1647    _ = try std.fmt.hexToBytes(&expected, "B23D2E9CEA9F4904E02BEC06817FC10CE38CE8E93EF4C89E6537076AF8646404E3E8B68107B8833A5D30490AA33482353FD4ADC7148ECB782855003AAEBDE4A9");
1648    try std.testing.expectEqualSlices(u8, &expected, &output);
1649}
1650
1651test "KT256: empty message, empty customization, 128 bytes" {
1652    var output: [128]u8 = undefined;
1653    try KT256.hash(&[_]u8{}, &output, .{});
1654
1655    var expected: [128]u8 = undefined;
1656    _ = try std.fmt.hexToBytes(&expected, "B23D2E9CEA9F4904E02BEC06817FC10CE38CE8E93EF4C89E6537076AF8646404E3E8B68107B8833A5D30490AA33482353FD4ADC7148ECB782855003AAEBDE4A9B0925319D8EA1E121A609821EC19EFEA89E6D08DAEE1662B69C840289F188BA860F55760B61F82114C030C97E5178449608CCD2CD2D919FC7829FF69931AC4D0");
1657    try std.testing.expectEqualSlices(u8, &expected, &output);
1658}
1659
1660test "KT256: pattern message (1 byte), empty customization, 64 bytes" {
1661    const allocator = std.testing.allocator;
1662    const message = try generatePattern(allocator, 1);
1663    defer allocator.free(message);
1664
1665    var output: [64]u8 = undefined;
1666    try KT256.hash(message, &output, .{});
1667
1668    var expected: [64]u8 = undefined;
1669    _ = try std.fmt.hexToBytes(&expected, "0D005A194085360217128CF17F91E1F71314EFA5564539D444912E3437EFA17F82DB6F6FFE76E781EAA068BCE01F2BBF81EACB983D7230F2FB02834A21B1DDD0");
1670    try std.testing.expectEqualSlices(u8, &expected, &output);
1671}
1672
1673test "KT256: pattern message (17 bytes), empty customization, 64 bytes" {
1674    const allocator = std.testing.allocator;
1675    const message = try generatePattern(allocator, 17);
1676    defer allocator.free(message);
1677
1678    var output: [64]u8 = undefined;
1679    try KT256.hash(message, &output, .{});
1680
1681    var expected: [64]u8 = undefined;
1682    _ = try std.fmt.hexToBytes(&expected, "1BA3C02B1FC514474F06C8979978A9056C8483F4A1B63D0DCCEFE3A28A2F323E1CDCCA40EBF006AC76EF0397152346837B1277D3E7FAA9C9653B19075098527B");
1683    try std.testing.expectEqualSlices(u8, &expected, &output);
1684}
1685
1686test "KT256: pattern message (8191 bytes), empty customization, 64 bytes" {
1687    const allocator = std.testing.allocator;
1688    const message = try generatePattern(allocator, 8191);
1689    defer allocator.free(message);
1690
1691    var output: [64]u8 = undefined;
1692    try KT256.hash(message, &output, .{});
1693
1694    var expected: [64]u8 = undefined;
1695    _ = try std.fmt.hexToBytes(&expected, "3081434D93A4108D8D8A3305B89682CEBEDC7CA4EA8A3CE869FBB73CBE4A58EEF6F24DE38FFC170514C70E7AB2D01F03812616E863D769AFB3753193BA045B20");
1696    try std.testing.expectEqualSlices(u8, &expected, &output);
1697}
1698
1699test "KT256: pattern message (8192 bytes), empty customization, 64 bytes" {
1700    const allocator = std.testing.allocator;
1701    const message = try generatePattern(allocator, 8192);
1702    defer allocator.free(message);
1703
1704    var output: [64]u8 = undefined;
1705    try KT256.hash(message, &output, .{});
1706
1707    var expected: [64]u8 = undefined;
1708    _ = try std.fmt.hexToBytes(&expected, "C6EE8E2AD3200C018AC87AAA031CDAC22121B412D07DC6E0DCCBB53423747E9A1C18834D99DF596CF0CF4B8DFAFB7BF02D139D0C9035725ADC1A01B7230A41FA");
1709    try std.testing.expectEqualSlices(u8, &expected, &output);
1710}
1711
1712test "KT128: pattern message (8193 bytes), empty customization, 32 bytes" {
1713    const allocator = std.testing.allocator;
1714    const message = try generatePattern(allocator, 8193);
1715    defer allocator.free(message);
1716
1717    var output: [32]u8 = undefined;
1718    try KT128.hash(message, &output, .{});
1719
1720    var expected: [32]u8 = undefined;
1721    _ = try std.fmt.hexToBytes(&expected, "BB66FE72EAEA5179418D5295EE1344854D8AD7F3FA17EFCB467EC152341284CF");
1722    try std.testing.expectEqualSlices(u8, &expected, &output);
1723}
1724
1725test "KT128: pattern message (16384 bytes), empty customization, 32 bytes" {
1726    const allocator = std.testing.allocator;
1727    const message = try generatePattern(allocator, 16384);
1728    defer allocator.free(message);
1729
1730    var output: [32]u8 = undefined;
1731    try KT128.hash(message, &output, .{});
1732
1733    var expected: [32]u8 = undefined;
1734    _ = try std.fmt.hexToBytes(&expected, "82778F7F7234C83352E76837B721FBDBB5270B88010D84FA5AB0B61EC8CE0956");
1735    try std.testing.expectEqualSlices(u8, &expected, &output);
1736}
1737
1738test "KT128: pattern message (16385 bytes), empty customization, 32 bytes" {
1739    const allocator = std.testing.allocator;
1740    const message = try generatePattern(allocator, 16385);
1741    defer allocator.free(message);
1742
1743    var output: [32]u8 = undefined;
1744    try KT128.hash(message, &output, .{});
1745
1746    var expected: [32]u8 = undefined;
1747    _ = try std.fmt.hexToBytes(&expected, "5F8D2B943922B451842B4E82740D02369E2D5F9F33C5123509A53B955FE177B2");
1748    try std.testing.expectEqualSlices(u8, &expected, &output);
1749}
1750
1751test "KT256: pattern message (8193 bytes), empty customization, 64 bytes" {
1752    const allocator = std.testing.allocator;
1753    const message = try generatePattern(allocator, 8193);
1754    defer allocator.free(message);
1755
1756    var output: [64]u8 = undefined;
1757    try KT256.hash(message, &output, .{});
1758
1759    var expected: [64]u8 = undefined;
1760    _ = try std.fmt.hexToBytes(&expected, "65FF03335900E5197ACBD5F41B797F0E7E36AD4FF7D89C09FA6F28AE58D1E8BC2DF1779B86F988C3B13690172914EA172423B23EF4057255BB0836AB3A99836E");
1761    try std.testing.expectEqualSlices(u8, &expected, &output);
1762}
1763
1764test "KT256: pattern message (16384 bytes), empty customization, 64 bytes" {
1765    const allocator = std.testing.allocator;
1766    const message = try generatePattern(allocator, 16384);
1767    defer allocator.free(message);
1768
1769    var output: [64]u8 = undefined;
1770    try KT256.hash(message, &output, .{});
1771
1772    var expected: [64]u8 = undefined;
1773    _ = try std.fmt.hexToBytes(&expected, "74604239A14847CB79069B4FF0E51070A93034C9AC4DFF4D45E0F2C5DA81D930DE6055C2134B4DF4E49F27D1B2C66E95491858B182A924BD0504DA5976BC516D");
1774    try std.testing.expectEqualSlices(u8, &expected, &output);
1775}
1776
1777test "KT256: pattern message (16385 bytes), empty customization, 64 bytes" {
1778    const allocator = std.testing.allocator;
1779    const message = try generatePattern(allocator, 16385);
1780    defer allocator.free(message);
1781
1782    var output: [64]u8 = undefined;
1783    try KT256.hash(message, &output, .{});
1784
1785    var expected: [64]u8 = undefined;
1786    _ = try std.fmt.hexToBytes(&expected, "C814F23132DADBFD55379F18CB988CB39B751F119322823FD982644A897485397B9F40EB11C6E416359B8AE695A5CE0FA79D1ADA1EEC745D82E0A5AB08A9F014");
1787    try std.testing.expectEqualSlices(u8, &expected, &output);
1788}
1789
1790test "KT128 incremental: empty message matches one-shot" {
1791    var output_oneshot: [32]u8 = undefined;
1792    var output_incremental: [32]u8 = undefined;
1793
1794    try KT128.hash(&[_]u8{}, &output_oneshot, .{});
1795
1796    var hasher = KT128.init(.{});
1797    hasher.final(&output_incremental);
1798
1799    try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1800}
1801
1802test "KT128 incremental: small message matches one-shot" {
1803    const message = "Hello, KangarooTwelve!";
1804
1805    var output_oneshot: [32]u8 = undefined;
1806    var output_incremental: [32]u8 = undefined;
1807
1808    try KT128.hash(message, &output_oneshot, .{});
1809
1810    var hasher = KT128.init(.{});
1811    hasher.update(message);
1812    hasher.final(&output_incremental);
1813
1814    try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1815}
1816
1817test "KT128 incremental: multiple updates match single update" {
1818    const part1 = "Hello, ";
1819    const part2 = "Kangaroo";
1820    const part3 = "Twelve!";
1821
1822    var output_single: [32]u8 = undefined;
1823    var output_multi: [32]u8 = undefined;
1824
1825    // Single update
1826    var hasher1 = KT128.init(.{});
1827    hasher1.update(part1 ++ part2 ++ part3);
1828    hasher1.final(&output_single);
1829
1830    // Multiple updates
1831    var hasher2 = KT128.init(.{});
1832    hasher2.update(part1);
1833    hasher2.update(part2);
1834    hasher2.update(part3);
1835    hasher2.final(&output_multi);
1836
1837    try std.testing.expectEqualSlices(u8, &output_single, &output_multi);
1838}
1839
1840test "KT128 incremental: exactly chunk_size matches one-shot" {
1841    const allocator = std.testing.allocator;
1842    const message = try allocator.alloc(u8, 8192);
1843    defer allocator.free(message);
1844    @memset(message, 0xAB);
1845
1846    var output_oneshot: [32]u8 = undefined;
1847    var output_incremental: [32]u8 = undefined;
1848
1849    try KT128.hash(message, &output_oneshot, .{});
1850
1851    var hasher = KT128.init(.{});
1852    hasher.update(message);
1853    hasher.final(&output_incremental);
1854
1855    try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1856}
1857
1858test "KT128 incremental: larger than chunk_size matches one-shot" {
1859    const allocator = std.testing.allocator;
1860    const message = try generatePattern(allocator, 16384);
1861    defer allocator.free(message);
1862
1863    var output_oneshot: [32]u8 = undefined;
1864    var output_incremental: [32]u8 = undefined;
1865
1866    try KT128.hash(message, &output_oneshot, .{});
1867
1868    var hasher = KT128.init(.{});
1869    hasher.update(message);
1870    hasher.final(&output_incremental);
1871
1872    try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1873}
1874
1875test "KT128 incremental: with customization matches one-shot" {
1876    const message = "Test message";
1877    const customization = "my custom domain";
1878
1879    var output_oneshot: [32]u8 = undefined;
1880    var output_incremental: [32]u8 = undefined;
1881
1882    try KT128.hash(message, &output_oneshot, .{ .customization = customization });
1883
1884    var hasher = KT128.init(.{ .customization = customization });
1885    hasher.update(message);
1886    hasher.final(&output_incremental);
1887
1888    try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1889}
1890
1891test "KT128 incremental: large message with customization" {
1892    const allocator = std.testing.allocator;
1893    const message = try generatePattern(allocator, 20000);
1894    defer allocator.free(message);
1895    const customization = "test domain";
1896
1897    var output_oneshot: [48]u8 = undefined;
1898    var output_incremental: [48]u8 = undefined;
1899
1900    try KT128.hash(message, &output_oneshot, .{ .customization = customization });
1901
1902    var hasher = KT128.init(.{ .customization = customization });
1903    hasher.update(message);
1904    hasher.final(&output_incremental);
1905
1906    try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1907}
1908
1909test "KT128 incremental: streaming chunks matches one-shot" {
1910    const allocator = std.testing.allocator;
1911    const message = try generatePattern(allocator, 25000);
1912    defer allocator.free(message);
1913
1914    var output_oneshot: [32]u8 = undefined;
1915    var output_incremental: [32]u8 = undefined;
1916
1917    try KT128.hash(message, &output_oneshot, .{});
1918
1919    var hasher = KT128.init(.{});
1920
    // Feed in 1 KiB chunks
1922    var offset: usize = 0;
1923    while (offset < message.len) {
1924        const chunk_size_local = @min(1024, message.len - offset);
1925        hasher.update(message[offset..][0..chunk_size_local]);
1926        offset += chunk_size_local;
1927    }
1928    hasher.final(&output_incremental);
1929
1930    try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1931}
1932
1933test "KT256 incremental: empty message matches one-shot" {
1934    var output_oneshot: [64]u8 = undefined;
1935    var output_incremental: [64]u8 = undefined;
1936
1937    try KT256.hash(&[_]u8{}, &output_oneshot, .{});
1938
1939    var hasher = KT256.init(.{});
1940    hasher.final(&output_incremental);
1941
1942    try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1943}
1944
1945test "KT256 incremental: small message matches one-shot" {
1946    const message = "Hello, KangarooTwelve with 256-bit security!";
1947
1948    var output_oneshot: [64]u8 = undefined;
1949    var output_incremental: [64]u8 = undefined;
1950
1951    try KT256.hash(message, &output_oneshot, .{});
1952
1953    var hasher = KT256.init(.{});
1954    hasher.update(message);
1955    hasher.final(&output_incremental);
1956
1957    try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1958}
1959
1960test "KT256 incremental: large message matches one-shot" {
1961    const allocator = std.testing.allocator;
1962    const message = try generatePattern(allocator, 30000);
1963    defer allocator.free(message);
1964
1965    var output_oneshot: [64]u8 = undefined;
1966    var output_incremental: [64]u8 = undefined;
1967
1968    try KT256.hash(message, &output_oneshot, .{});
1969
1970    var hasher = KT256.init(.{});
1971    hasher.update(message);
1972    hasher.final(&output_incremental);
1973
1974    try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1975}
1976
1977test "KT256 incremental: with customization matches one-shot" {
1978    const allocator = std.testing.allocator;
1979    const message = try generatePattern(allocator, 15000);
1980    defer allocator.free(message);
1981    const customization = "KT256 custom domain";
1982
1983    var output_oneshot: [80]u8 = undefined;
1984    var output_incremental: [80]u8 = undefined;
1985
1986    try KT256.hash(message, &output_oneshot, .{ .customization = customization });
1987
1988    var hasher = KT256.init(.{ .customization = customization });
1989    hasher.update(message);
1990    hasher.final(&output_incremental);
1991
1992    try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1993}
1994
1995test "KT128 incremental: random small message with random chunk sizes" {
1996    const allocator = std.testing.allocator;
1997
1998    var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
1999    const random = prng.random();
2000
2001    const test_sizes = [_]usize{ 100, 500, 2000, 5000, 10000 };
2002
2003    for (test_sizes) |total_size| {
2004        const message = try allocator.alloc(u8, total_size);
2005        defer allocator.free(message);
2006        random.bytes(message);
2007
2008        var output_oneshot: [32]u8 = undefined;
2009        var output_incremental: [32]u8 = undefined;
2010
2011        try KT128.hash(message, &output_oneshot, .{});
2012
2013        var hasher = KT128.init(.{});
2014        var offset: usize = 0;
2015
2016        while (offset < message.len) {
2017            const remaining = message.len - offset;
2018            const max_chunk = @min(1000, remaining);
2019            const chunk_size_local = if (max_chunk == 1) 1 else random.intRangeAtMost(usize, 1, max_chunk);
2020
2021            hasher.update(message[offset..][0..chunk_size_local]);
2022            offset += chunk_size_local;
2023        }
2024        hasher.final(&output_incremental);
2025
2026        try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
2027    }
2028}
2029
2030test "KT128 incremental: random large message (1MB) with random chunk sizes" {
2031    const allocator = std.testing.allocator;
2032
2033    var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
2034    const random = prng.random();
2035
2036    const total_size: usize = 1024 * 1024; // 1 MB
2037    const message = try allocator.alloc(u8, total_size);
2038    defer allocator.free(message);
2039    random.bytes(message);
2040
2041    var output_oneshot: [32]u8 = undefined;
2042    var output_incremental: [32]u8 = undefined;
2043
2044    try KT128.hash(message, &output_oneshot, .{});
2045
2046    var hasher = KT128.init(.{});
2047    var offset: usize = 0;
2048
2049    while (offset < message.len) {
2050        const remaining = message.len - offset;
2051        const max_chunk = @min(10000, remaining);
2052        const chunk_size_local = if (max_chunk == 1) 1 else random.intRangeAtMost(usize, 1, max_chunk);
2053
2054        hasher.update(message[offset..][0..chunk_size_local]);
2055        offset += chunk_size_local;
2056    }
2057    hasher.final(&output_incremental);
2058
2059    try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
2060}
2061
2062test "KT256 incremental: random small message with random chunk sizes" {
2063    const allocator = std.testing.allocator;
2064
2065    var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
2066    const random = prng.random();
2067
2068    const test_sizes = [_]usize{ 100, 500, 2000, 5000, 10000 };
2069
2070    for (test_sizes) |total_size| {
2071        // Generate random message
2072        const message = try allocator.alloc(u8, total_size);
2073        defer allocator.free(message);
2074        random.bytes(message);
2075
2076        var output_oneshot: [64]u8 = undefined;
2077        var output_incremental: [64]u8 = undefined;
2078
2079        try KT256.hash(message, &output_oneshot, .{});
2080
2081        var hasher = KT256.init(.{});
2082        var offset: usize = 0;
2083
2084        while (offset < message.len) {
2085            const remaining = message.len - offset;
2086            const max_chunk = @min(1000, remaining);
2087            const chunk_size_local = if (max_chunk == 1) 1 else random.intRangeAtMost(usize, 1, max_chunk);
2088
2089            hasher.update(message[offset..][0..chunk_size_local]);
2090            offset += chunk_size_local;
2091        }
2092        hasher.final(&output_incremental);
2093
2094        try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
2095    }
2096}
2097
2098test "KT256 incremental: random large message (1MB) with random chunk sizes" {
2099    const allocator = std.testing.allocator;
2100
2101    var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
2102    const random = prng.random();
2103
2104    const total_size: usize = 1024 * 1024; // 1 MB
2105    const message = try allocator.alloc(u8, total_size);
2106    defer allocator.free(message);
2107    random.bytes(message);
2108
2109    var output_oneshot: [64]u8 = undefined;
2110    var output_incremental: [64]u8 = undefined;
2111
2112    try KT256.hash(message, &output_oneshot, .{});
2113
2114    var hasher = KT256.init(.{});
2115    var offset: usize = 0;
2116
2117    while (offset < message.len) {
2118        const remaining = message.len - offset;
2119        const max_chunk = @min(10000, remaining);
2120        const chunk_size_local = if (max_chunk == 1) 1 else random.intRangeAtMost(usize, 1, max_chunk);
2121
2122        hasher.update(message[offset..][0..chunk_size_local]);
2123        offset += chunk_size_local;
2124    }
2125    hasher.final(&output_incremental);
2126
2127    try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
2128}
2129
2130test "KT128 incremental: random message with customization and random chunks" {
2131    const allocator = std.testing.allocator;
2132
2133    var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
2134    const random = prng.random();
2135
2136    const total_size: usize = 50000;
2137    const message = try allocator.alloc(u8, total_size);
2138    defer allocator.free(message);
2139    random.bytes(message);
2140
2141    const customization = "random test domain";
2142
2143    var output_oneshot: [48]u8 = undefined;
2144    var output_incremental: [48]u8 = undefined;
2145
2146    try KT128.hash(message, &output_oneshot, .{ .customization = customization });
2147
2148    var hasher = KT128.init(.{ .customization = customization });
2149    var offset: usize = 0;
2150
2151    while (offset < message.len) {
2152        const remaining = message.len - offset;
2153        const max_chunk = @min(5000, remaining);
2154        const chunk_size_local = if (max_chunk == 1) 1 else random.intRangeAtMost(usize, 1, max_chunk);
2155
2156        hasher.update(message[offset..][0..chunk_size_local]);
2157        offset += chunk_size_local;
2158    }
2159    hasher.final(&output_incremental);
2160
2161    try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
2162}