const std = @import("std");
const builtin = @import("builtin");
const fmt = std.fmt;
const mem = std.mem;
const Io = std.Io;
const Thread = std.Thread;

const Vec4 = @Vector(4, u32);
const Vec8 = @Vector(8, u32);
const Vec16 = @Vector(16, u32);

const chunk_length = 1024;
const max_depth = 54;

const simd_degree = std.simd.suggestVectorLength(u32) orelse 1;
const max_simd_degree = simd_degree;
const max_simd_degree_or_2 = if (max_simd_degree > 2) max_simd_degree else 2;

/// Threshold for switching to parallel processing.
/// Below this size, sequential hashing is used.
/// Benchmarks generally show significant speedup starting at 3 MiB.
const parallel_threshold = 3 * 1024 * 1024;

const iv: [8]u32 = .{
    0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
    0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
};

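// Precomputed message word schedules for the 7 rounds: row r is the BLAKE3
// message permutation applied r times, so each round can index message words
// directly instead of permuting the block between rounds.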
const msg_schedule: [7][16]u8 = .{
    .{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
    .{ 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 },
    .{ 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 },
    .{ 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 },
    .{ 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 },
    .{ 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 },
    .{ 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 },
};

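// Domain-separation flags for the compression function, matching the bit
// assignments in the BLAKE3 specification.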
const Flags = packed struct(u8) {
    chunk_start: bool = false,
    chunk_end: bool = false,
    parent: bool = false,
    root: bool = false,
    keyed_hash: bool = false,
    derive_key_context: bool = false,
    derive_key_material: bool = false,
    reserved: bool = false,

    fn toInt(self: Flags) u8 {
        return @bitCast(self);
    }

    fn with(self: Flags, other: Flags) Flags {
        return @bitCast(self.toInt() | other.toInt());
    }
};

const rotr = std.math.rotr;

inline fn rotr32(w: u32, c: u5) u32 {
    return rotr(u32, w, c);
}

inline fn load32(bytes: []const u8) u32 {
    return mem.readInt(u32, bytes[0..4], .little);
}

inline fn store32(bytes: []u8, w: u32) void {
    mem.writeInt(u32, bytes[0..4], w, .little);
}

fn loadKeyWords(key: [Blake3.key_length]u8) [8]u32 {
    var key_words: [8]u32 = undefined;
    for (0..8) |i| {
        key_words[i] = load32(key[i * 4 ..][0..4]);
    }
    return key_words;
}

fn storeCvWords(cv_words: [8]u32) [Blake3.digest_length]u8 {
    var bytes: [Blake3.digest_length]u8 = undefined;
    for (0..8) |i| {
        store32(bytes[i * 4 ..][0..4], cv_words[i]);
    }
    return bytes;
}

fn loadCvWords(bytes: [Blake3.digest_length]u8) [8]u32 {
    var cv_words: [8]u32 = undefined;
    for (0..8) |i| {
        cv_words[i] = load32(bytes[i * 4 ..][0..4]);
    }
    return cv_words;
}

inline fn counterLow(counter: u64) u32 {
    return @truncate(counter);
}

inline fn counterHigh(counter: u64) u32 {
    return @truncate(counter >> 32);
}

fn highestOne(x: u64) u6 {
    if (x == 0) return 0;
    return @intCast(63 - @clz(x));
}

fn roundDownToPowerOf2(x: u64) u64 {
    return @as(u64, 1) << highestOne(x | 1);
}

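// The quarter-round mixing function: folds two message words x and y into the
// four state words a, b, c, d using wrapping adds, xors, and rotations by
// 16, 12, 8, and 7 bits.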
inline fn g(state: *[16]u32, a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) void {
    state[a] +%= state[b] +% x;
    state[d] = rotr32(state[d] ^ state[a], 16);
    state[c] +%= state[d];
    state[b] = rotr32(state[b] ^ state[c], 12);
    state[a] +%= state[b] +% y;
    state[d] = rotr32(state[d] ^ state[a], 8);
    state[c] +%= state[d];
    state[b] = rotr32(state[b] ^ state[c], 7);
}

inline fn roundFn(state: *[16]u32, msg: *const [16]u32, round: usize) void {
    const schedule = &msg_schedule[round];

    g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
    g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
    g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
    g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);

    g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
    g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
    g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
    g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
}

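// Loads the 64-byte block into words, initializes the 16-word state from the
// chaining value, IV prefix, counter, block length, and flags, then applies
// all 7 rounds. Callers fold the state into a CV or into XOF output.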
fn compressPre(state: *[16]u32, cv: *const [8]u32, block: []const u8, block_len: u8, counter: u64, flags: Flags) void {
    var block_words: [16]u32 = undefined;
    for (0..16) |i| {
        block_words[i] = load32(block[i * 4 ..][0..4]);
    }

    for (0..8) |i| {
        state[i] = cv[i];
    }
    for (0..4) |i| {
        state[i + 8] = iv[i];
    }
    state[12] = counterLow(counter);
    state[13] = counterHigh(counter);
    state[14] = @as(u32, block_len);
    state[15] = @as(u32, flags.toInt());

    for (0..7) |round| {
        roundFn(state, &block_words, round);
    }
}

fn compressInPlace(cv: *[8]u32, block: []const u8, block_len: u8, counter: u64, flags: Flags) void {
    var state: [16]u32 = undefined;
    compressPre(&state, cv, block, block_len, counter, flags);
    for (0..8) |i| {
        cv[i] = state[i] ^ state[i + 8];
    }
}

fn compressXof(cv: *const [8]u32, block: []const u8, block_len: u8, counter: u64, flags: Flags, out: *[64]u8) void {
    var state: [16]u32 = undefined;
    compressPre(&state, cv, block, block_len, counter, flags);

    for (0..8) |i| {
        store32(out[i * 4 ..][0..4], state[i] ^ state[i + 8]);
    }
    for (0..8) |i| {
        store32(out[(i + 8) * 4 ..][0..4], state[i + 8] ^ cv[i]);
    }
}

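// Hashes `blocks` consecutive blocks of a single input, applying `flags_start`
// to the first block and `flags_end` to the last, and returns the resulting
// chaining value serialized as bytes.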
fn hashOne(input: []const u8, blocks: usize, key: [8]u32, counter: u64, flags: Flags, flags_start: Flags, flags_end: Flags) [Blake3.digest_length]u8 {
    var cv = key;
    var block_flags = flags.with(flags_start);
    var inp = input;
    var remaining_blocks = blocks;

    while (remaining_blocks > 0) {
        if (remaining_blocks == 1) {
            block_flags = block_flags.with(flags_end);
        }
        compressInPlace(&cv, inp[0..Blake3.block_length], Blake3.block_length, counter, block_flags);
        inp = inp[Blake3.block_length..];
        remaining_blocks -= 1;
        block_flags = flags;
    }

    return storeCvWords(cv);
}

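// Portable scalar fallback: hashes each input independently, optionally
// incrementing the counter between inputs.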
fn hashManyPortable(inputs: [][*]const u8, num_inputs: usize, blocks: usize, key: [8]u32, counter_arg: u64, increment_counter: bool, flags: Flags, flags_start: Flags, flags_end: Flags, out: []u8) void {
    var counter = counter_arg;
    for (0..num_inputs) |i| {
        const input = inputs[i][0 .. blocks * Blake3.block_length];
        const result = hashOne(input, blocks, key, counter, flags, flags_start, flags_end);
        @memcpy(out[i * Blake3.digest_length ..][0..Blake3.digest_length], &result);
        if (increment_counter) {
            counter += 1;
        }
    }
}

fn transposeNxN(comptime Vec: type, comptime n: comptime_int, vecs: *[n]Vec) void {
    const temp: [n]Vec = vecs.*;

    inline for (0..n) |i| {
        inline for (0..n) |j| {
            vecs[i][j] = temp[j][i];
        }
    }
}

fn transposeMsg(comptime Vec: type, comptime n: comptime_int, inputs: [n][*]const u8, block_offset: usize, out: *[16]Vec) void {
    const info = @typeInfo(Vec);
    if (info != .vector) @compileError("transposeMsg requires a vector type");
    if (info.vector.len != n) @compileError("vector width must match N");

    var temp: [n][16]u32 = undefined;

    for (0..n) |i| {
        const block = inputs[i] + block_offset;
        for (0..16) |j| {
            temp[i][j] = load32(block[j * 4 ..][0..4]);
        }
    }

    for (0..16) |j| {
        var result: Vec = undefined;
        inline for (0..n) |i| {
            result[i] = temp[i][j];
        }
        out[j] = result;
    }
}

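// Vectorized round function: the same column and diagonal mixing as `roundFn`,
// but every vector lane carries the state of a different input.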
fn roundFnVec(comptime Vec: type, v: *[16]Vec, m: *const [16]Vec, r: usize) void {
    const schedule = &msg_schedule[r];

    // Column round - first half
    inline for (0..4) |i| {
        v[i] +%= m[schedule[i * 2]];
    }
    inline for (0..4) |i| {
        v[i] +%= v[i + 4];
    }
    inline for (0..4) |i| {
        v[i + 12] ^= v[i];
    }
    inline for (0..4) |i| {
        v[i + 12] = rotr(Vec, v[i + 12], 16);
    }
    inline for (0..4) |i| {
        v[i + 8] +%= v[i + 12];
    }
    inline for (0..4) |i| {
        v[i + 4] ^= v[i + 8];
    }
    inline for (0..4) |i| {
        v[i + 4] = rotr(Vec, v[i + 4], 12);
    }

    // Column round - second half
    inline for (0..4) |i| {
        v[i] +%= m[schedule[i * 2 + 1]];
    }
    inline for (0..4) |i| {
        v[i] +%= v[i + 4];
    }
    inline for (0..4) |i| {
        v[i + 12] ^= v[i];
    }
    inline for (0..4) |i| {
        v[i + 12] = rotr(Vec, v[i + 12], 8);
    }
    inline for (0..4) |i| {
        v[i + 8] +%= v[i + 12];
    }
    inline for (0..4) |i| {
        v[i + 4] ^= v[i + 8];
    }
    inline for (0..4) |i| {
        v[i + 4] = rotr(Vec, v[i + 4], 7);
    }

    // Diagonal round - first half
    inline for (0..4) |i| {
        v[i] +%= m[schedule[i * 2 + 8]];
    }
    const b_indices = [4]u8{ 5, 6, 7, 4 };
    inline for (0..4) |i| {
        v[i] +%= v[b_indices[i]];
    }
    const d_indices = [4]u8{ 15, 12, 13, 14 };
    inline for (0..4) |i| {
        v[d_indices[i]] ^= v[i];
    }
    inline for (0..4) |i| {
        v[d_indices[i]] = rotr(Vec, v[d_indices[i]], 16);
    }
    const c_indices = [4]u8{ 10, 11, 8, 9 };
    inline for (0..4) |i| {
        v[c_indices[i]] +%= v[d_indices[i]];
    }
    inline for (0..4) |i| {
        v[b_indices[i]] ^= v[c_indices[i]];
    }
    inline for (0..4) |i| {
        v[b_indices[i]] = rotr(Vec, v[b_indices[i]], 12);
    }

    // Diagonal round - second half
    inline for (0..4) |i| {
        v[i] +%= m[schedule[i * 2 + 9]];
    }
    inline for (0..4) |i| {
        v[i] +%= v[b_indices[i]];
    }
    inline for (0..4) |i| {
        v[d_indices[i]] ^= v[i];
    }
    inline for (0..4) |i| {
        v[d_indices[i]] = rotr(Vec, v[d_indices[i]], 8);
    }
    inline for (0..4) |i| {
        v[c_indices[i]] +%= v[d_indices[i]];
    }
    inline for (0..4) |i| {
        v[b_indices[i]] ^= v[c_indices[i]];
    }
    inline for (0..4) |i| {
        v[b_indices[i]] = rotr(Vec, v[b_indices[i]], 7);
    }
}

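// Hashes `n` equal-length inputs in parallel, one per vector lane. Message
// blocks are transposed into word-sliced form so that each vector holds the
// same word position across all `n` inputs.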
fn hashVec(
    comptime Vec: type,
    comptime n: comptime_int,
    inputs: [n][*]const u8,
    blocks: usize,
    key: [8]u32,
    counter: u64,
    increment_counter: bool,
    flags: Flags,
    flags_start: Flags,
    flags_end: Flags,
    out: *[n * Blake3.digest_length]u8,
) void {
    var h_vecs: [8]Vec = undefined;
    for (0..8) |i| {
        h_vecs[i] = @splat(key[i]);
    }

    const counter_low_vec = if (increment_counter) blk: {
        var result: Vec = undefined;
        inline for (0..n) |i| {
            result[i] = counterLow(counter + i);
        }
        break :blk result;
    } else @as(Vec, @splat(counterLow(counter)));

    const counter_high_vec = if (increment_counter) blk: {
        var result: Vec = undefined;
        inline for (0..n) |i| {
            result[i] = counterHigh(counter + i);
        }
        break :blk result;
    } else @as(Vec, @splat(counterHigh(counter)));

    var block_flags = flags.with(flags_start);

    for (0..blocks) |block| {
        if (block + 1 == blocks) {
            block_flags = block_flags.with(flags_end);
        }

        const block_len_vec: Vec = @splat(Blake3.block_length);
        const block_flags_vec: Vec = @splat(@as(u32, block_flags.toInt()));

        var msg_vecs: [16]Vec = undefined;
        transposeMsg(Vec, n, inputs, block * Blake3.block_length, &msg_vecs);

        var v: [16]Vec = .{
            h_vecs[0],       h_vecs[1],        h_vecs[2],     h_vecs[3],
            h_vecs[4],       h_vecs[5],        h_vecs[6],     h_vecs[7],
            @splat(iv[0]),   @splat(iv[1]),    @splat(iv[2]), @splat(iv[3]),
            counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
        };

        inline for (0..7) |r| {
            roundFnVec(Vec, &v, &msg_vecs, r);
        }

        inline for (0..8) |i| {
            h_vecs[i] = v[i] ^ v[i + 8];
        }

        block_flags = flags;
    }

    // Output serialization - different strategies for different widths
    switch (n) {
        4 => {
            // Special interleaved pattern for Vec4
            var out_vecs = [4]Vec{ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3] };
            transposeNxN(Vec, 4, &out_vecs);
            inline for (0..4) |i| {
                mem.writeInt(u32, out[0 * 16 + i * 4 ..][0..4], out_vecs[0][i], .little);
            }
            inline for (0..4) |i| {
                mem.writeInt(u32, out[2 * 16 + i * 4 ..][0..4], out_vecs[1][i], .little);
            }
            inline for (0..4) |i| {
                mem.writeInt(u32, out[4 * 16 + i * 4 ..][0..4], out_vecs[2][i], .little);
            }
            inline for (0..4) |i| {
                mem.writeInt(u32, out[6 * 16 + i * 4 ..][0..4], out_vecs[3][i], .little);
            }

            out_vecs = [4]Vec{ h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7] };
            transposeNxN(Vec, 4, &out_vecs);
            inline for (0..4) |i| {
                mem.writeInt(u32, out[1 * 16 + i * 4 ..][0..4], out_vecs[0][i], .little);
            }
            inline for (0..4) |i| {
                mem.writeInt(u32, out[3 * 16 + i * 4 ..][0..4], out_vecs[1][i], .little);
            }
            inline for (0..4) |i| {
                mem.writeInt(u32, out[5 * 16 + i * 4 ..][0..4], out_vecs[2][i], .little);
            }
            inline for (0..4) |i| {
                mem.writeInt(u32, out[7 * 16 + i * 4 ..][0..4], out_vecs[3][i], .little);
            }
        },
        8 => {
            // Linear pattern with transpose for Vec8
            var out_vecs = [8]Vec{ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7] };
            transposeNxN(Vec, 8, &out_vecs);
            inline for (0..8) |i| {
                mem.writeInt(u32, out[0 * 32 + i * 4 ..][0..4], out_vecs[0][i], .little);
            }
            inline for (0..8) |i| {
                mem.writeInt(u32, out[1 * 32 + i * 4 ..][0..4], out_vecs[1][i], .little);
            }
            inline for (0..8) |i| {
                mem.writeInt(u32, out[2 * 32 + i * 4 ..][0..4], out_vecs[2][i], .little);
            }
            inline for (0..8) |i| {
                mem.writeInt(u32, out[3 * 32 + i * 4 ..][0..4], out_vecs[3][i], .little);
            }
            inline for (0..8) |i| {
                mem.writeInt(u32, out[4 * 32 + i * 4 ..][0..4], out_vecs[4][i], .little);
            }
            inline for (0..8) |i| {
                mem.writeInt(u32, out[5 * 32 + i * 4 ..][0..4], out_vecs[5][i], .little);
            }
            inline for (0..8) |i| {
                mem.writeInt(u32, out[6 * 32 + i * 4 ..][0..4], out_vecs[6][i], .little);
            }
            inline for (0..8) |i| {
                mem.writeInt(u32, out[7 * 32 + i * 4 ..][0..4], out_vecs[7][i], .little);
            }
        },
        16 => {
            // Direct lane-by-lane output for Vec16 (no transpose)
            inline for (0..16) |lane| {
                const hash_offset = lane * Blake3.digest_length;
                inline for (0..8) |word_idx| {
                    const word = h_vecs[word_idx][lane];
                    out[hash_offset + word_idx * 4 + 0] = @truncate(word);
                    out[hash_offset + word_idx * 4 + 1] = @truncate(word >> 8);
                    out[hash_offset + word_idx * 4 + 2] = @truncate(word >> 16);
                    out[hash_offset + word_idx * 4 + 3] = @truncate(word >> 24);
                }
            }
        },
        else => @compileError("Unsupported SIMD width"),
    }
}

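// SIMD dispatch: consumes inputs in batches of 16, 8, then 4 lanes depending
// on the available vector width, and hands any remainder to the portable path.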
fn hashManySimd(
    inputs: [][*]const u8,
    num_inputs: usize,
    blocks: usize,
    key: [8]u32,
    counter: u64,
    increment_counter: bool,
    flags: Flags,
    flags_start: Flags,
    flags_end: Flags,
    out: []u8,
) void {
    var remaining = num_inputs;
    var inp = inputs.ptr;
    var out_ptr = out.ptr;
    var cnt = counter;

    if (simd_degree >= 16) {
        while (remaining >= 16) {
            const sixteen_inputs = [16][*]const u8{
                inp[0],  inp[1],  inp[2],  inp[3],
                inp[4],  inp[5],  inp[6],  inp[7],
                inp[8],  inp[9],  inp[10], inp[11],
                inp[12], inp[13], inp[14], inp[15],
            };

            var simd_out: [16 * Blake3.digest_length]u8 = undefined;
            hashVec(Vec16, 16, sixteen_inputs, blocks, key, cnt, increment_counter, flags, flags_start, flags_end, &simd_out);

            @memcpy(out_ptr[0 .. 16 * Blake3.digest_length], &simd_out);

            if (increment_counter) cnt += 16;
            inp += 16;
            remaining -= 16;
            out_ptr += 16 * Blake3.digest_length;
        }
    }

    if (simd_degree >= 8) {
        while (remaining >= 8) {
            const eight_inputs = [8][*]const u8{
                inp[0], inp[1], inp[2], inp[3],
                inp[4], inp[5], inp[6], inp[7],
            };

            var simd_out: [8 * Blake3.digest_length]u8 = undefined;
            hashVec(Vec8, 8, eight_inputs, blocks, key, cnt, increment_counter, flags, flags_start, flags_end, &simd_out);

            @memcpy(out_ptr[0 .. 8 * Blake3.digest_length], &simd_out);

            if (increment_counter) cnt += 8;
            inp += 8;
            remaining -= 8;
            out_ptr += 8 * Blake3.digest_length;
        }
    }

    if (simd_degree >= 4) {
        while (remaining >= 4) {
            const four_inputs = [4][*]const u8{
                inp[0],
                inp[1],
                inp[2],
                inp[3],
            };

            var simd_out: [4 * Blake3.digest_length]u8 = undefined;
            hashVec(Vec4, 4, four_inputs, blocks, key, cnt, increment_counter, flags, flags_start, flags_end, &simd_out);

            @memcpy(out_ptr[0 .. 4 * Blake3.digest_length], &simd_out);

            if (increment_counter) cnt += 4;
            inp += 4;
            remaining -= 4;
            out_ptr += 4 * Blake3.digest_length;
        }
    }

    if (remaining > 0) {
        hashManyPortable(inp[0..remaining], remaining, blocks, key, cnt, increment_counter, flags, flags_start, flags_end, out_ptr[0 .. remaining * Blake3.digest_length]);
    }
}

fn hashMany(inputs: [][*]const u8, num_inputs: usize, blocks: usize, key: [8]u32, counter: u64, increment_counter: bool, flags: Flags, flags_start: Flags, flags_end: Flags, out: []u8) void {
    if (max_simd_degree >= 4) {
        hashManySimd(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
    } else {
        hashManyPortable(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out);
    }
}

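// Hashes all whole chunks in `input` with a single `hashMany` call, then
// hashes any trailing partial chunk with a ChunkState. Writes the chaining
// values to `out` and returns how many were written.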
fn compressChunksParallel(input: []const u8, key: [8]u32, chunk_counter: u64, flags: Flags, out: []u8) usize {
    var chunks_array: [max_simd_degree][*]const u8 = undefined;
    var input_position: usize = 0;
    var chunks_array_len: usize = 0;

    while (input.len - input_position >= chunk_length) {
        chunks_array[chunks_array_len] = input[input_position..].ptr;
        input_position += chunk_length;
        chunks_array_len += 1;
    }

    hashMany(chunks_array[0..chunks_array_len], chunks_array_len, chunk_length / Blake3.block_length, key, chunk_counter, true, flags, .{ .chunk_start = true }, .{ .chunk_end = true }, out);

    if (input.len > input_position) {
        const counter = chunk_counter + @as(u64, chunks_array_len);
        var chunk_state = ChunkState.init(key, flags);
        chunk_state.chunk_counter = counter;
        chunk_state.update(input[input_position..]);
        const output = chunk_state.output();
        const cv = output.chainingValue();
        const cv_bytes = storeCvWords(cv);
        @memcpy(out[chunks_array_len * Blake3.digest_length ..][0..Blake3.digest_length], &cv_bytes);
        return chunks_array_len + 1;
    } else {
        return chunks_array_len;
    }
}

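// Compresses pairs of child chaining values into parent chaining values with
// a single `hashMany` call; an odd trailing child is copied through as-is.
// Returns the number of chaining values written to `out`.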
fn compressParentsParallel(child_chaining_values: []const u8, num_chaining_values: usize, key: [8]u32, flags: Flags, out: []u8) usize {
    var parents_array: [max_simd_degree_or_2][*]const u8 = undefined;
    var parents_array_len: usize = 0;

    while (num_chaining_values - (2 * parents_array_len) >= 2) {
        parents_array[parents_array_len] = child_chaining_values[2 * parents_array_len * Blake3.digest_length ..].ptr;
        parents_array_len += 1;
    }

    hashMany(parents_array[0..parents_array_len], parents_array_len, 1, key, 0, false, flags.with(.{ .parent = true }), .{}, .{}, out);

    if (num_chaining_values > 2 * parents_array_len) {
        @memcpy(out[parents_array_len * Blake3.digest_length ..][0..Blake3.digest_length], child_chaining_values[2 * parents_array_len * Blake3.digest_length ..][0..Blake3.digest_length]);
        return parents_array_len + 1;
    } else {
        return parents_array_len;
    }
}

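// Recursively hashes a subtree wider than one SIMD batch: the input is split
// at a power-of-two chunk boundary, both halves are hashed, and the resulting
// chaining values are compressed into parents.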
fn compressSubtreeWide(input: []const u8, key: [8]u32, chunk_counter: u64, flags: Flags, out: []u8) usize {
    if (input.len <= max_simd_degree * chunk_length) {
        return compressChunksParallel(input, key, chunk_counter, flags, out);
    }

    const left_input_len = leftSubtreeLen(input.len);
    const right_input = input[left_input_len..];
    const right_chunk_counter = chunk_counter + @as(u64, left_input_len / chunk_length);

    var cv_array: [2 * max_simd_degree_or_2 * Blake3.digest_length]u8 = undefined;
    var degree: usize = max_simd_degree;
    if (left_input_len > chunk_length and degree == 1) {
        degree = 2;
    }
    const right_cvs = cv_array[degree * Blake3.digest_length ..];

    const left_n = compressSubtreeWide(input[0..left_input_len], key, chunk_counter, flags, cv_array[0..]);
    const right_n = compressSubtreeWide(right_input, key, right_chunk_counter, flags, right_cvs);

    if (left_n == 1) {
        @memcpy(out[0 .. 2 * Blake3.digest_length], cv_array[0 .. 2 * Blake3.digest_length]);
        return 2;
    }

    const num_chaining_values = left_n + right_n;
    return compressParentsParallel(&cv_array, num_chaining_values, key, flags, out);
}

fn compressSubtreeToParentNode(input: []const u8, key: [8]u32, chunk_counter: u64, flags: Flags, out: *[2 * Blake3.digest_length]u8) void {
    var cv_array: [max_simd_degree_or_2 * Blake3.digest_length]u8 = undefined;
    var num_cvs = compressSubtreeWide(input, key, chunk_counter, flags, &cv_array);

    if (max_simd_degree_or_2 > 2) {
        var out_array: [max_simd_degree_or_2 * Blake3.digest_length / 2]u8 = undefined;
        while (num_cvs > 2) {
            num_cvs = compressParentsParallel(&cv_array, num_cvs, key, flags, &out_array);
            @memcpy(cv_array[0 .. num_cvs * Blake3.digest_length], out_array[0 .. num_cvs * Blake3.digest_length]);
        }
    }

    @memcpy(out, cv_array[0 .. 2 * Blake3.digest_length]);
}

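// Length of the left subtree for an input of `input_len` bytes: the largest
// power-of-two number of full chunks that still leaves at least one byte of
// input for the right subtree.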
fn leftSubtreeLen(input_len: usize) usize {
    const full_chunks = (input_len - 1) / chunk_length;
    return @intCast(roundDownToPowerOf2(full_chunks) * chunk_length);
}

const ChunkBatch = struct {
    input: []const u8,
    start_chunk: usize,
    end_chunk: usize,
    cvs: [][8]u32,
    key: [8]u32,
    flags: Flags,

    fn process(ctx: ChunkBatch) void {
        var cv_buffer: [max_simd_degree * Blake3.digest_length]u8 = undefined;
        var chunk_idx = ctx.start_chunk;

        while (chunk_idx < ctx.end_chunk) {
            const remaining = ctx.end_chunk - chunk_idx;
            const batch_size: usize = @min(remaining, max_simd_degree);
            const offset = chunk_idx * chunk_length;
            const batch_len = batch_size * chunk_length;

            const num_cvs = compressChunksParallel(
                ctx.input[offset..][0..batch_len],
                ctx.key,
                chunk_idx,
                ctx.flags,
                &cv_buffer,
            );

            for (0..num_cvs) |i| {
                const cv_bytes = cv_buffer[i * Blake3.digest_length ..][0..Blake3.digest_length];
                ctx.cvs[chunk_idx + i] = loadCvWords(cv_bytes.*);
            }

            chunk_idx += batch_size;
        }
    }
};

const ParentBatchContext = struct {
    input_cvs: [][8]u32,
    output_cvs: [][8]u32,
    start_idx: usize,
    end_idx: usize,
    key: [8]u32,
    flags: Flags,
};

fn processParentBatch(ctx: ParentBatchContext) void {
    for (ctx.start_idx..ctx.end_idx) |i| {
        const output = parentOutputFromCvs(ctx.input_cvs[i * 2], ctx.input_cvs[i * 2 + 1], ctx.key, ctx.flags);
        ctx.output_cvs[i] = output.chainingValue();
    }
}

fn processParentBatchSIMD(ctx: ParentBatchContext) void {
    const num_parents = ctx.end_idx - ctx.start_idx;
    if (num_parents == 0) return;

    // Convert input CVs to bytes for SIMD processing
    var input_bytes: [max_simd_degree * 2 * Blake3.digest_length]u8 = undefined;
    var output_bytes: [max_simd_degree * Blake3.digest_length]u8 = undefined;
    var parents_array: [max_simd_degree][*]const u8 = undefined;

    var processed: usize = 0;
    while (processed < num_parents) {
        const batch_size: usize = @min(num_parents - processed, max_simd_degree);

        // Convert CV pairs to byte blocks for this batch
        for (0..batch_size) |i| {
            const pair_idx = ctx.start_idx + processed + i;
            const left_cv = ctx.input_cvs[pair_idx * 2];
            const right_cv = ctx.input_cvs[pair_idx * 2 + 1];

            // Write left CV || right CV to form 64-byte parent block
            for (0..8) |j| {
                store32(input_bytes[i * 64 + j * 4 ..][0..4], left_cv[j]);
                store32(input_bytes[i * 64 + 32 + j * 4 ..][0..4], right_cv[j]);
            }
            parents_array[i] = input_bytes[i * 64 ..].ptr;
        }

        hashMany(parents_array[0..batch_size], batch_size, 1, ctx.key, 0, false, ctx.flags.with(.{ .parent = true }), .{}, .{}, output_bytes[0 .. batch_size * Blake3.digest_length]);

        for (0..batch_size) |i| {
            const output_idx = ctx.start_idx + processed + i;
            ctx.output_cvs[output_idx] = loadCvWords(output_bytes[i * Blake3.digest_length ..][0..Blake3.digest_length].*);
        }

        processed += batch_size;
    }
}

fn buildMerkleTreeLayerParallel(
    input_cvs: [][8]u32,
    output_cvs: [][8]u32,
    key: [8]u32,
    flags: Flags,
    io: Io,
) void {
    const num_parents = input_cvs.len / 2;

    // Process sequentially with SIMD for smaller tree layers to avoid thread overhead.
    // Tree layers shrink quickly, so only parallelize the first few large layers.
    if (num_parents <= 1024) {
        processParentBatchSIMD(ParentBatchContext{
            .input_cvs = input_cvs,
            .output_cvs = output_cvs,
            .start_idx = 0,
            .end_idx = num_parents,
            .key = key,
            .flags = flags,
        });
        return;
    }

    const num_workers = Thread.getCpuCount() catch 1;
    const parents_per_worker = (num_parents + num_workers - 1) / num_workers;
    var group: Io.Group = .init;

    for (0..num_workers) |worker_id| {
        const start_idx = worker_id * parents_per_worker;
        if (start_idx >= num_parents) break;

        group.async(io, processParentBatchSIMD, .{ParentBatchContext{
            .input_cvs = input_cvs,
            .output_cvs = output_cvs,
            .start_idx = start_idx,
            .end_idx = @min(start_idx + parents_per_worker, num_parents),
            .key = key,
            .flags = flags,
        }});
    }
    group.wait(io);
}

fn parentOutput(parent_block: []const u8, key: [8]u32, flags: Flags) Output {
    var block: [Blake3.block_length]u8 = undefined;
    @memcpy(&block, parent_block[0..Blake3.block_length]);
    return Output{
        .input_cv = key,
        .block = block,
        .block_len = Blake3.block_length,
        .counter = 0,
        .flags = flags.with(.{ .parent = true }),
    };
}

fn parentOutputFromCvs(left_cv: [8]u32, right_cv: [8]u32, key: [8]u32, flags: Flags) Output {
    var block: [Blake3.block_length]u8 align(16) = undefined;
    for (0..8) |i| {
        store32(block[i * 4 ..][0..4], left_cv[i]);
        store32(block[(i + 8) * 4 ..][0..4], right_cv[i]);
    }
    return Output{
        .input_cv = key,
        .block = block,
        .block_len = Blake3.block_length,
        .counter = 0,
        .flags = flags.with(.{ .parent = true }),
    };
}

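// Incremental state for the chunk currently being absorbed: the running
// chaining value plus a one-block buffer of input that has not yet been
// compressed.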
const ChunkState = struct {
    cv: [8]u32 align(16),
    chunk_counter: u64,
    buf: [Blake3.block_length]u8 align(16),
    buf_len: u8,
    blocks_compressed: u8,
    flags: Flags,

    fn init(key: [8]u32, flags: Flags) ChunkState {
        return ChunkState{
            .cv = key,
            .chunk_counter = 0,
            .buf = @splat(0),
            .buf_len = 0,
            .blocks_compressed = 0,
            .flags = flags,
        };
    }

    fn reset(self: *ChunkState, key: [8]u32, chunk_counter: u64) void {
        self.cv = key;
        self.chunk_counter = chunk_counter;
        self.blocks_compressed = 0;
        self.buf = @splat(0);
        self.buf_len = 0;
    }

    fn len(self: *const ChunkState) usize {
        return (Blake3.block_length * @as(usize, self.blocks_compressed)) + @as(usize, self.buf_len);
    }

    fn fillBuf(self: *ChunkState, input: []const u8) usize {
        const take = @min(Blake3.block_length - @as(usize, self.buf_len), input.len);
        @memcpy(self.buf[self.buf_len..][0..take], input[0..take]);
        self.buf_len += @intCast(take);
        return take;
    }

    fn maybeStartFlag(self: *const ChunkState) Flags {
        return if (self.blocks_compressed == 0) .{ .chunk_start = true } else .{};
    }

    fn update(self: *ChunkState, input: []const u8) void {
        var inp = input;

        while (inp.len > 0) {
            if (self.buf_len == Blake3.block_length) {
                compressInPlace(&self.cv, &self.buf, Blake3.block_length, self.chunk_counter, self.flags.with(self.maybeStartFlag()));
                self.blocks_compressed += 1;
                self.buf = @splat(0);
                self.buf_len = 0;
            }

            const take = self.fillBuf(inp);
            inp = inp[take..];
        }
    }

    fn output(self: *const ChunkState) Output {
        const block_flags = self.flags.with(self.maybeStartFlag()).with(.{ .chunk_end = true });
        return Output{
            .input_cv = self.cv,
            .block = self.buf,
            .block_len = self.buf_len,
            .counter = self.chunk_counter,
            .flags = block_flags,
        };
    }
};

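// A captured compression input that can still be finalized two ways: as a
// chaining value for a parent node, or (with the root flag set by
// `rootBytes`) as arbitrary-length XOF output.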
const Output = struct {
    input_cv: [8]u32 align(16),
    block: [Blake3.block_length]u8 align(16),
    block_len: u8,
    counter: u64,
    flags: Flags,

    fn chainingValue(self: *const Output) [8]u32 {
        var cv_words = self.input_cv;
        compressInPlace(&cv_words, &self.block, self.block_len, self.counter, self.flags);
        return cv_words;
    }

    fn rootBytes(self: *const Output, seek: u64, out: []u8) void {
        if (out.len == 0) return;

        var output_block_counter = seek / 64;
        const offset_within_block = @as(usize, @intCast(seek % 64));
        var out_remaining = out;

        if (offset_within_block > 0) {
            var wide_buf: [64]u8 = undefined;
            compressXof(&self.input_cv, &self.block, self.block_len, output_block_counter, self.flags.with(.{ .root = true }), &wide_buf);
            const available_bytes = 64 - offset_within_block;
            const bytes = @min(out_remaining.len, available_bytes);
            @memcpy(out_remaining[0..bytes], wide_buf[offset_within_block..][0..bytes]);
            out_remaining = out_remaining[bytes..];
            output_block_counter += 1;
        }

        while (out_remaining.len >= 64) {
            compressXof(&self.input_cv, &self.block, self.block_len, output_block_counter, self.flags.with(.{ .root = true }), out_remaining[0..64]);
            out_remaining = out_remaining[64..];
            output_block_counter += 1;
        }

        if (out_remaining.len > 0) {
            var wide_buf: [64]u8 = undefined;
            compressXof(&self.input_cv, &self.block, self.block_len, output_block_counter, self.flags.with(.{ .root = true }), &wide_buf);
            @memcpy(out_remaining, wide_buf[0..out_remaining.len]);
        }
    }
};

/// BLAKE3 is a cryptographic hash function that produces a 256-bit digest by default but also supports extendable output.
pub const Blake3 = struct {
    pub const block_length = 64;
    pub const digest_length = 32;
    pub const key_length = 32;

    pub const Options = struct { key: ?[key_length]u8 = null };
    pub const KdfOptions = struct {};

    key: [8]u32,
    chunk: ChunkState,
    cv_stack_len: u8,
    cv_stack: [max_depth + 1][8]u32,

    /// Construct a new `Blake3` for the hash function, with an optional key
    pub fn init(options: Options) Blake3 {
        if (options.key) |key| {
            const key_words = loadKeyWords(key);
            return init_internal(key_words, .{ .keyed_hash = true });
        } else {
            return init_internal(iv, .{});
        }
    }

    /// Construct a new `Blake3` for the key derivation function. The context
    /// string should be hardcoded, globally unique, and application-specific.
    pub fn initKdf(context: []const u8, options: KdfOptions) Blake3 {
        _ = options;
        var context_hasher = init_internal(iv, .{ .derive_key_context = true });
        context_hasher.update(context);
        var context_key: [key_length]u8 = undefined;
        context_hasher.final(&context_key);
        const context_key_words = loadKeyWords(context_key);
        return init_internal(context_key_words, .{ .derive_key_material = true });
    }

    pub fn hash(b: []const u8, out: []u8, options: Options) void {
        var d = Blake3.init(options);
        d.update(b);
        d.final(out);
    }

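    /// Hash `b` into `out`, splitting chunk compression and Merkle tree
    /// construction across threads via `io`. Inputs shorter than
    /// `parallel_threshold`, or runs on a single CPU, fall back to the
    /// sequential `hash`.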
    pub fn hashParallel(b: []const u8, out: []u8, options: Options, allocator: std.mem.Allocator, io: Io) !void {
        if (b.len < parallel_threshold) {
            return hash(b, out, options);
        }

        const key_words = if (options.key) |key| loadKeyWords(key) else iv;
        const flags: Flags = if (options.key != null) .{ .keyed_hash = true } else .{};

        const num_full_chunks = b.len / chunk_length;
        const thread_count = Thread.getCpuCount() catch 1;
        if (thread_count <= 1 or num_full_chunks == 0) {
            return hash(b, out, options);
        }

        const cvs = try allocator.alloc([8]u32, num_full_chunks);
        defer allocator.free(cvs);

        // Process chunks in parallel
        const num_workers = thread_count;
        const chunks_per_worker = (num_full_chunks + num_workers - 1) / num_workers;
        var group: Io.Group = .init;

        for (0..num_workers) |worker_id| {
            const start_chunk = worker_id * chunks_per_worker;
            if (start_chunk >= num_full_chunks) break;

            group.async(io, ChunkBatch.process, .{ChunkBatch{
                .input = b,
                .start_chunk = start_chunk,
                .end_chunk = @min(start_chunk + chunks_per_worker, num_full_chunks),
                .cvs = cvs,
                .key = key_words,
                .flags = flags,
            }});
        }
        group.wait(io);

        // Build Merkle tree in parallel layers using ping-pong buffers
        const max_intermediate_size = (num_full_chunks + 1) / 2;
        const buffer0 = try allocator.alloc([8]u32, max_intermediate_size);
        defer allocator.free(buffer0);
        const buffer1 = try allocator.alloc([8]u32, max_intermediate_size);
        defer allocator.free(buffer1);

        var current_level = cvs;
        var next_level_buf = buffer0;
        var toggle = false;

        while (current_level.len > 8) {
            const num_parents = current_level.len / 2;
            const has_odd = current_level.len % 2 == 1;
            const next_level_size = num_parents + @intFromBool(has_odd);

            buildMerkleTreeLayerParallel(
                current_level[0 .. num_parents * 2],
                next_level_buf[0..num_parents],
                key_words,
                flags,
                io,
            );

            if (has_odd) {
                next_level_buf[num_parents] = current_level[current_level.len - 1];
            }

            current_level = next_level_buf[0..next_level_size];
            next_level_buf = if (toggle) buffer0 else buffer1;
            toggle = !toggle;
        }

        // Finalize remaining small tree sequentially
        var hasher = init_internal(key_words, flags);
        for (current_level, 0..) |cv, i| hasher.pushCv(cv, i);

        hasher.chunk.chunk_counter = num_full_chunks;
        const remaining_bytes = b.len % chunk_length;
        if (remaining_bytes > 0) {
            hasher.chunk.update(b[num_full_chunks * chunk_length ..]);
            hasher.mergeCvStack(hasher.chunk.chunk_counter);
        }

        hasher.final(out);
    }

    fn init_internal(key: [8]u32, flags: Flags) Blake3 {
        return Blake3{
            .key = key,
            .chunk = ChunkState.init(key, flags),
            .cv_stack_len = 0,
            .cv_stack = undefined,
        };
    }

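    // After `total_len` chunks have been absorbed, the CV stack must hold
    // exactly @popCount(total_len) entries, one per completed subtree; merge
    // the top two entries into a parent until only that many remain.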
    fn mergeCvStack(self: *Blake3, total_len: u64) void {
        const post_merge_stack_len = @as(u8, @intCast(@popCount(total_len)));
        while (self.cv_stack_len > post_merge_stack_len) {
            const left_cv = self.cv_stack[self.cv_stack_len - 2];
            const right_cv = self.cv_stack[self.cv_stack_len - 1];
            const output = parentOutputFromCvs(left_cv, right_cv, self.key, self.chunk.flags);
            const cv = output.chainingValue();
            self.cv_stack[self.cv_stack_len - 2] = cv;
            self.cv_stack_len -= 1;
        }
    }

    fn pushCv(self: *Blake3, new_cv: [8]u32, chunk_counter: u64) void {
        self.mergeCvStack(chunk_counter);
        self.cv_stack[self.cv_stack_len] = new_cv;
        self.cv_stack_len += 1;
    }

    /// Add input to the hash state. This can be called any number of times.
    pub fn update(self: *Blake3, input: []const u8) void {
        if (input.len == 0) return;

        var inp = input;

        if (self.chunk.len() > 0) {
            const take = @min(chunk_length - self.chunk.len(), inp.len);
            self.chunk.update(inp[0..take]);
            inp = inp[take..];
            if (inp.len > 0) {
                const output = self.chunk.output();
                const chunk_cv = output.chainingValue();
                self.pushCv(chunk_cv, self.chunk.chunk_counter);
                self.chunk.reset(self.key, self.chunk.chunk_counter + 1);
            } else {
                return;
            }
        }

        while (inp.len > chunk_length) {
            var subtree_len = roundDownToPowerOf2(inp.len);
            const count_so_far = self.chunk.chunk_counter * chunk_length;

            while ((subtree_len - 1) & count_so_far != 0) {
                subtree_len /= 2;
            }

            const subtree_chunks = subtree_len / chunk_length;
            if (subtree_len <= chunk_length) {
                var chunk_state = ChunkState.init(self.key, self.chunk.flags);
                chunk_state.chunk_counter = self.chunk.chunk_counter;
                chunk_state.update(inp[0..@intCast(subtree_len)]);
                const output = chunk_state.output();
                const cv = output.chainingValue();
                self.pushCv(cv, chunk_state.chunk_counter);
            } else {
                var cv_pair: [2 * digest_length]u8 = undefined;
                compressSubtreeToParentNode(inp[0..@intCast(subtree_len)], self.key, self.chunk.chunk_counter, self.chunk.flags, &cv_pair);
                const left_cv = loadCvWords(cv_pair[0..digest_length].*);
                const right_cv = loadCvWords(cv_pair[digest_length..][0..digest_length].*);
                self.pushCv(left_cv, self.chunk.chunk_counter);
                self.pushCv(right_cv, self.chunk.chunk_counter + (subtree_chunks / 2));
            }
            self.chunk.chunk_counter += subtree_chunks;
            inp = inp[@intCast(subtree_len)..];
        }

        if (inp.len > 0) {
            self.chunk.update(inp);
            self.mergeCvStack(self.chunk.chunk_counter);
        }
    }

    /// Finalize the hash and write any number of output bytes.
    pub fn final(self: *const Blake3, out: []u8) void {
        self.finalizeSeek(0, out);
    }

    /// Finalize the hash and write any number of output bytes, starting at a given seek position.
    /// This is an XOF (extendable-output function) extension.
    pub fn finalizeSeek(self: *const Blake3, seek: u64, out: []u8) void {
        if (out.len == 0) return;

        if (self.cv_stack_len == 0) {
            const output = self.chunk.output();
            output.rootBytes(seek, out);
            return;
        }

        var output: Output = undefined;
        var cvs_remaining: usize = undefined;

        if (self.chunk.len() > 0) {
            cvs_remaining = self.cv_stack_len;
            output = self.chunk.output();
        } else {
            cvs_remaining = self.cv_stack_len - 2;
            const left_cv = self.cv_stack[cvs_remaining];
            const right_cv = self.cv_stack[cvs_remaining + 1];
            output = parentOutputFromCvs(left_cv, right_cv, self.key, self.chunk.flags);
        }

        while (cvs_remaining > 0) {
            cvs_remaining -= 1;
            const left_cv = self.cv_stack[cvs_remaining];
            const right_cv = output.chainingValue();
            output = parentOutputFromCvs(left_cv, right_cv, self.key, self.chunk.flags);
        }

        output.rootBytes(seek, out);
    }

    pub fn reset(self: *Blake3) void {
        self.chunk.reset(self.key, 0);
        self.cv_stack_len = 0;
    }
};

// Use named type declarations to work around a crash with anonymous structs (issue #4373).
const ReferenceTest = struct {
    key: *const [Blake3.key_length]u8,
    context_string: []const u8,
    cases: []const ReferenceTestCase,
};

const ReferenceTestCase = struct {
    input_len: usize,
    hash: *const [262]u8,
    keyed_hash: *const [262]u8,
    derive_key: *const [262]u8,
};

// Each test is an input length and three outputs, one for each of the `hash`, `keyed_hash`, and
// `derive_key` modes. The input in each case is filled with a 251-byte-long repeating pattern:
// 0, 1, 2, ..., 249, 250, 0, 1, ... The key used with `keyed_hash` is the 32-byte ASCII string
// given in the `key` field below. For `derive_key`, the test input is used as the input key, and
// the context string is 'BLAKE3 2019-12-27 16:29:52 test vectors context'. (As good practice for
// following the security requirements of `derive_key`, test runners should make that context
// string a hardcoded constant, and we do not provide it in machine-readable form.) Outputs are
// encoded as hexadecimal. Each case is an extended output, and implementations should also check
// that the first 32 bytes match their default-length output.
//
// Source: https://github.com/BLAKE3-team/BLAKE3/blob/92d421dea1a89e2f079f4dbd93b0dab41234b279/test_vectors/test_vectors.json
const reference_test = ReferenceTest{
    .key = "whats the Elvish word for friend",
    .context_string = "BLAKE3 2019-12-27 16:29:52 test vectors context",
    .cases = &[_]ReferenceTestCase{
        .{
            .input_len = 0,
            .hash = "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262e00f03e7b69af26b7faaf09fcd333050338ddfe085b8cc869ca98b206c08243a26f5487789e8f660afe6c99ef9e0c52b92e7393024a80459cf91f476f9ffdbda7001c22e159b402631f277ca96f2defdf1078282314e763699a31c5363165421cce14d",
            .keyed_hash = "92b2b75604ed3c761f9d6f62392c8a9227ad0ea3f09573e783f1498a4ed60d26b18171a2f22a4b94822c701f107153dba24918c4bae4d2945c20ece13387627d3b73cbf97b797d5e59948c7ef788f54372df45e45e4293c7dc18c1d41144a9758be58960856be1eabbe22c2653190de560ca3b2ac4aa692a9210694254c371e851bc8f",
            .derive_key = "2cc39783c223154fea8dfb7c1b1660f2ac2dcbd1c1de8277b0b0dd39b7e50d7d905630c8be290dfcf3e6842f13bddd573c098c3f17361f1f206b8cad9d088aa4a3f746752c6b0ce6a83b0da81d59649257cdf8eb3e9f7d4998e41021fac119deefb896224ac99f860011f73609e6e0e4540f93b273e56547dfd3aa1a035ba6689d89a0",
        },
        .{
            .input_len = 1,
            .hash = "2d3adedff11b61f14c886e35afa036736dcd87a74d27b5c1510225d0f592e213c3a6cb8bf623e20cdb535f8d1a5ffb86342d9c0b64aca3bce1d31f60adfa137b358ad4d79f97b47c3d5e79f179df87a3b9776ef8325f8329886ba42f07fb138bb502f4081cbcec3195c5871e6c23e2cc97d3c69a613eba131e5f1351f3f1da786545e5",
            .keyed_hash = "6d7878dfff2f485635d39013278ae14f1454b8c0a3a2d34bc1ab38228a80c95b6568c0490609413006fbd428eb3fd14e7756d90f73a4725fad147f7bf70fd61c4e0cf7074885e92b0e3f125978b4154986d4fb202a3f331a3fb6cf349a3a70e49990f98fe4289761c8602c4e6ab1138d31d3b62218078b2f3ba9a88e1d08d0dd4cea11",
            .derive_key = "b3e2e340a117a499c6cf2398a19ee0d29cca2bb7404c73063382693bf66cb06c5827b91bf889b6b97c5477f535361caefca0b5d8c4746441c57617111933158950670f9aa8a05d791daae10ac683cbef8faf897c84e6114a59d2173c3f417023a35d6983f2c7dfa57e7fc559ad751dbfb9ffab39c2ef8c4aafebc9ae973a64f0c76551",
        },
        .{
            .input_len = 1023,
            .hash = "10108970eeda3eb932baac1428c7a2163b0e924c9a9e25b35bba72b28f70bd11a182d27a591b05592b15607500e1e8dd56bc6c7fc063715b7a1d737df5bad3339c56778957d870eb9717b57ea3d9fb68d1b55127bba6a906a4a24bbd5acb2d123a37b28f9e9a81bbaae360d58f85e5fc9d75f7c370a0cc09b6522d9c8d822f2f28f485",
            .keyed_hash = "c951ecdf03288d0fcc96ee3413563d8a6d3589547f2c2fb36d9786470f1b9d6e890316d2e6d8b8c25b0a5b2180f94fb1a158ef508c3cde45e2966bd796a696d3e13efd86259d756387d9becf5c8bf1ce2192b87025152907b6d8cc33d17826d8b7b9bc97e38c3c85108ef09f013e01c229c20a83d9e8efac5b37470da28575fd755a10",
            .derive_key = "74a16c1c3d44368a86e1ca6df64be6a2f64cce8f09220787450722d85725dea59c413264404661e9e4d955409dfe4ad3aa487871bcd454ed12abfe2c2b1eb7757588cf6cb18d2eccad49e018c0d0fec323bec82bf1644c6325717d13ea712e6840d3e6e730d35553f59eff5377a9c350bcc1556694b924b858f329c44ee64b884ef00d",
        },
        .{
            .input_len = 1024,
            .hash = "42214739f095a406f3fc83deb889744ac00df831c10daa55189b5d121c855af71cf8107265ecdaf8505b95d8fcec83a98a6a96ea5109d2c179c47a387ffbb404756f6eeae7883b446b70ebb144527c2075ab8ab204c0086bb22b7c93d465efc57f8d917f0b385c6df265e77003b85102967486ed57db5c5ca170ba441427ed9afa684e",
            .keyed_hash = "75c46f6f3d9eb4f55ecaaee480db732e6c2105546f1e675003687c31719c7ba4a78bc838c72852d4f49c864acb7adafe2478e824afe51c8919d06168414c265f298a8094b1ad813a9b8614acabac321f24ce61c5a5346eb519520d38ecc43e89b5000236df0597243e4d2493fd626730e2ba17ac4d8824d09d1a4a8f57b8227778e2de",
            .derive_key = "7356cd7720d5b66b6d0697eb3177d9f8d73a4a5c5e968896eb6a6896843027066c23b601d3ddfb391e90d5c8eccdef4ae2a264bce9e612ba15e2bc9d654af1481b2e75dbabe615974f1070bba84d56853265a34330b4766f8e75edd1f4a1650476c10802f22b64bd3919d246ba20a17558bc51c199efdec67e80a227251808d8ce5bad",
        },
        .{
            .input_len = 1025,
            .hash = "d00278ae47eb27b34faecf67b4fe263f82d5412916c1ffd97c8cb7fb814b8444f4c4a22b4b399155358a994e52bf255de60035742ec71bd08ac275a1b51cc6bfe332b0ef84b409108cda080e6269ed4b3e2c3f7d722aa4cdc98d16deb554e5627be8f955c98e1d5f9565a9194cad0c4285f93700062d9595adb992ae68ff12800ab67a",
            .keyed_hash = "357dc55de0c7e382c900fd6e320acc04146be01db6a8ce7210b7189bd664ea69362396b77fdc0d2634a552970843722066c3c15902ae5097e00ff53f1e116f1cd5352720113a837ab2452cafbde4d54085d9cf5d21ca613071551b25d52e69d6c81123872b6f19cd3bc1333edf0c52b94de23ba772cf82636cff4542540a7738d5b930",
            .derive_key = "effaa245f065fbf82ac186839a249707c3bddf6d3fdda22d1b95a3c970379bcb5d31013a167509e9066273ab6e2123bc835b408b067d88f96addb550d96b6852dad38e320b9d940f86db74d398c770f462118b35d2724efa13da97194491d96dd37c3c09cbef665953f2ee85ec83d88b88d11547a6f911c8217cca46defa2751e7f3ad",
        },
        .{
            .input_len = 2048,
            .hash = "e776b6028c7cd22a4d0ba182a8bf62205d2ef576467e838ed6f2529b85fba24a9a60bf80001410ec9eea6698cd537939fad4749edd484cb541aced55cd9bf54764d063f23f6f1e32e12958ba5cfeb1bf618ad094266d4fc3c968c2088f677454c288c67ba0dba337b9d91c7e1ba586dc9a5bc2d5e90c14f53a8863ac75655461cea8f9",
            .keyed_hash = "879cf1fa2ea0e79126cb1063617a05b6ad9d0b696d0d757cf053439f60a99dd10173b961cd574288194b23ece278c330fbb8585485e74967f31352a8183aa782b2b22f26cdcadb61eed1a5bc144b8198fbb0c13abbf8e3192c145d0a5c21633b0ef86054f42809df823389ee40811a5910dcbd1018af31c3b43aa55201ed4edaac74fe",
            .derive_key = "7b2945cb4fef70885cc5d78a87bf6f6207dd901ff239201351ffac04e1088a23e2c11a1ebffcea4d80447867b61badb1383d842d4e79645d48dd82ccba290769caa7af8eaa1bd78a2a5e6e94fbdab78d9c7b74e894879f6a515257ccf6f95056f4e25390f24f6b35ffbb74b766202569b1d797f2d4bd9d17524c720107f985f4ddc583",
        },
        .{
            .input_len = 2049,
            .hash = "5f4d72f40d7a5f82b15ca2b2e44b1de3c2ef86c426c95c1af0b687952256303096de31d71d74103403822a2e0bc1eb193e7aecc9643a76b7bbc0c9f9c52e8783aae98764ca468962b5c2ec92f0c74eb5448d519713e09413719431c802f948dd5d90425a4ecdadece9eb178d80f26efccae630734dff63340285adec2aed3b51073ad3",
            .keyed_hash = "9f29700902f7c86e514ddc4df1e3049f258b2472b6dd5267f61bf13983b78dd5f9a88abfefdfa1e00b418971f2b39c64ca621e8eb37fceac57fd0c8fc8e117d43b81447be22d5d8186f8f5919ba6bcc6846bd7d50726c06d245672c2ad4f61702c646499ee1173daa061ffe15bf45a631e2946d616a4c345822f1151284712f76b2b0e",
            .derive_key = "2ea477c5515cc3dd606512ee72bb3e0e758cfae7232826f35fb98ca1bcbdf27316d8e9e79081a80b046b60f6a263616f33ca464bd78d79fa18200d06c7fc9bffd808cc4755277a7d5e09da0f29ed150f6537ea9bed946227ff184cc66a72a5f8c1e4bd8b04e81cf40fe6dc4427ad5678311a61f4ffc39d195589bdbc670f63ae70f4b6",
        },
        .{
            .input_len = 3072,
            .hash = "b98cb0ff3623be03326b373de6b9095218513e64f1ee2edd2525c7ad1e5cffd29a3f6b0b978d6608335c09dc94ccf682f9951cdfc501bfe47b9c9189a6fc7b404d120258506341a6d802857322fbd20d3e5dae05b95c88793fa83db1cb08e7d8008d1599b6209d78336e24839724c191b2a52a80448306e0daa84a3fdb566661a37e11",
            .keyed_hash = "044a0e7b172a312dc02a4c9a818c036ffa2776368d7f528268d2e6b5df19177022f302d0529e4174cc507c463671217975e81dab02b8fdeb0d7ccc7568dd22574c783a76be215441b32e91b9a904be8ea81f7a0afd14bad8ee7c8efc305ace5d3dd61b996febe8da4f56ca0919359a7533216e2999fc87ff7d8f176fbecb3d6f34278b",
            .derive_key = "050df97f8c2ead654d9bb3ab8c9178edcd902a32f8495949feadcc1e0480c46b3604131bbd6e3ba573b6dd682fa0a63e5b165d39fc43a625d00207607a2bfeb65ff1d29292152e26b298868e3b87be95d6458f6f2ce6118437b632415abe6ad522874bcd79e4030a5e7bad2efa90a7a7c67e93f0a18fb28369d0a9329ab5c24134ccb0",
        },
        .{
            .input_len = 3073,
            .hash = "7124b49501012f81cc7f11ca069ec9226cecb8a2c850cfe644e327d22d3e1cd39a27ae3b79d68d89da9bf25bc27139ae65a324918a5f9b7828181e52cf373c84f35b639b7fccbb985b6f2fa56aea0c18f531203497b8bbd3a07ceb5926f1cab74d14bd66486d9a91eba99059a98bd1cd25876b2af5a76c3e9eed554ed72ea952b603bf",
            .keyed_hash = "68dede9bef00ba89e43f31a6825f4cf433389fedae75c04ee9f0cf16a427c95a96d6da3fe985054d3478865be9a092250839a697bbda74e279e8a9e69f0025e4cfddd6cfb434b1cd9543aaf97c635d1b451a4386041e4bb100f5e45407cbbc24fa53ea2de3536ccb329e4eb9466ec37093a42cf62b82903c696a93a50b702c80f3c3c5",
            .derive_key = "72613c9ec9ff7e40f8f5c173784c532ad852e827dba2bf85b2ab4b76f7079081576288e552647a9d86481c2cae75c2dd4e7c5195fb9ada1ef50e9c5098c249d743929191441301c69e1f48505a4305ec1778450ee48b8e69dc23a25960fe33070ea549119599760a8a2d28aeca06b8c5e9ba58bc19e11fe57b6ee98aa44b2a8e6b14a5",
        },
        .{
            .input_len = 4096,
            .hash = "015094013f57a5277b59d8475c0501042c0b642e531b0a1c8f58d2163229e9690289e9409ddb1b99768eafe1623da896faf7e1114bebeadc1be30829b6f8af707d85c298f4f0ff4d9438aef948335612ae921e76d411c3a9111df62d27eaf871959ae0062b5492a0feb98ef3ed4af277f5395172dbe5c311918ea0074ce0036454f620",
            .keyed_hash = "befc660aea2f1718884cd8deb9902811d332f4fc4a38cf7c7300d597a081bfc0bbb64a36edb564e01e4b4aaf3b060092a6b838bea44afebd2deb8298fa562b7b597c757b9df4c911c3ca462e2ac89e9a787357aaf74c3b56d5c07bc93ce899568a3eb17d9250c20f6c5f6c1e792ec9a2dcb715398d5a6ec6d5c54f586a00403a1af1de",
            .derive_key = "1e0d7f3db8c414c97c6307cbda6cd27ac3b030949da8e23be1a1a924ad2f25b9d78038f7b198596c6cc4a9ccf93223c08722d684f240ff6569075ed81591fd93f9fff1110b3a75bc67e426012e5588959cc5a4c192173a03c00731cf84544f65a2fb9378989f72e9694a6a394a8a30997c2e67f95a504e631cd2c5f55246024761b245",
        },
        .{
            .input_len = 4097,
            .hash = "9b4052b38f1c5fc8b1f9ff7ac7b27cd242487b3d890d15c96a1c25b8aa0fb99505f91b0b5600a11251652eacfa9497b31cd3c409ce2e45cfe6c0a016967316c426bd26f619eab5d70af9a418b845c608840390f361630bd497b1ab44019316357c61dbe091ce72fc16dc340ac3d6e009e050b3adac4b5b2c92e722cffdc46501531956",
            .keyed_hash = "00df940cd36bb9fa7cbbc3556744e0dbc8191401afe70520ba292ee3ca80abbc606db4976cfdd266ae0abf667d9481831ff12e0caa268e7d3e57260c0824115a54ce595ccc897786d9dcbf495599cfd90157186a46ec800a6763f1c59e36197e9939e900809f7077c102f888caaf864b253bc41eea812656d46742e4ea42769f89b83f",
            .derive_key = "aca51029626b55fda7117b42a7c211f8c6e9ba4fe5b7a8ca922f34299500ead8a897f66a400fed9198fd61dd2d58d382458e64e100128075fc54b860934e8de2e84170734b06e1d212a117100820dbc48292d148afa50567b8b84b1ec336ae10d40c8c975a624996e12de31abbe135d9d159375739c333798a80c64ae895e51e22f3ad",
        },
        .{
            .input_len = 5120,
            .hash = "9cadc15fed8b5d854562b26a9536d9707cadeda9b143978f319ab34230535833acc61c8fdc114a2010ce8038c853e121e1544985133fccdd0a2d507e8e615e611e9a0ba4f47915f49e53d721816a9198e8b30f12d20ec3689989175f1bf7a300eee0d9321fad8da232ece6efb8e9fd81b42ad161f6b9550a069e66b11b40487a5f5059",
            .keyed_hash = "2c493e48e9b9bf31e0553a22b23503c0a3388f035cece68eb438d22fa1943e209b4dc9209cd80ce7c1f7c9a744658e7e288465717ae6e56d5463d4f80cdb2ef56495f6a4f5487f69749af0c34c2cdfa857f3056bf8d807336a14d7b89bf62bef2fb54f9af6a546f818dc1e98b9e07f8a5834da50fa28fb5874af91bf06020d1bf0120e",
            .derive_key = "7a7acac8a02adcf3038d74cdd1d34527de8a0fcc0ee3399d1262397ce5817f6055d0cefd84d9d57fe792d65a278fd20384ac6c30fdb340092f1a74a92ace99c482b28f0fc0ef3b923e56ade20c6dba47e49227166251337d80a037e987ad3a7f728b5ab6dfafd6e2ab1bd583a95d9c895ba9c2422c24ea0f62961f0dca45cad47bfa0d",
        },
        .{
            .input_len = 5121,
            .hash = "628bd2cb2004694adaab7bbd778a25df25c47b9d4155a55f8fbd79f2fe154cff96adaab0613a6146cdaabe498c3a94e529d3fc1da2bd08edf54ed64d40dcd6777647eac51d8277d70219a9694334a68bc8f0f23e20b0ff70ada6f844542dfa32cd4204ca1846ef76d811cdb296f65e260227f477aa7aa008bac878f72257484f2b6c95",
            .keyed_hash = "6ccf1c34753e7a044db80798ecd0782a8f76f33563accaddbfbb2e0ea4b2d0240d07e63f13667a8d1490e5e04f13eb617aea16a8c8a5aaed1ef6fbde1b0515e3c81050b361af6ead126032998290b563e3caddeaebfab592e155f2e161fb7cba939092133f23f9e65245e58ec23457b78a2e8a125588aad6e07d7f11a85b88d375b72d",
            .derive_key = "b07f01e518e702f7ccb44a267e9e112d403a7b3f4883a47ffbed4b48339b3c341a0add0ac032ab5aaea1e4e5b004707ec5681ae0fcbe3796974c0b1cf31a194740c14519273eedaabec832e8a784b6e7cfc2c5952677e6c3f2c3914454082d7eb1ce1766ac7d75a4d3001fc89544dd46b5147382240d689bbbaefc359fb6ae30263165",
        },
        .{
            .input_len = 6144,
            .hash = "3e2e5b74e048f3add6d21faab3f83aa44d3b2278afb83b80b3c35164ebeca2054d742022da6fdda444ebc384b04a54c3ac5839b49da7d39f6d8a9db03deab32aade156c1c0311e9b3435cde0ddba0dce7b26a376cad121294b689193508dd63151603c6ddb866ad16c2ee41585d1633a2cea093bea714f4c5d6b903522045b20395c83",
1310            .keyed_hash = "3d6b6d21281d0ade5b2b016ae4034c5dec10ca7e475f90f76eac7138e9bc8f1dc35754060091dc5caf3efabe0603c60f45e415bb3407db67e6beb3d11cf8e4f7907561f05dace0c15807f4b5f389c841eb114d81a82c02a00b57206b1d11fa6e803486b048a5ce87105a686dee041207e095323dfe172df73deb8c9532066d88f9da7e",
1311            .derive_key = "2a95beae63ddce523762355cf4b9c1d8f131465780a391286a5d01abb5683a1597099e3c6488aab6c48f3c15dbe1942d21dbcdc12115d19a8b8465fb54e9053323a9178e4275647f1a9927f6439e52b7031a0b465c861a3fc531527f7758b2b888cf2f20582e9e2c593709c0a44f9c6e0f8b963994882ea4168827823eef1f64169fef",
1312        },
1313        .{
1314            .input_len = 6145,
1315            .hash = "f1323a8631446cc50536a9f705ee5cb619424d46887f3c376c695b70e0f0507f18a2cfdd73c6e39dd75ce7c1c6e3ef238fd54465f053b25d21044ccb2093beb015015532b108313b5829c3621ce324b8e14229091b7c93f32db2e4e63126a377d2a63a3597997d4f1cba59309cb4af240ba70cebff9a23d5e3ff0cdae2cfd54e070022",
1316            .keyed_hash = "9ac301e9e39e45e3250a7e3b3df701aa0fb6889fbd80eeecf28dbc6300fbc539f3c184ca2f59780e27a576c1d1fb9772e99fd17881d02ac7dfd39675aca918453283ed8c3169085ef4a466b91c1649cc341dfdee60e32231fc34c9c4e0b9a2ba87ca8f372589c744c15fd6f985eec15e98136f25beeb4b13c4e43dc84abcc79cd4646c",
1317            .derive_key = "379bcc61d0051dd489f686c13de00d5b14c505245103dc040d9e4dd1facab8e5114493d029bdbd295aaa744a59e31f35c7f52dba9c3642f773dd0b4262a9980a2aef811697e1305d37ba9d8b6d850ef07fe41108993180cf779aeece363704c76483458603bbeeb693cffbbe5588d1f3535dcad888893e53d977424bb707201569a8d2",
1318        },
1319        .{
1320            .input_len = 7168,
1321            .hash = "61da957ec2499a95d6b8023e2b0e604ec7f6b50e80a9678b89d2628e99ada77a5707c321c83361793b9af62a40f43b523df1c8633cecb4cd14d00bdc79c78fca5165b863893f6d38b02ff7236c5a9a8ad2dba87d24c547cab046c29fc5bc1ed142e1de4763613bb162a5a538e6ef05ed05199d751f9eb58d332791b8d73fb74e4fce95",
1322            .keyed_hash = "b42835e40e9d4a7f42ad8cc04f85a963a76e18198377ed84adddeaecacc6f3fca2f01d5277d69bb681c70fa8d36094f73ec06e452c80d2ff2257ed82e7ba348400989a65ee8daa7094ae0933e3d2210ac6395c4af24f91c2b590ef87d7788d7066ea3eaebca4c08a4f14b9a27644f99084c3543711b64a070b94f2c9d1d8a90d035d52",
1323            .derive_key = "11c37a112765370c94a51415d0d651190c288566e295d505defdad895dae223730d5a5175a38841693020669c7638f40b9bc1f9f39cf98bda7a5b54ae24218a800a2116b34665aa95d846d97ea988bfcb53dd9c055d588fa21ba78996776ea6c40bc428b53c62b5f3ccf200f647a5aae8067f0ea1976391fcc72af1945100e2a6dcb88",
1324        },
1325        .{
1326            .input_len = 7169,
1327            .hash = "a003fc7a51754a9b3c7fae0367ab3d782dccf28855a03d435f8cfe74605e781798a8b20534be1ca9eb2ae2df3fae2ea60e48c6fb0b850b1385b5de0fe460dbe9d9f9b0d8db4435da75c601156df9d047f4ede008732eb17adc05d96180f8a73548522840779e6062d643b79478a6e8dbce68927f36ebf676ffa7d72d5f68f050b119c8",
1328            .keyed_hash = "ed9b1a922c046fdb3d423ae34e143b05ca1bf28b710432857bf738bcedbfa5113c9e28d72fcbfc020814ce3f5d4fc867f01c8f5b6caf305b3ea8a8ba2da3ab69fabcb438f19ff11f5378ad4484d75c478de425fb8e6ee809b54eec9bdb184315dc856617c09f5340451bf42fd3270a7b0b6566169f242e533777604c118a6358250f54",
1329            .derive_key = "554b0a5efea9ef183f2f9b931b7497995d9eb26f5c5c6dad2b97d62fc5ac31d99b20652c016d88ba2a611bbd761668d5eda3e568e940faae24b0d9991c3bd25a65f770b89fdcadabcb3d1a9c1cb63e69721cacf1ae69fefdcef1e3ef41bc5312ccc17222199e47a26552c6adc460cf47a72319cb5039369d0060eaea59d6c65130f1dd",
1330        },
1331        .{
1332            .input_len = 8192,
1333            .hash = "aae792484c8efe4f19e2ca7d371d8c467ffb10748d8a5a1ae579948f718a2a635fe51a27db045a567c1ad51be5aa34c01c6651c4d9b5b5ac5d0fd58cf18dd61a47778566b797a8c67df7b1d60b97b19288d2d877bb2df417ace009dcb0241ca1257d62712b6a4043b4ff33f690d849da91ea3bf711ed583cb7b7a7da2839ba71309bbf",
1334            .keyed_hash = "dc9637c8845a770b4cbf76b8daec0eebf7dc2eac11498517f08d44c8fc00d58a4834464159dcbc12a0ba0c6d6eb41bac0ed6585cabfe0aca36a375e6c5480c22afdc40785c170f5a6b8a1107dbee282318d00d915ac9ed1143ad40765ec120042ee121cd2baa36250c618adaf9e27260fda2f94dea8fb6f08c04f8f10c78292aa46102",
1335            .derive_key = "ad01d7ae4ad059b0d33baa3c01319dcf8088094d0359e5fd45d6aeaa8b2d0c3d4c9e58958553513b67f84f8eac653aeeb02ae1d5672dcecf91cd9985a0e67f4501910ecba25555395427ccc7241d70dc21c190e2aadee875e5aae6bf1912837e53411dabf7a56cbf8e4fb780432b0d7fe6cec45024a0788cf5874616407757e9e6bef7",
1336        },
1337        .{
1338            .input_len = 8193,
1339            .hash = "bab6c09cb8ce8cf459261398d2e7aef35700bf488116ceb94a36d0f5f1b7bc3bb2282aa69be089359ea1154b9a9286c4a56af4de975a9aa4a5c497654914d279bea60bb6d2cf7225a2fa0ff5ef56bbe4b149f3ed15860f78b4e2ad04e158e375c1e0c0b551cd7dfc82f1b155c11b6b3ed51ec9edb30d133653bb5709d1dbd55f4e1ff6",
1340            .keyed_hash = "954a2a75420c8d6547e3ba5b98d963e6fa6491addc8c023189cc519821b4a1f5f03228648fd983aef045c2fa8290934b0866b615f585149587dda2299039965328835a2b18f1d63b7e300fc76ff260b571839fe44876a4eae66cbac8c67694411ed7e09df51068a22c6e67d6d3dd2cca8ff12e3275384006c80f4db68023f24eebba57",
1341            .derive_key = "af1e0346e389b17c23200270a64aa4e1ead98c61695d917de7d5b00491c9b0f12f20a01d6d622edf3de026a4db4e4526225debb93c1237934d71c7340bb5916158cbdafe9ac3225476b6ab57a12357db3abbad7a26c6e66290e44034fb08a20a8d0ec264f309994d2810c49cfba6989d7abb095897459f5425adb48aba07c5fb3c83c0",
1342        },
1343        .{
1344            .input_len = 16384,
1345            .hash = "f875d6646de28985646f34ee13be9a576fd515f76b5b0a26bb324735041ddde49d764c270176e53e97bdffa58d549073f2c660be0e81293767ed4e4929f9ad34bbb39a529334c57c4a381ffd2a6d4bfdbf1482651b172aa883cc13408fa67758a3e47503f93f87720a3177325f7823251b85275f64636a8f1d599c2e49722f42e93893",
1346            .keyed_hash = "9e9fc4eb7cf081ea7c47d1807790ed211bfec56aa25bb7037784c13c4b707b0df9e601b101e4cf63a404dfe50f2e1865bb12edc8fca166579ce0c70dba5a5c0fc960ad6f3772183416a00bd29d4c6e651ea7620bb100c9449858bf14e1ddc9ecd35725581ca5b9160de04060045993d972571c3e8f71e9d0496bfa744656861b169d65",
1347            .derive_key = "160e18b5878cd0df1c3af85eb25a0db5344d43a6fbd7a8ef4ed98d0714c3f7e160dc0b1f09caa35f2f417b9ef309dfe5ebd67f4c9507995a531374d099cf8ae317542e885ec6f589378864d3ea98716b3bbb65ef4ab5e0ab5bb298a501f19a41ec19af84a5e6b428ecd813b1a47ed91c9657c3fba11c406bc316768b58f6802c9e9b57",
1348        },
1349        .{
1350            .input_len = 31744,
1351            .hash = "62b6960e1a44bcc1eb1a611a8d6235b6b4b78f32e7abc4fb4c6cdcce94895c47860cc51f2b0c28a7b77304bd55fe73af663c02d3f52ea053ba43431ca5bab7bfea2f5e9d7121770d88f70ae9649ea713087d1914f7f312147e247f87eb2d4ffef0ac978bf7b6579d57d533355aa20b8b77b13fd09748728a5cc327a8ec470f4013226f",
1352            .keyed_hash = "efa53b389ab67c593dba624d898d0f7353ab99e4ac9d42302ee64cbf9939a4193a7258db2d9cd32a7a3ecfce46144114b15c2fcb68a618a976bd74515d47be08b628be420b5e830fade7c080e351a076fbc38641ad80c736c8a18fe3c66ce12f95c61c2462a9770d60d0f77115bbcd3782b593016a4e728d4c06cee4505cb0c08a42ec",
1353            .derive_key = "39772aef80e0ebe60596361e45b061e8f417429d529171b6764468c22928e28e9759adeb797a3fbf771b1bcea30150a020e317982bf0d6e7d14dd9f064bc11025c25f31e81bd78a921db0174f03dd481d30e93fd8e90f8b2fee209f849f2d2a52f31719a490fb0ba7aea1e09814ee912eba111a9fde9d5c274185f7bae8ba85d300a2b",
1354        },
1355        .{
1356            .input_len = 102400,
1357            .hash = "bc3e3d41a1146b069abffad3c0d44860cf664390afce4d9661f7902e7943e085e01c59dab908c04c3342b816941a26d69c2605ebee5ec5291cc55e15b76146e6745f0601156c3596cb75065a9c57f35585a52e1ac70f69131c23d611ce11ee4ab1ec2c009012d236648e77be9295dd0426f29b764d65de58eb7d01dd42248204f45f8e",
1358            .keyed_hash = "1c35d1a5811083fd7119f5d5d1ba027b4d01c0c6c49fb6ff2cf75393ea5db4a7f9dbdd3e1d81dcbca3ba241bb18760f207710b751846faaeb9dff8262710999a59b2aa1aca298a032d94eacfadf1aa192418eb54808db23b56e34213266aa08499a16b354f018fc4967d05f8b9d2ad87a7278337be9693fc638a3bfdbe314574ee6fc4",
1359            .derive_key = "4652cff7a3f385a6103b5c260fc1593e13c778dbe608efb092fe7ee69df6e9c6d83a3e041bc3a48df2879f4a0a3ed40e7c961c73eff740f3117a0504c2dff4786d44fb17f1549eb0ba585e40ec29bf7732f0b7e286ff8acddc4cb1e23b87ff5d824a986458dcc6a04ac83969b80637562953df51ed1a7e90a7926924d2763778be8560",
1360        },
1361    },
1362};
1363
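/// Runs one reference test case: feeds `input_len` bytes of the standard
/// repeating 251-byte pattern (0, 1, ..., 250, 0, 1, ...) into `hasher`,
/// reads a 131-byte extended output (262 hex characters, longer than the
/// 32-byte digest, so the extended-output path is exercised as well), and
/// compares it against `expected_hex`. The hasher is restored to its initial
/// state afterwards so it can be reused for the next case.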
fn testBlake3(hasher: *Blake3, input_len: usize, expected_hex: [262]u8) !void {
    // Save the initial state so the hasher can be reused for the next case.
    const initial_state = hasher.*;

    // Set up the repeating input pattern: bytes 0, 1, ..., 250.
    var input_pattern: [251]u8 = undefined;
    for (&input_pattern, 0..) |*e, i| e.* = @as(u8, @truncate(i));

    // Feed input_len bytes of the repeating pattern to the hasher.
    var input_counter = input_len;
    while (input_counter > 0) {
        const update_len = @min(input_counter, input_pattern.len);
        hasher.update(input_pattern[0..update_len]);
        input_counter -= update_len;
    }

    // Read the final (extended) hash output.
    var actual_bytes: [expected_hex.len / 2]u8 = undefined;
    hasher.final(actual_bytes[0..]);

    // Compare against the expected value.
    var expected_bytes: [expected_hex.len / 2]u8 = undefined;
    _ = fmt.hexToBytes(expected_bytes[0..], expected_hex[0..]) catch unreachable;
    try std.testing.expectEqual(expected_bytes, actual_bytes);

    // Restore the initial state.
    hasher.* = initial_state;
}
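
// Each reference vector is verified in all three modes: the default hash,
// the keyed hash (using the reference key), and key derivation (using the
// reference context string). One long-lived hasher per mode is reused across
// cases via the save/restore in testBlake3.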
test "BLAKE3 reference test cases" {
    var hash_state = Blake3.init(.{});
    const hash = &hash_state;
    var keyed_hash_state = Blake3.init(.{ .key = reference_test.key.* });
    const keyed_hash = &keyed_hash_state;
    var derive_key_state = Blake3.initKdf(reference_test.context_string, .{});
    const derive_key = &derive_key_state;

    for (reference_test.cases) |t| {
        try testBlake3(hash, t.input_len, t.hash.*);
        try testBlake3(keyed_hash, t.input_len, t.keyed_hash.*);
        try testBlake3(derive_key, t.input_len, t.derive_key.*);
    }
}
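
// A small additional sanity check (not from the reference vectors; the
// 3000-byte size and the 1000-byte split point are arbitrary choices): the
// streaming update/final path must produce the same digest as the one-shot
// Blake3.hash, even when the input is split at a non-chunk-aligned boundary.
test "BLAKE3 streaming matches one-shot" {
    var input: [3000]u8 = undefined;
    for (&input, 0..) |*byte, i| byte.* = @truncate(i);

    // One-shot digest over the whole input.
    var expected: [32]u8 = undefined;
    Blake3.hash(&input, &expected, .{});

    // The same input fed incrementally across an unaligned split.
    var hasher = Blake3.init(.{});
    hasher.update(input[0..1000]);
    hasher.update(input[1000..]);
    var actual: [32]u8 = undefined;
    hasher.final(actual[0..]);

    try std.testing.expectEqualSlices(u8, &expected, &actual);
}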

test "BLAKE3 parallel vs sequential" {
    const allocator = std.testing.allocator;
    const io = std.testing.io;

    // Test various sizes, including some above the 3 MiB parallelization
    // threshold.
    const test_sizes = [_]usize{
        0, // Empty
        64, // One block
        1024, // One chunk
        1024 * 10, // Multiple chunks
        1024 * 100, // 100 KiB
        1024 * 1000, // ~1 MiB
        1024 * 5000, // ~5 MiB (above threshold)
        1024 * 10000, // ~10 MiB (above threshold)
    };

    for (test_sizes) |size| {
        // Allocate and fill test data with a repeating byte pattern.
        const input = try allocator.alloc(u8, size);
        defer allocator.free(input);
        for (input, 0..) |*byte, i| {
            byte.* = @truncate(i);
        }

        // The parallel one-shot must match the sequential one-shot.
        var expected: [32]u8 = undefined;
        Blake3.hash(input, &expected, .{});

        var actual: [32]u8 = undefined;
        try Blake3.hashParallel(input, &actual, .{}, allocator, io);

        try std.testing.expectEqualSlices(u8, &expected, &actual);

        // Same check in keyed mode.
        const key: [32]u8 = @splat(0x42);
        var expected_keyed: [32]u8 = undefined;
        Blake3.hash(input, &expected_keyed, .{ .key = key });

        var actual_keyed: [32]u8 = undefined;
        try Blake3.hashParallel(input, &actual_keyed, .{ .key = key }, allocator, io);

        try std.testing.expectEqualSlices(u8, &expected_keyed, &actual_keyed);
    }
}
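
// A focused boundary check, sketched here as a complement to the size table
// above (the byte fill and the exact sizes are arbitrary choices): inputs
// straddling parallel_threshold exercise the switch between the sequential
// and parallel code paths inside hashParallel.
test "BLAKE3 parallel threshold boundary" {
    const allocator = std.testing.allocator;
    const io = std.testing.io;

    const sizes = [_]usize{ parallel_threshold - 1, parallel_threshold, parallel_threshold + 1 };
    for (sizes) |size| {
        const input = try allocator.alloc(u8, size);
        defer allocator.free(input);
        @memset(input, 0xA5);

        var expected: [32]u8 = undefined;
        Blake3.hash(input, &expected, .{});

        var actual: [32]u8 = undefined;
        try Blake3.hashParallel(input, &actual, .{}, allocator, io);

        try std.testing.expectEqualSlices(u8, &expected, &actual);
    }
}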