const std = @import("std");
const builtin = @import("builtin");
const crypto = std.crypto;
const Allocator = std.mem.Allocator;
const Io = std.Io;
const Thread = std.Thread;

const TurboSHAKE128State = crypto.hash.sha3.TurboShake128(0x06);
const TurboSHAKE256State = crypto.hash.sha3.TurboShake256(0x06);

const chunk_size: usize = 8192; // Chunk size for tree hashing (8 KiB)
const cache_line_size = std.atomic.cache_line;

// Optimal SIMD vector length for u64 on this target platform
const optimal_vector_len = std.simd.suggestVectorLength(u64) orelse 1;

// Number of bytes processed per SIMD batch in multi-threaded mode
const bytes_per_batch = 256 * 1024;

// Multi-threading threshold: inputs larger than this will use parallel processing.
// Benchmarked optimal value for ReleaseFast mode.
const large_file_threshold: usize = 2 * 1024 * 1024; // 2 MiB

// Round constants for Keccak-p[1600,12] (the last 12 of Keccak-f[1600]'s 24)
const RC = [12]u64{
    0x000000008000808B,
    0x800000000000008B,
    0x8000000000008089,
    0x8000000000008003,
    0x8000000000008002,
    0x8000000000000080,
    0x000000000000800A,
    0x800000008000000A,
    0x8000000080008081,
    0x8000000000008080,
    0x0000000080000001,
    0x8000000080008008,
};

/// Generic KangarooTwelve variant builder.
/// Creates a variant type with specific cryptographic parameters.
fn KangarooVariant(
    comptime security_level_bits: comptime_int,
    comptime rate_bytes: usize,
    comptime cv_size_bytes: usize,
    comptime StateTypeParam: type,
    comptime sep_x: usize,
    comptime sep_y: usize,
    comptime pad_x: usize,
    comptime pad_y: usize,
    comptime toBufferFn: fn (*const MultiSliceView, u8, []u8) void,
    comptime allocFn: fn (Allocator, *const MultiSliceView, u8, usize) anyerror![]u8,
) type {
    return struct {
        const security_level = security_level_bits;
        const rate = rate_bytes;
        const rate_in_lanes = rate_bytes / 8;
        const cv_size = cv_size_bytes;
        const StateType = StateTypeParam;
        const separation_byte_pos = .{ .x = sep_x, .y = sep_y };
        const padding_pos = .{ .x = pad_x, .y = pad_y };

        inline fn turboShakeToBuffer(view: *const MultiSliceView, separation_byte: u8, output: []u8) void {
            toBufferFn(view, separation_byte, output);
        }

        inline fn turboShakeMultiSliceAlloc(
            allocator: Allocator,
            view: *const MultiSliceView,
            separation_byte: u8,
            output_len: usize,
        ) ![]u8 {
            return allocFn(allocator, view, separation_byte, output_len);
        }
    };
}

/// KangarooTwelve with 128-bit security parameters
const KT128Variant = KangarooVariant(
    128, // Security level in bits
    168, // TurboSHAKE128 rate in bytes
    32, // Chaining value size in bytes
    TurboSHAKE128State,
    1, // separation_byte_pos.x (lane 16: 8192 mod 168 = 128 bytes into the 168-byte rate)
    3, // separation_byte_pos.y
    0, // padding_pos.x (lane 20: last lane of 168-byte rate)
    4, // padding_pos.y
    turboShake128MultiSliceToBuffer,
    turboShake128MultiSlice,
);

/// KangarooTwelve with 256-bit security parameters
const KT256Variant = KangarooVariant(
    256, // Security level in bits
    136, // TurboSHAKE256 rate in bytes
    64, // Chaining value size in bytes
    TurboSHAKE256State,
    4, // separation_byte_pos.x (lane 4: 8192 mod 136 = 32 bytes into the 136-byte rate)
    0, // separation_byte_pos.y
    1, // padding_pos.x (lane 16: last lane of 136-byte rate)
    3, // padding_pos.y
    turboShake256MultiSliceToBuffer,
    turboShake256MultiSlice,
);

/// Rotate left for u64 vector
inline fn rol64Vec(comptime N: usize, v: @Vector(N, u64), comptime n: u6) @Vector(N, u64) {
    if (n == 0) return v;
    const left: @Vector(N, u64) = @splat(n);
    const right_shift: u64 = 64 - @as(u64, n);
    const right: @Vector(N, u64) = @splat(right_shift);
    return (v << left) | (v >> right);
}

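// Sanity sketch: the vector rotate above should agree lane-wise with
// std.math.rotl on scalars.
test "rol64Vec matches scalar rotl per lane" {
    const v: @Vector(2, u64) = .{ 0x0123456789ABCDEF, 0x8000000000000001 };
    const r = rol64Vec(2, v, 1);
    try std.testing.expectEqual(std.math.rotl(u64, v[0], 1), r[0]);
    try std.testing.expectEqual(std.math.rotl(u64, v[1], 1), r[1]);
}
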
/// Load a 64-bit little-endian value
inline fn load64(bytes: []const u8) u64 {
    return std.mem.readInt(u64, bytes[0..8], .little);
}

/// Store a 64-bit little-endian value
inline fn store64(value: u64, bytes: []u8) void {
    std.mem.writeInt(u64, bytes[0..8], value, .little);
}

/// Right-encode result type (max 9 bytes for 64-bit usize)
const RightEncoded = struct {
    bytes: [9]u8,
    len: u8,

    fn slice(self: *const RightEncoded) []const u8 {
        return self.bytes[0..self.len];
    }
};

/// Right-encode: encodes a number as bytes with length suffix (no allocation)
fn rightEncode(x: usize) RightEncoded {
    var result: RightEncoded = undefined;

    if (x == 0) {
        result.bytes[0] = 0;
        result.len = 1;
        return result;
    }

    var temp: [9]u8 = undefined;
    var len: usize = 0;
    var val = x;

    while (val > 0) : (val /= 256) {
        temp[len] = @intCast(val % 256);
        len += 1;
    }

    // Reverse bytes (MSB first)
    for (0..len) |i| {
        result.bytes[i] = temp[len - 1 - i];
    }
    result.bytes[len] = @intCast(len);
    result.len = @intCast(len + 1);

    return result;
}

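// Worked example of the encoding above: the value's base-256 digits are written
// MSB first, followed by a byte holding the digit count; zero has no digits, so
// it encodes as the single count byte 0x00, and 256 encodes as {0x01, 0x00, 0x02}.
test "rightEncode appends the digit count" {
    try std.testing.expectEqualSlices(u8, &[_]u8{0x00}, rightEncode(0).slice());
    try std.testing.expectEqualSlices(u8, &[_]u8{ 0x01, 0x00, 0x02 }, rightEncode(256).slice());
}
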
/// Virtual contiguous view over multiple slices (zero-copy)
const MultiSliceView = struct {
    slices: [3][]const u8,
    offsets: [4]usize,

    fn init(s1: []const u8, s2: []const u8, s3: []const u8) MultiSliceView {
        return .{
            .slices = .{ s1, s2, s3 },
            .offsets = .{
                0,
                s1.len,
                s1.len + s2.len,
                s1.len + s2.len + s3.len,
            },
        };
    }

    fn totalLen(self: *const MultiSliceView) usize {
        return self.offsets[3];
    }

    /// Get byte at position (zero-copy)
    fn getByte(self: *const MultiSliceView, pos: usize) u8 {
        for (0..3) |i| {
            if (pos >= self.offsets[i] and pos < self.offsets[i + 1]) {
                return self.slices[i][pos - self.offsets[i]];
            }
        }
        unreachable;
    }

    /// Try to get a contiguous slice [start..end) - returns null if it spans boundaries
    fn tryGetSlice(self: *const MultiSliceView, start: usize, end: usize) ?[]const u8 {
        for (0..3) |i| {
            if (start >= self.offsets[i] and end <= self.offsets[i + 1]) {
                const local_start = start - self.offsets[i];
                const local_end = end - self.offsets[i];
                return self.slices[i][local_start..local_end];
            }
        }
        return null;
    }

    /// Copy range [start..end) to buffer (used when the range spans boundaries)
    fn copyRange(self: *const MultiSliceView, start: usize, end: usize, buffer: []u8) void {
        var pos: usize = 0;
        for (start..end) |i| {
            buffer[pos] = self.getByte(i);
            pos += 1;
        }
    }
};

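// Illustrative check of the view semantics above: bytes are addressed as if the
// three slices were concatenated, and tryGetSlice only succeeds when the
// requested range stays inside one backing slice.
test "MultiSliceView addresses three slices as one buffer" {
    const view = MultiSliceView.init("ab", "cd", "e");
    try std.testing.expectEqual(@as(usize, 5), view.totalLen());
    try std.testing.expectEqual(@as(u8, 'c'), view.getByte(2));
    try std.testing.expect(view.tryGetSlice(1, 3) == null); // spans slices[0] and slices[1]

    var buf: [2]u8 = undefined;
    view.copyRange(1, 3, &buf);
    try std.testing.expectEqualSlices(u8, "bc", &buf);
}
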
/// Apply Keccak-p[1600,12] to N states using SIMD
fn keccakP1600timesN(comptime N: usize, states: *[5][5]@Vector(N, u64)) void {
    @setEvalBranchQuota(10000);

    // Pre-computed rotation offsets for rho-pi step
    const rho_offsets = comptime blk: {
        var offsets: [24]u6 = undefined;
        var px: usize = 1;
        var py: usize = 0;
        for (0..24) |t| {
            const rot_amount = ((t + 1) * (t + 2) / 2) % 64;
            offsets[t] = @intCast(rot_amount);
            const temp_x = py;
            py = (2 * px + 3 * py) % 5;
            px = temp_x;
        }
        break :blk offsets;
    };

    var round: usize = 0;
    while (round < 12) : (round += 2) {
        inline for (0..2) |i| {
            // θ (theta)
            var C: [5]@Vector(N, u64) = undefined;
            inline for (0..5) |x| {
                C[x] = states[x][0] ^ states[x][1] ^ states[x][2] ^ states[x][3] ^ states[x][4];
            }

            var D: [5]@Vector(N, u64) = undefined;
            inline for (0..5) |x| {
                D[x] = C[(x + 4) % 5] ^ rol64Vec(N, C[(x + 1) % 5], 1);
            }

            // Apply D to all lanes
            inline for (0..5) |x| {
                states[x][0] ^= D[x];
                states[x][1] ^= D[x];
                states[x][2] ^= D[x];
                states[x][3] ^= D[x];
                states[x][4] ^= D[x];
            }

            // ρ (rho) and π (pi) - optimized with pre-computed offsets
            var current = states[1][0];
            var px: usize = 1;
            var py: usize = 0;
            inline for (rho_offsets) |rot| {
                const next_y = (2 * px + 3 * py) % 5;
                const next = states[py][next_y];
                states[py][next_y] = rol64Vec(N, current, rot);
                current = next;
                px = py;
                py = next_y;
            }

            // χ (chi) - optimized with better register usage
            inline for (0..5) |y| {
                const t0 = states[0][y];
                const t1 = states[1][y];
                const t2 = states[2][y];
                const t3 = states[3][y];
                const t4 = states[4][y];

                states[0][y] = t0 ^ (~t1 & t2);
                states[1][y] = t1 ^ (~t2 & t3);
                states[2][y] = t2 ^ (~t3 & t4);
                states[3][y] = t3 ^ (~t4 & t0);
                states[4][y] = t4 ^ (~t0 & t1);
            }

            // ι (iota)
            const rc_splat: @Vector(N, u64) = @splat(RC[round + i]);
            states[0][0] ^= rc_splat;
        }
    }
}

/// Add lanes from data to N states in parallel with stride using SIMD
fn addLanesAll(
    comptime N: usize,
    states: *[5][5]@Vector(N, u64),
    data: []const u8,
    lane_count: usize,
    lane_offset: usize,
) void {
    // Process lanes (at most 25 lanes in Keccak state)
    inline for (0..25) |xy| {
        if (xy < lane_count) {
            const x = xy % 5;
            const y = xy / 5;

            var loaded_data: @Vector(N, u64) = undefined;
            inline for (0..N) |i| {
                loaded_data[i] = load64(data[8 * (i * lane_offset + xy) ..]);
            }
            states[x][y] ^= loaded_data;
        }
    }
}

/// Apply Keccak-p[1600,12] to a single state (byte representation)
fn keccakP(state: *[200]u8) void {
    @setEvalBranchQuota(10000);
    var lanes: [5][5]u64 = undefined;

    // Load state into lanes
    inline for (0..5) |x| {
        inline for (0..5) |y| {
            lanes[x][y] = load64(state[8 * (x + 5 * y) ..]);
        }
    }

    // Apply 12 rounds
    var round: usize = 0;
    while (round < 12) : (round += 2) {
        inline for (0..2) |i| {
            // θ
            var C: [5]u64 = undefined;
            inline for (0..5) |x| {
                C[x] = lanes[x][0] ^ lanes[x][1] ^ lanes[x][2] ^ lanes[x][3] ^ lanes[x][4];
            }
            var D: [5]u64 = undefined;
            inline for (0..5) |x| {
                D[x] = C[(x + 4) % 5] ^ std.math.rotl(u64, C[(x + 1) % 5], 1);
            }
            inline for (0..5) |x| {
                inline for (0..5) |y| {
                    lanes[x][y] ^= D[x];
                }
            }

            // ρ and π
            var current = lanes[1][0];
            var px: usize = 1;
            var py: usize = 0;
            inline for (0..24) |t| {
                const temp = lanes[py][(2 * px + 3 * py) % 5];
                const rot_amount = ((t + 1) * (t + 2) / 2) % 64;
                lanes[py][(2 * px + 3 * py) % 5] = std.math.rotl(u64, current, @as(u6, @intCast(rot_amount)));
                current = temp;
                const temp_x = py;
                py = (2 * px + 3 * py) % 5;
                px = temp_x;
            }

            // χ
            inline for (0..5) |y| {
                const T = [5]u64{ lanes[0][y], lanes[1][y], lanes[2][y], lanes[3][y], lanes[4][y] };
                inline for (0..5) |x| {
                    lanes[x][y] = T[x] ^ (~T[(x + 1) % 5] & T[(x + 2) % 5]);
                }
            }

            // ι
            lanes[0][0] ^= RC[round + i];
        }
    }

    // Store lanes back to state
    inline for (0..5) |x| {
        inline for (0..5) |y| {
            store64(lanes[x][y], state[8 * (x + 5 * y) ..]);
        }
    }
}

/// Apply Keccak-p[1600,12] to a single state (u64 lane representation)
fn keccakPLanes(lanes: *[25]u64) void {
    @setEvalBranchQuota(10000);

    // Apply 12 rounds
    inline for (RC) |rc| {
        // θ
        var C: [5]u64 = undefined;
        inline for (0..5) |x| {
            C[x] = lanes[x] ^ lanes[x + 5] ^ lanes[x + 10] ^ lanes[x + 15] ^ lanes[x + 20];
        }
        var D: [5]u64 = undefined;
        inline for (0..5) |x| {
            D[x] = C[(x + 4) % 5] ^ std.math.rotl(u64, C[(x + 1) % 5], 1);
        }
        inline for (0..5) |x| {
            inline for (0..5) |y| {
                lanes[x + 5 * y] ^= D[x];
            }
        }

        // ρ and π
        var current = lanes[1];
        var px: usize = 1;
        var py: usize = 0;
        inline for (0..24) |t| {
            const next_y = (2 * px + 3 * py) % 5;
            const next_idx = py + 5 * next_y;
            const temp = lanes[next_idx];
            const rot_amount = ((t + 1) * (t + 2) / 2) % 64;
            lanes[next_idx] = std.math.rotl(u64, current, @as(u6, @intCast(rot_amount)));
            current = temp;
            px = py;
            py = next_y;
        }

        // χ
        inline for (0..5) |y| {
            const idx = 5 * y;
            const T = [5]u64{ lanes[idx], lanes[idx + 1], lanes[idx + 2], lanes[idx + 3], lanes[idx + 4] };
            inline for (0..5) |x| {
                lanes[idx + x] = T[x] ^ (~T[(x + 1) % 5] & T[(x + 2) % 5]);
            }
        }

        // ι
        lanes[0] ^= rc;
    }
}

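// Consistency sketch: the byte-oriented and lane-oriented single-state
// permutations above implement the same Keccak-p[1600,12] on the same flat
// lane layout (index x + 5y), so they should agree on any initial state.
test "keccakP and keccakPLanes agree" {
    var bytes: [200]u8 = undefined;
    for (&bytes, 0..) |*b, i| b.* = @truncate(i);

    var lanes: [25]u64 = undefined;
    for (&lanes, 0..) |*lane, i| lane.* = load64(bytes[8 * i ..]);

    keccakP(&bytes);
    keccakPLanes(&lanes);

    for (lanes, 0..) |lane, i| {
        try std.testing.expectEqual(lane, load64(bytes[8 * i ..]));
    }
}
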
/// Generic non-allocating TurboSHAKE: write output to provided buffer
fn turboShakeMultiSliceToBuffer(
    comptime rate: usize,
    view: *const MultiSliceView,
    separation_byte: u8,
    output: []u8,
) void {
    var state: [200]u8 = @splat(0);
    var state_pos: usize = 0;

    // Absorb all bytes from the multi-slice view
    const total = view.totalLen();
    var pos: usize = 0;
    while (pos < total) {
        state[state_pos] ^= view.getByte(pos);
        state_pos += 1;
        pos += 1;

        if (state_pos == rate) {
            keccakP(&state);
            state_pos = 0;
        }
    }

    // Add separation byte and padding
    state[state_pos] ^= separation_byte;
    state[rate - 1] ^= 0x80;
    keccakP(&state);

    // Squeeze
    var out_offset: usize = 0;
    while (out_offset < output.len) {
        const chunk = @min(rate, output.len - out_offset);
        @memcpy(output[out_offset..][0..chunk], state[0..chunk]);
        out_offset += chunk;
        if (out_offset < output.len) {
            keccakP(&state);
        }
    }
}

/// Generic allocating TurboSHAKE
fn turboShakeMultiSlice(
    comptime rate: usize,
    allocator: Allocator,
    view: *const MultiSliceView,
    separation_byte: u8,
    output_len: usize,
) ![]u8 {
    const output = try allocator.alloc(u8, output_len);
    turboShakeMultiSliceToBuffer(rate, view, separation_byte, output);
    return output;
}

/// Non-allocating TurboSHAKE128: write output to provided buffer
fn turboShake128MultiSliceToBuffer(
    view: *const MultiSliceView,
    separation_byte: u8,
    output: []u8,
) void {
    turboShakeMultiSliceToBuffer(168, view, separation_byte, output);
}

/// Allocating TurboSHAKE128
fn turboShake128MultiSlice(
    allocator: Allocator,
    view: *const MultiSliceView,
    separation_byte: u8,
    output_len: usize,
) ![]u8 {
    return turboShakeMultiSlice(168, allocator, view, separation_byte, output_len);
}

/// Non-allocating TurboSHAKE256: write output to provided buffer
fn turboShake256MultiSliceToBuffer(
    view: *const MultiSliceView,
    separation_byte: u8,
    output: []u8,
) void {
    turboShakeMultiSliceToBuffer(136, view, separation_byte, output);
}

/// Allocating TurboSHAKE256
fn turboShake256MultiSlice(
    allocator: Allocator,
    view: *const MultiSliceView,
    separation_byte: u8,
    output_len: usize,
) ![]u8 {
    return turboShakeMultiSlice(136, allocator, view, separation_byte, output_len);
}

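// Cross-check sketch: the byte-at-a-time sponge above should match the
// streaming TurboSHAKE128State (which has the 0x06 domain byte baked into the
// type) when given the same 0x06 separation byte explicitly.
test "turboShake128MultiSliceToBuffer matches TurboSHAKE128State" {
    const msg = "cross-check input";

    var expected: [32]u8 = undefined;
    var st = TurboSHAKE128State.init(.{});
    st.update(msg);
    st.final(&expected);

    var actual: [32]u8 = undefined;
    const view = MultiSliceView.init(msg, &[_]u8{}, &[_]u8{});
    turboShake128MultiSliceToBuffer(&view, 0x06, &actual);

    try std.testing.expectEqualSlices(u8, &expected, &actual);
}
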
/// Process N leaves (8 KiB chunks) in parallel - generic version
fn processLeaves(
    comptime Variant: type,
    comptime N: usize,
    data: []const u8,
    result: *[N * Variant.cv_size]u8,
) void {
    const rate_in_lanes: usize = Variant.rate_in_lanes;
    const rate_in_bytes: usize = rate_in_lanes * 8;
    const cv_size: usize = Variant.cv_size;

    // Initialize N all-zero states with cache alignment
    var states: [5][5]@Vector(N, u64) align(cache_line_size) = undefined;
    inline for (0..5) |x| {
        inline for (0..5) |y| {
            states[x][y] = @splat(0);
        }
    }

    // Process complete blocks
    var j: usize = 0;
    while (j + rate_in_bytes <= chunk_size) : (j += rate_in_bytes) {
        addLanesAll(N, &states, data[j..], rate_in_lanes, chunk_size / 8);
        keccakP1600timesN(N, &states);
    }

    // Process last incomplete block
    const remaining_lanes = (chunk_size - j) / 8;
    if (remaining_lanes > 0) {
        addLanesAll(N, &states, data[j..], remaining_lanes, chunk_size / 8);
    }

    // Add suffix 0x0B and padding
    const suffix_pos = Variant.separation_byte_pos;
    const padding_pos = Variant.padding_pos;

    const suffix_splat: @Vector(N, u64) = @splat(0x0B);
    states[suffix_pos.x][suffix_pos.y] ^= suffix_splat;
    const padding_splat: @Vector(N, u64) = @splat(0x8000000000000000);
    states[padding_pos.x][padding_pos.y] ^= padding_splat;

    keccakP1600timesN(N, &states);

    // Extract chaining values from each state
    const lanes_to_extract = cv_size / 8;
    comptime var lane_idx: usize = 0;
    inline while (lane_idx < lanes_to_extract) : (lane_idx += 1) {
        const x = lane_idx % 5;
        const y = lane_idx / 5;
        inline for (0..N) |i| {
            store64(states[x][y][i], result[i * cv_size + lane_idx * 8 ..]);
        }
    }
}

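// Consistency sketch: the SIMD leaf path above should produce the same
// chaining values as hashing each 8 KiB leaf individually with the 0x0B
// domain byte, as the scalar fallbacks elsewhere in this file do.
test "processLeaves matches per-leaf TurboSHAKE128" {
    var data: [2 * chunk_size]u8 = undefined;
    for (&data, 0..) |*b, i| b.* = @truncate(i);

    var simd_cvs: [2 * KT128Variant.cv_size]u8 = undefined;
    processLeaves(KT128Variant, 2, &data, &simd_cvs);

    for (0..2) |leaf| {
        var cv: [KT128Variant.cv_size]u8 = undefined;
        const view = MultiSliceView.init(data[leaf * chunk_size ..][0..chunk_size], &[_]u8{}, &[_]u8{});
        turboShake128MultiSliceToBuffer(&view, 0x0B, &cv);
        try std.testing.expectEqualSlices(u8, &cv, simd_cvs[leaf * KT128Variant.cv_size ..][0..KT128Variant.cv_size]);
    }
}
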
/// Context for processing a batch of leaves in a thread
const LeafBatchContext = struct {
    output_cvs: []align(@alignOf(u64)) u8,
    batch_start: usize,
    batch_count: usize,
    view: *const MultiSliceView,
    scratch_buffer: []u8, // Pre-allocated scratch space (no allocations in worker)
    total_len: usize, // Total length of input data (for boundary checking)
};

/// Helper function to process N leaves in parallel, reducing code duplication
inline fn processNLeaves(
    comptime Variant: type,
    comptime N: usize,
    view: *const MultiSliceView,
    j: usize,
    leaf_buffer: []u8,
    output: []align(@alignOf(u64)) u8,
) void {
    const cv_size = Variant.cv_size;
    comptime std.debug.assert(cv_size % @sizeOf(u64) == 0);

    if (view.tryGetSlice(j, j + N * chunk_size)) |leaf_data| {
        var leaf_cvs: [N * cv_size]u8 = undefined;
        processLeaves(Variant, N, leaf_data, &leaf_cvs);
        @memcpy(output[0..leaf_cvs.len], &leaf_cvs);
    } else {
        view.copyRange(j, j + N * chunk_size, leaf_buffer[0 .. N * chunk_size]);
        var leaf_cvs: [N * cv_size]u8 = undefined;
        processLeaves(Variant, N, leaf_buffer[0 .. N * chunk_size], &leaf_cvs);
        @memcpy(output[0..leaf_cvs.len], &leaf_cvs);
    }
}

/// Process a batch of leaves in a single thread using SIMD
fn processLeafBatch(comptime Variant: type, ctx: LeafBatchContext) void {
    const cv_size = Variant.cv_size;
    const leaf_buffer = ctx.scratch_buffer[0 .. 8 * chunk_size];

    var cvs_offset: usize = 0;
    var j: usize = ctx.batch_start;
    const batch_end = @min(ctx.batch_start + ctx.batch_count * chunk_size, ctx.total_len);

    // Process leaves using SIMD (8x, 4x, 2x) based on optimal vector length
    inline for ([_]usize{ 8, 4, 2 }) |batch_size| {
        while (optimal_vector_len >= batch_size and j + batch_size * chunk_size <= batch_end) {
            processNLeaves(Variant, batch_size, ctx.view, j, leaf_buffer, @alignCast(ctx.output_cvs[cvs_offset..]));
            cvs_offset += batch_size * cv_size;
            j += batch_size * chunk_size;
        }
    }

    // Process remaining single leaves
    while (j < batch_end) {
        const chunk_len = @min(chunk_size, batch_end - j);
        if (ctx.view.tryGetSlice(j, j + chunk_len)) |leaf_data| {
            const cv_slice = MultiSliceView.init(leaf_data, &[_]u8{}, &[_]u8{});
            Variant.turboShakeToBuffer(&cv_slice, 0x0B, ctx.output_cvs[cvs_offset..][0..cv_size]);
        } else {
            ctx.view.copyRange(j, j + chunk_len, leaf_buffer[0..chunk_len]);
            const cv_slice = MultiSliceView.init(leaf_buffer[0..chunk_len], &[_]u8{}, &[_]u8{});
            Variant.turboShakeToBuffer(&cv_slice, 0x0B, ctx.output_cvs[cvs_offset..][0..cv_size]);
        }
        cvs_offset += cv_size;
        j += chunk_len;
    }

    std.debug.assert(cvs_offset == ctx.output_cvs.len);
}

/// Helper to process N leaves in SIMD and absorb CVs into state
inline fn processAndAbsorbNLeaves(
    comptime Variant: type,
    comptime N: usize,
    view: *const MultiSliceView,
    j: usize,
    leaf_buffer: []u8,
    final_state: anytype,
) void {
    const cv_size = Variant.cv_size;
    if (view.tryGetSlice(j, j + N * chunk_size)) |leaf_data| {
        var leaf_cvs: [N * cv_size]u8 align(cache_line_size) = undefined;
        processLeaves(Variant, N, leaf_data, &leaf_cvs);
        final_state.update(&leaf_cvs);
    } else {
        view.copyRange(j, j + N * chunk_size, leaf_buffer[0 .. N * chunk_size]);
        var leaf_cvs: [N * cv_size]u8 align(cache_line_size) = undefined;
        processLeaves(Variant, N, leaf_buffer[0 .. N * chunk_size], &leaf_cvs);
        final_state.update(&leaf_cvs);
    }
}

/// Generic single-threaded implementation
fn ktSingleThreaded(comptime Variant: type, view: *const MultiSliceView, total_len: usize, output: []u8) void {
    const cv_size = Variant.cv_size;
    const StateType = Variant.StateType;

    // Initialize streaming TurboSHAKE state for the final node (delimiter 0x06 is set in the type)
    var final_state = StateType.init(.{});

    // Absorb first B bytes from input
    var first_b_buffer: [chunk_size]u8 = undefined;
    if (view.tryGetSlice(0, chunk_size)) |first_chunk| {
        final_state.update(first_chunk);
    } else {
        view.copyRange(0, chunk_size, &first_b_buffer);
        final_state.update(&first_b_buffer);
    }

    // Absorb padding bytes (8 bytes: 0x03 followed by 7 zeros)
    const padding = [_]u8{ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
    final_state.update(&padding);

    var j: usize = chunk_size;
    var n: usize = 0;

    // Temporary buffers for boundary-spanning leaves and CV computation
    var leaf_buffer: [chunk_size * 8]u8 align(cache_line_size) = undefined;
    var cv_buffer: [64]u8 = undefined; // Max CV size is 64 bytes

    // Process leaves in SIMD batches (8x, 4x, 2x)
    inline for ([_]usize{ 8, 4, 2 }) |batch_size| {
        while (optimal_vector_len >= batch_size and j + batch_size * chunk_size <= total_len) {
            processAndAbsorbNLeaves(Variant, batch_size, view, j, &leaf_buffer, &final_state);
            j += batch_size * chunk_size;
            n += batch_size;
        }
    }

    // Process remaining leaves one at a time
    while (j < total_len) {
        const chunk_len = @min(chunk_size, total_len - j);
        if (view.tryGetSlice(j, j + chunk_len)) |leaf_data| {
            const cv_slice = MultiSliceView.init(leaf_data, &[_]u8{}, &[_]u8{});
            Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
            final_state.update(cv_buffer[0..cv_size]); // Absorb CV immediately
        } else {
            view.copyRange(j, j + chunk_len, leaf_buffer[0..chunk_len]);
            const cv_slice = MultiSliceView.init(leaf_buffer[0..chunk_len], &[_]u8{}, &[_]u8{});
            Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
            final_state.update(cv_buffer[0..cv_size]);
        }
        j += chunk_size;
        n += 1;
    }

    // Absorb right_encode(n) and terminator
    const n_enc = rightEncode(n);
    final_state.update(n_enc.slice());
    const terminator = [_]u8{ 0xFF, 0xFF };
    final_state.update(&terminator);

    // Finalize and squeeze output
    final_state.final(output);
}

fn BatchResult(comptime Variant: type) type {
    const cv_size = Variant.cv_size;
    const leaves_per_batch = bytes_per_batch / chunk_size;
    const max_cvs_size = leaves_per_batch * cv_size;

    return struct {
        batch_idx: usize,
        cv_len: usize,
        cvs: [max_cvs_size]u8,
    };
}

fn SelectLeafContext(comptime Variant: type) type {
    const cv_size = Variant.cv_size;
    const Result = BatchResult(Variant);

    return struct {
        view: *const MultiSliceView,
        batch_idx: usize,
        start_offset: usize,
        num_leaves: usize,

        fn process(ctx: @This()) Result {
            var result: Result = .{
                .batch_idx = ctx.batch_idx,
                .cv_len = ctx.num_leaves * cv_size,
                .cvs = undefined,
            };

            var leaf_buffer: [bytes_per_batch]u8 align(cache_line_size) = undefined;
            var leaves_processed: usize = 0;
            var byte_offset = ctx.start_offset;
            var cv_offset: usize = 0;
            const simd_batch_bytes = optimal_vector_len * chunk_size;
            while (leaves_processed + optimal_vector_len <= ctx.num_leaves) {
                if (ctx.view.tryGetSlice(byte_offset, byte_offset + simd_batch_bytes)) |leaf_data| {
                    var leaf_cvs: [optimal_vector_len * Variant.cv_size]u8 = undefined;
                    processLeaves(Variant, optimal_vector_len, leaf_data, &leaf_cvs);
                    @memcpy(result.cvs[cv_offset..][0..leaf_cvs.len], &leaf_cvs);
                } else {
                    ctx.view.copyRange(byte_offset, byte_offset + simd_batch_bytes, leaf_buffer[0..simd_batch_bytes]);
                    var leaf_cvs: [optimal_vector_len * Variant.cv_size]u8 = undefined;
                    processLeaves(Variant, optimal_vector_len, leaf_buffer[0..simd_batch_bytes], &leaf_cvs);
                    @memcpy(result.cvs[cv_offset..][0..leaf_cvs.len], &leaf_cvs);
                }
                leaves_processed += optimal_vector_len;
                byte_offset += optimal_vector_len * chunk_size;
                cv_offset += optimal_vector_len * cv_size;
            }

            while (leaves_processed < ctx.num_leaves) {
                const leaf_end = byte_offset + chunk_size;
                var cv_buffer: [64]u8 = undefined;

                if (ctx.view.tryGetSlice(byte_offset, leaf_end)) |leaf_data| {
                    const cv_slice = MultiSliceView.init(leaf_data, &[_]u8{}, &[_]u8{});
                    Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
                } else {
                    ctx.view.copyRange(byte_offset, leaf_end, leaf_buffer[0..chunk_size]);
                    const cv_slice = MultiSliceView.init(leaf_buffer[0..chunk_size], &[_]u8{}, &[_]u8{});
                    Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
                }
                @memcpy(result.cvs[cv_offset..][0..cv_size], cv_buffer[0..cv_size]);

                leaves_processed += 1;
                byte_offset += chunk_size;
                cv_offset += cv_size;
            }

            return result;
        }
    };
}

fn FinalLeafContext(comptime Variant: type) type {
    return struct {
        view: *const MultiSliceView,
        start_offset: usize,
        leaf_len: usize,
        output_cv: []align(@alignOf(u64)) u8,

        fn process(ctx: @This()) void {
            const cv_size = Variant.cv_size;
            var leaf_buffer: [chunk_size]u8 = undefined;
            var cv_buffer: [64]u8 = undefined;

            if (ctx.view.tryGetSlice(ctx.start_offset, ctx.start_offset + ctx.leaf_len)) |leaf_data| {
                const cv_slice = MultiSliceView.init(leaf_data, &[_]u8{}, &[_]u8{});
                Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
            } else {
                ctx.view.copyRange(ctx.start_offset, ctx.start_offset + ctx.leaf_len, leaf_buffer[0..ctx.leaf_len]);
                const cv_slice = MultiSliceView.init(leaf_buffer[0..ctx.leaf_len], &[_]u8{}, &[_]u8{});
                Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
            }
            @memcpy(ctx.output_cv[0..cv_size], cv_buffer[0..cv_size]);
        }
    };
}

fn ktMultiThreaded(
    comptime Variant: type,
    allocator: Allocator,
    io: Io,
    view: *const MultiSliceView,
    total_len: usize,
    output: []u8,
) !void {
    comptime std.debug.assert(bytes_per_batch % (optimal_vector_len * chunk_size) == 0);

    const cv_size = Variant.cv_size;
    const StateType = Variant.StateType;
    const leaves_per_batch = bytes_per_batch / chunk_size;
    const remaining_bytes = total_len - chunk_size;
    const total_leaves = std.math.divCeil(usize, remaining_bytes, chunk_size) catch unreachable;

    var final_state = StateType.init(.{});

    var first_chunk_buffer: [chunk_size]u8 = undefined;
    if (view.tryGetSlice(0, chunk_size)) |first_chunk| {
        final_state.update(first_chunk);
    } else {
        view.copyRange(0, chunk_size, &first_chunk_buffer);
        final_state.update(&first_chunk_buffer);
    }

    const padding = [_]u8{ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
    final_state.update(&padding);

    const full_leaves = remaining_bytes / chunk_size;
    const has_partial_leaf = (remaining_bytes % chunk_size) != 0;
    const partial_leaf_size = if (has_partial_leaf) remaining_bytes % chunk_size else 0;

    if (full_leaves > 0) {
        const total_batches = std.math.divCeil(usize, full_leaves, leaves_per_batch) catch unreachable;
        const max_concurrent: usize = @min(256, total_batches);

        const Result = BatchResult(Variant);
        const SelectResult = union(enum) { batch: Result };
        const Select = Io.Select(SelectResult);

        const select_buf = try allocator.alloc(SelectResult, max_concurrent);
        defer allocator.free(select_buf);

        // Buffer for out-of-order results (select_buf slots get reused)
        const pending_cv_buf = try allocator.alloc([leaves_per_batch * cv_size]u8, max_concurrent);
        defer allocator.free(pending_cv_buf);
        var pending_cv_lens: [256]usize = .{0} ** 256;

        var select: Select = .init(io, select_buf);
        var batches_spawned: usize = 0;
        var next_to_process: usize = 0;

        while (next_to_process < total_batches) {
            while (batches_spawned < total_batches and batches_spawned - next_to_process < max_concurrent) {
                const batch_start_leaf = batches_spawned * leaves_per_batch;
                const batch_leaves = @min(leaves_per_batch, full_leaves - batch_start_leaf);
                const start_offset = chunk_size + batch_start_leaf * chunk_size;

                select.async(.batch, SelectLeafContext(Variant).process, .{SelectLeafContext(Variant){
                    .view = view,
                    .batch_idx = batches_spawned,
                    .start_offset = start_offset,
                    .num_leaves = batch_leaves,
                }});
                batches_spawned += 1;
            }

            const result = select.wait() catch unreachable;
            const batch = result.batch;
            const slot = batch.batch_idx % max_concurrent;

            if (batch.batch_idx == next_to_process) {
                final_state.update(batch.cvs[0..batch.cv_len]);
                next_to_process += 1;

                // Drain pending batches that are now ready
                while (next_to_process < total_batches) {
                    const pending_slot = next_to_process % max_concurrent;
                    const pending_len = pending_cv_lens[pending_slot];
                    if (pending_len == 0) break;

                    final_state.update(pending_cv_buf[pending_slot][0..pending_len]);
                    pending_cv_lens[pending_slot] = 0;
                    next_to_process += 1;
                }
            } else {
                @memcpy(pending_cv_buf[slot][0..batch.cv_len], batch.cvs[0..batch.cv_len]);
                pending_cv_lens[slot] = batch.cv_len;
            }
        }

        select.group.wait(io);
    }

    if (has_partial_leaf) {
        var cv_buffer: [64]u8 = undefined;
        var leaf_buffer: [chunk_size]u8 = undefined;

        const start_offset = chunk_size + full_leaves * chunk_size;
        if (view.tryGetSlice(start_offset, start_offset + partial_leaf_size)) |leaf_data| {
            const cv_slice = MultiSliceView.init(leaf_data, &[_]u8{}, &[_]u8{});
            Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
        } else {
            view.copyRange(start_offset, start_offset + partial_leaf_size, leaf_buffer[0..partial_leaf_size]);
            const cv_slice = MultiSliceView.init(leaf_buffer[0..partial_leaf_size], &[_]u8{}, &[_]u8{});
            Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
        }
        final_state.update(cv_buffer[0..cv_size]);
    }

    const n_enc = rightEncode(total_leaves);
    final_state.update(n_enc.slice());
    const terminator = [_]u8{ 0xFF, 0xFF };
    final_state.update(&terminator);

    final_state.final(output);
}

/// Generic KangarooTwelve hash function builder.
/// Creates a public API type with hash and hashParallel methods for a specific variant.
fn KTHash(
    comptime Variant: type,
    comptime singleChunkFn: fn (*const MultiSliceView, u8, []u8) void,
) type {
    return struct {
        const Self = @This();
        const StateType = Variant.StateType;

        /// The recommended output length, in bytes.
        pub const digest_length = Variant.security_level / 8 * 2;
        /// The block length, or rate, in bytes.
        pub const block_length = Variant.rate;

        /// Configuration options for KangarooTwelve hashing.
        ///
        /// Options include an optional customization string that provides domain separation,
        /// ensuring that identical inputs with different customization strings
        /// produce completely distinct hash outputs.
        ///
        /// This prevents hash collisions when the same data is hashed in different contexts.
        ///
        /// Customization strings can be of any length.
        ///
        /// Common options for customization:
        ///
        /// - Key derivation or MAC: 16-byte secret for KT128, 32-byte secret for KT256
        /// - Context separation: domain-specific strings (e.g., "email", "password", "session")
        /// - Composite keys: concatenation of secret key + context string
        pub const Options = struct {
            customization: ?[]const u8 = null,
        };

        // Message buffer (accumulates message data only, not customization)
        buffer: [chunk_size]u8,
        buffer_len: usize,
        message_len: usize,

        // Customization string (fixed at init)
        customization: []const u8,
        custom_len_enc: RightEncoded,

        // Tree mode state (lazily initialized the first time the buffer fills)
        first_chunk: ?[chunk_size]u8, // Saved first chunk for tree mode
        final_state: ?StateType, // Running TurboSHAKE state for final node
        num_leaves: usize, // Count of leaves processed (after first chunk)

        // SIMD chunk batching
        pending_chunks: [8 * chunk_size]u8 align(cache_line_size), // Buffer for up to 8 chunks
        pending_count: usize, // Number of complete chunks in pending_chunks

        /// Initialize a KangarooTwelve hashing context.
        ///
        /// Options include an optional customization string that provides domain separation,
        /// ensuring that identical inputs with different customization strings
        /// produce completely distinct hash outputs.
        ///
        /// This prevents hash collisions when the same data is hashed in different contexts.
        ///
        /// Customization strings can be of any length.
        ///
        /// Common options for customization:
        ///
        /// - Key derivation or MAC: 16-byte secret for KT128, 32-byte secret for KT256
        /// - Context separation: domain-specific strings (e.g., "email", "password", "session")
        /// - Composite keys: concatenation of secret key + context string
        pub fn init(options: Options) Self {
            const custom = options.customization orelse &[_]u8{};
            return .{
                .buffer = undefined,
                .buffer_len = 0,
                .message_len = 0,
                .customization = custom,
                .custom_len_enc = rightEncode(custom.len),
                .first_chunk = null,
                .final_state = null,
                .num_leaves = 0,
                .pending_chunks = undefined,
                .pending_count = 0,
            };
        }

        /// Flush all pending chunks using SIMD when possible
        fn flushPendingChunks(self: *Self) void {
            const cv_size = Variant.cv_size;

            // Process all pending chunks using the largest SIMD batch sizes possible
            while (self.pending_count > 0) {
                // Try SIMD batches in decreasing size order
                inline for ([_]usize{ 8, 4, 2 }) |batch_size| {
                    if (optimal_vector_len >= batch_size and self.pending_count >= batch_size) {
                        var leaf_cvs: [batch_size * cv_size]u8 align(cache_line_size) = undefined;
                        processLeaves(Variant, batch_size, self.pending_chunks[0 .. batch_size * chunk_size], &leaf_cvs);
                        self.final_state.?.update(&leaf_cvs);
                        self.num_leaves += batch_size;
                        self.pending_count -= batch_size;

                        // Shift remaining chunks to the front
                        if (self.pending_count > 0) {
                            const remaining_bytes = self.pending_count * chunk_size;
                            @memcpy(self.pending_chunks[0..remaining_bytes], self.pending_chunks[batch_size * chunk_size ..][0..remaining_bytes]);
                        }
                        break; // Re-enter the outer loop to try the next batch size
                    }
                }

                // If no SIMD batch was possible, process the single remaining chunk with scalar code
                if (self.pending_count == 1) {
                    var cv_buffer: [64]u8 = undefined;
                    const cv_slice = MultiSliceView.init(self.pending_chunks[0..chunk_size], &[_]u8{}, &[_]u8{});
                    Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
                    self.final_state.?.update(cv_buffer[0..cv_size]);
                    self.num_leaves += 1;
                    self.pending_count -= 1;
                    break; // No more chunks to process
                }
            }
        }

        /// Absorb data into the hash state.
        /// Can be called multiple times to incrementally add data.
        pub fn update(self: *Self, data: []const u8) void {
            if (data.len == 0) return;

            var remaining = data;

            while (remaining.len > 0) {
                const space_in_buffer = chunk_size - self.buffer_len;
                const to_copy = @min(space_in_buffer, remaining.len);

                // Copy data into buffer
                @memcpy(self.buffer[self.buffer_len..][0..to_copy], remaining[0..to_copy]);
                self.buffer_len += to_copy;
                self.message_len += to_copy;
                remaining = remaining[to_copy..];

                // If buffer is full, process it
                if (self.buffer_len == chunk_size) {
                    if (self.first_chunk == null) {
                        // First time the buffer fills - initialize tree mode
                        self.first_chunk = self.buffer;
                        self.final_state = StateType.init(.{});

                        // Absorb first chunk into final state
                        self.final_state.?.update(&self.buffer);

                        // Absorb padding (8 bytes: 0x03 followed by 7 zeros)
                        const padding = [_]u8{ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
                        self.final_state.?.update(&padding);
                    } else {
                        // Add chunk to pending buffer for SIMD batch processing
                        @memcpy(self.pending_chunks[self.pending_count * chunk_size ..][0..chunk_size], &self.buffer);
                        self.pending_count += 1;

                        // Flush when we have enough chunks for an optimal SIMD batch.
                        // Determine the best batch size for this architecture.
                        const optimal_batch_size = comptime blk: {
                            if (optimal_vector_len >= 8) break :blk 8;
                            if (optimal_vector_len >= 4) break :blk 4;
                            if (optimal_vector_len >= 2) break :blk 2;
                            break :blk 1;
                        };
                        if (self.pending_count >= optimal_batch_size) {
                            self.flushPendingChunks();
                        }
                    }
                    self.buffer_len = 0;
                }
            }
        }

        /// Finalize the hash and produce output.
        ///
        /// Unlike traditional hash functions, the output can be of any length.
        ///
        /// When using as a regular hash function, use the recommended `digest_length` value (32 bytes for KT128, 64 bytes for KT256).
        ///
        /// After calling this method, the context should not be reused. However, the structure can be cloned before finalizing
        /// to compute multiple hashes with the same prefix.
        pub fn final(self: *Self, out: []u8) void {
            const cv_size = Variant.cv_size;

            // Calculate total length: message + customization + right_encode(customization.len)
            const total_len = self.message_len + self.customization.len + self.custom_len_enc.len;

            // Single chunk mode: total data fits in one chunk
            if (total_len <= chunk_size) {
                // Build the complete input: buffer + customization + encoded length
                var single_chunk: [chunk_size]u8 = undefined;
                @memcpy(single_chunk[0..self.buffer_len], self.buffer[0..self.buffer_len]);
                @memcpy(single_chunk[self.buffer_len..][0..self.customization.len], self.customization);
                @memcpy(single_chunk[self.buffer_len + self.customization.len ..][0..self.custom_len_enc.len], self.custom_len_enc.slice());

                const view = MultiSliceView.init(single_chunk[0..total_len], &[_]u8{}, &[_]u8{});
                singleChunkFn(&view, 0x07, out);
                return;
            }

            // Flush any pending chunks with SIMD
            self.flushPendingChunks();

            // Build a view over the remaining data (buffer + customization + encoding)
            const remaining_view = MultiSliceView.init(
                self.buffer[0..self.buffer_len],
                self.customization,
                self.custom_len_enc.slice(),
            );
            const remaining_len = remaining_view.totalLen();

            var final_leaves = self.num_leaves;
            var leaf_start: usize = 0;

            // Tree mode: initialize if not already done (lazy initialization)
            if (self.final_state == null and remaining_len > 0) {
                self.final_state = StateType.init(.{});

                // Absorb first chunk (up to chunk_size bytes from remaining data)
                const first_chunk_len = @min(chunk_size, remaining_len);
                if (remaining_view.tryGetSlice(0, first_chunk_len)) |first_chunk| {
                    // Data is contiguous, use it directly
                    self.final_state.?.update(first_chunk);
                } else {
                    // Data spans boundaries, copy to buffer
                    var first_chunk_buf: [chunk_size]u8 = undefined;
                    remaining_view.copyRange(0, first_chunk_len, first_chunk_buf[0..first_chunk_len]);
                    self.final_state.?.update(first_chunk_buf[0..first_chunk_len]);
                }

                // Absorb padding (8 bytes: 0x03 followed by 7 zeros)
                const padding = [_]u8{ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
                self.final_state.?.update(&padding);

                // Process remaining data as leaves
                leaf_start = first_chunk_len;
            }

            // Process all remaining data as leaves (starting from leaf_start)
            var offset = leaf_start;
            while (offset < remaining_len) {
                const leaf_end = @min(offset + chunk_size, remaining_len);
                const leaf_size = leaf_end - offset;

                var cv_buffer: [64]u8 = undefined;
                if (remaining_view.tryGetSlice(offset, leaf_end)) |leaf_data| {
                    // Data is contiguous, use it directly
                    const cv_slice = MultiSliceView.init(leaf_data, &[_]u8{}, &[_]u8{});
                    Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
                } else {
                    // Data spans boundaries, copy to buffer
                    var leaf_buf: [chunk_size]u8 = undefined;
                    remaining_view.copyRange(offset, leaf_end, leaf_buf[0..leaf_size]);
                    const cv_slice = MultiSliceView.init(leaf_buf[0..leaf_size], &[_]u8{}, &[_]u8{});
                    Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]);
                }
                self.final_state.?.update(cv_buffer[0..cv_size]);
                final_leaves += 1;
                offset = leaf_end;
            }

            // Absorb right_encode(num_leaves) and terminator
            const n_enc = rightEncode(final_leaves);
            self.final_state.?.update(n_enc.slice());
            const terminator = [_]u8{ 0xFF, 0xFF };
            self.final_state.?.update(&terminator);

            // Squeeze output
            self.final_state.?.final(out);
        }

        /// Hash a message using sequential processing with SIMD acceleration.
        ///
        /// Parameters:
        /// - message: Input data to hash (any length)
        /// - out: Output buffer (any length, arbitrary output sizes supported, `digest_length` recommended for standard use)
        /// - options: Optional settings to include a secret key or a context separation string
        pub fn hash(message: []const u8, out: []u8, options: Options) !void {
            const custom = options.customization orelse &[_]u8{};

            // Right-encode customization length
            const custom_len_enc = rightEncode(custom.len);

            // Create zero-copy multi-slice view (no concatenation)
            const view = MultiSliceView.init(message, custom, custom_len_enc.slice());
            const total_len = view.totalLen();

            // Single chunk case - zero-copy absorption
            if (total_len <= chunk_size) {
                singleChunkFn(&view, 0x07, out);
                return;
            }

            // Tree mode - single-threaded SIMD processing
            ktSingleThreaded(Variant, &view, total_len, out);
        }

        /// Hash with automatic parallelization for large inputs (above 2 MiB).
        /// Automatically uses sequential processing for smaller inputs to avoid thread overhead.
        /// An allocator is required for temporary buffers; the Io object is required for thread management.
        pub fn hashParallel(message: []const u8, out: []u8, options: Options, allocator: Allocator, io: Io) !void {
            const custom = options.customization orelse &[_]u8{};

            const custom_len_enc = rightEncode(custom.len);
            const view = MultiSliceView.init(message, custom, custom_len_enc.slice());
            const total_len = view.totalLen();

            // Single chunk case
            if (total_len <= chunk_size) {
                singleChunkFn(&view, 0x07, out);
                return;
            }

            // Use single-threaded processing if below threshold
            if (total_len < large_file_threshold) {
                ktSingleThreaded(Variant, &view, total_len, out);
                return;
            }

            // Tree mode - multi-threaded processing
            try ktMultiThreaded(Variant, allocator, io, &view, total_len, out);
        }
    };
}

/// KangarooTwelve is a fast, secure cryptographic hash function that uses tree-hashing
/// on top of TurboSHAKE. It is built on the Keccak permutation, the same primitive
/// underlying SHA-3, which has undergone over 15 years of intensive cryptanalysis
/// since the SHA-3 competition (2008-2012) and remains secure.
///
/// K12 uses Keccak-p[1600,12] with 12 rounds (half of SHA-3's 24 rounds), providing
/// 128-bit security strength equivalent to AES-128 and SHAKE128. While this offers
/// less conservative margin than SHA-3, current cryptanalysis reaches only 6 rounds,
/// leaving a substantial security margin. This deliberate trade-off delivers
/// significantly better performance while maintaining strong practical security.
///
/// Standardized as RFC 9861 after 8 years of public scrutiny. Supports arbitrary-length
/// output and optional customization strings for domain separation.
pub const KT128 = KTHash(KT128Variant, turboShake128MultiSliceToBuffer);

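// Usage sketch: the one-shot and streaming APIs defined above are intended to
// produce the same digest for the same input, so a split update should match
// a single hash call.
test "KT128 one-shot and streaming APIs agree" {
    const message = "The quick brown fox jumps over the lazy dog";

    var one_shot: [KT128.digest_length]u8 = undefined;
    try KT128.hash(message, &one_shot, .{});

    var state = KT128.init(.{});
    state.update(message[0..10]);
    state.update(message[10..]);
    var streamed: [KT128.digest_length]u8 = undefined;
    state.final(&streamed);

    try std.testing.expectEqualSlices(u8, &one_shot, &streamed);
}
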
/// KangarooTwelve is a fast, secure cryptographic hash function that uses tree-hashing
/// on top of TurboSHAKE. It is built on the Keccak permutation, the same primitive
/// underlying SHA-3, which has undergone over 15 years of intensive cryptanalysis
/// since the SHA-3 competition (2008-2012) and remains secure.
///
/// KT256 provides 256-bit security strength and achieves NIST post-quantum security
/// level 2 when using at least 256-bit outputs. Like KT128, it uses Keccak-p[1600,12]
/// with 12 rounds, offering a deliberate trade-off between conservative margin and
/// significantly better performance while maintaining strong practical security.
///
/// Use KT256 when you need extra conservative margins.
/// For most applications, KT128 offers better performance with adequate security.
pub const KT256 = KTHash(KT256Variant, turboShake256MultiSliceToBuffer);

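// Usage sketch for the customization option documented above: hashing the same
// message under two different customization strings should yield unrelated
// digests (domain separation).
test "KT256 customization separates domains" {
    const message = "same input";

    var ctx_a: [KT256.digest_length]u8 = undefined;
    var ctx_b: [KT256.digest_length]u8 = undefined;
    try KT256.hash(message, &ctx_a, .{ .customization = "context-a" });
    try KT256.hash(message, &ctx_b, .{ .customization = "context-b" });

    try std.testing.expect(!std.mem.eql(u8, &ctx_a, &ctx_b));
}
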
test "KT128 sequential and parallel produce same output for small inputs" {
    const allocator = std.testing.allocator;
    const io = std.testing.io;

    var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
    const random = prng.random();

    // Test with different small input sizes
    const test_sizes = [_]usize{ 100, 1024, 4096, 8192 }; // 100B, 1KB, 4KB, 8KB

    for (test_sizes) |size| {
        const input = try allocator.alloc(u8, size);
        defer allocator.free(input);

        // Fill with random data
        random.bytes(input);

        var output_seq: [32]u8 = undefined;
        var output_par: [32]u8 = undefined;

        // Hash with sequential method
        try KT128.hash(input, &output_seq, .{});

        // Hash with parallel method
        try KT128.hashParallel(input, &output_par, .{}, allocator, io);

        // Verify outputs match
        try std.testing.expectEqualSlices(u8, &output_seq, &output_par);
    }
}

test "KT128 sequential and parallel produce same output for large inputs" {
    const allocator = std.testing.allocator;
    const io = std.testing.io;

    var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
    const random = prng.random();

    // Test with input sizes above the 2 MiB threshold to trigger parallel processing.
    // Include a size with a partial final leaf to stress boundary handling.
    const test_sizes = [_]usize{
        5 * 512 * 1024, // 2.5 MiB
        5 * 512 * 1024 + 8191, // 2.5 MiB + 8191 B (partial leaf)
    };

    for (test_sizes) |size| {
        const input = try allocator.alloc(u8, size);
        defer allocator.free(input);

        // Fill with random data
        random.bytes(input);

        var output_seq: [64]u8 = undefined;
        var output_par: [64]u8 = undefined;

        // Hash with sequential method
        try KT128.hash(input, &output_seq, .{});

        // Hash with parallel method
        try KT128.hashParallel(input, &output_par, .{}, allocator, io);

        // Verify outputs match
        try std.testing.expectEqualSlices(u8, &output_seq, &output_par);
    }
}

test "KT128 sequential and parallel produce same output for many random lengths" {
    const allocator = std.testing.allocator;
    const io = std.testing.io;

    var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
    const random = prng.random();

    const num_tests = if (builtin.mode == .Debug) 10 else 1000;
    const max_length = 250000;

    for (0..num_tests) |_| {
        const length = random.intRangeAtMost(usize, 0, max_length);

        const input = try allocator.alloc(u8, length);
        defer allocator.free(input);

        random.bytes(input);

        var output_seq: [32]u8 = undefined;
        var output_par: [32]u8 = undefined;

        try KT128.hash(input, &output_seq, .{});
        try KT128.hashParallel(input, &output_par, .{}, allocator, io);

        try std.testing.expectEqualSlices(u8, &output_seq, &output_par);
    }
}

test "KT128 sequential and parallel produce same output with customization" {
    const allocator = std.testing.allocator;
    const io = std.testing.io;

    var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
    const random = prng.random();

    const input_size = 5 * 512 * 1024; // 2.5 MiB
    const input = try allocator.alloc(u8, input_size);
    defer allocator.free(input);

    // Fill with random data
    random.bytes(input);

    const customization = "test domain";
    var output_seq: [48]u8 = undefined;
    var output_par: [48]u8 = undefined;

    // Hash with sequential method
    try KT128.hash(input, &output_seq, .{ .customization = customization });

    // Hash with parallel method
    try KT128.hashParallel(input, &output_par, .{ .customization = customization }, allocator, io);

    // Verify outputs match
    try std.testing.expectEqualSlices(u8, &output_seq, &output_par);
}

test "KT256 sequential and parallel produce same output for small inputs" {
    const allocator = std.testing.allocator;
    const io = std.testing.io;

    var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
    const random = prng.random();

    // Test with different small input sizes
    const test_sizes = [_]usize{ 100, 1024, 4096, 8192 }; // 100B, 1KB, 4KB, 8KB

    for (test_sizes) |size| {
        const input = try allocator.alloc(u8, size);
        defer allocator.free(input);

        // Fill with random data
        random.bytes(input);

        var output_seq: [64]u8 = undefined;
        var output_par: [64]u8 = undefined;

        // Hash with sequential method
        try KT256.hash(input, &output_seq, .{});

        // Hash with parallel method
        try KT256.hashParallel(input, &output_par, .{}, allocator, io);

        // Verify outputs match
        try std.testing.expectEqualSlices(u8, &output_seq, &output_par);
    }
}

test "KT256 sequential and parallel produce same output for large inputs" {
    const allocator = std.testing.allocator;
    const io = std.testing.io;

    var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
    const random = prng.random();

    // Test with input sizes above the 2 MiB threshold to trigger parallel processing.
    // Include a size with a partial final leaf to stress boundary handling.
    const test_sizes = [_]usize{
        5 * 512 * 1024, // 2.5 MiB
        5 * 512 * 1024 + 8191, // 2.5 MiB + 8191 B (partial leaf)
    };

    for (test_sizes) |size| {
        const input = try allocator.alloc(u8, size);
        defer allocator.free(input);

        // Fill with random data
        random.bytes(input);

        var output_seq: [64]u8 = undefined;
        var output_par: [64]u8 = undefined;

        // Hash with sequential method
        try KT256.hash(input, &output_seq, .{});

        // Hash with parallel method
        try KT256.hashParallel(input, &output_par, .{}, allocator, io);

        // Verify outputs match
        try std.testing.expectEqualSlices(u8, &output_seq, &output_par);
    }
}

test "KT256 sequential and parallel produce same output with customization" {
    const allocator = std.testing.allocator;
    const io = std.testing.io;

    var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
    const random = prng.random();

    const input_size = 5 * 512 * 1024; // 2.5 MiB
    const input = try allocator.alloc(u8, input_size);
    defer allocator.free(input);

    // Fill with random data
    random.bytes(input);

    const customization = "test domain";
    var output_seq: [80]u8 = undefined;
    var output_par: [80]u8 = undefined;

    // Hash with sequential method
    try KT256.hash(input, &output_seq, .{ .customization = customization });

    // Hash with parallel method
    try KT256.hashParallel(input, &output_par, .{ .customization = customization }, allocator, io);

    // Verify outputs match
    try std.testing.expectEqualSlices(u8, &output_seq, &output_par);
}

/// Helper: Generate pattern data where data[i] = (i % 251)
fn generatePattern(allocator: Allocator, len: usize) ![]u8 {
    const data = try allocator.alloc(u8, len);
    for (data, 0..) |*byte, i| {
        byte.* = @intCast(i % 251);
    }
    return data;
}

1533test "KT128: empty message, empty customization, 32 bytes" {
1534 var output: [32]u8 = undefined;
1535 try KT128.hash(&[_]u8{}, &output, .{});
1536
1537 var expected: [32]u8 = undefined;
1538 _ = try std.fmt.hexToBytes(&expected, "1AC2D450FC3B4205D19DA7BFCA1B37513C0803577AC7167F06FE2CE1F0EF39E5");
1539 try std.testing.expectEqualSlices(u8, &expected, &output);
1540}
1541
1542test "KT128: empty message, empty customization, 64 bytes" {
1543 var output: [64]u8 = undefined;
1544 try KT128.hash(&[_]u8{}, &output, .{});
1545
1546 var expected: [64]u8 = undefined;
1547 _ = try std.fmt.hexToBytes(&expected, "1AC2D450FC3B4205D19DA7BFCA1B37513C0803577AC7167F06FE2CE1F0EF39E54269C056B8C82E48276038B6D292966CC07A3D4645272E31FF38508139EB0A71");
1548 try std.testing.expectEqualSlices(u8, &expected, &output);
1549}
1550
1551test "KT128: empty message, empty customization, 10032 bytes (last 32)" {
1552 const allocator = std.testing.allocator;
1553 const output = try allocator.alloc(u8, 10032);
1554 defer allocator.free(output);
1555
1556 try KT128.hash(&[_]u8{}, output, .{});
1557
1558 var expected: [32]u8 = undefined;
1559 _ = try std.fmt.hexToBytes(&expected, "E8DC563642F7228C84684C898405D3A834799158C079B12880277A1D28E2FF6D");
1560 try std.testing.expectEqualSlices(u8, &expected, output[10000..]);
1561}
1562
1563test "KT128: pattern message (1 byte), empty customization, 32 bytes" {
1564 const allocator = std.testing.allocator;
1565 const message = try generatePattern(allocator, 1);
1566 defer allocator.free(message);
1567
1568 var output: [32]u8 = undefined;
1569 try KT128.hash(message, &output, .{});
1570
1571 var expected: [32]u8 = undefined;
1572 _ = try std.fmt.hexToBytes(&expected, "2BDA92450E8B147F8A7CB629E784A058EFCA7CF7D8218E02D345DFAA65244A1F");
1573 try std.testing.expectEqualSlices(u8, &expected, &output);
1574}
1575
1576test "KT128: pattern message (17 bytes), empty customization, 32 bytes" {
1577 const allocator = std.testing.allocator;
1578 const message = try generatePattern(allocator, 17);
1579 defer allocator.free(message);
1580
1581 var output: [32]u8 = undefined;
1582 try KT128.hash(message, &output, .{});
1583
1584 var expected: [32]u8 = undefined;
1585 _ = try std.fmt.hexToBytes(&expected, "6BF75FA2239198DB4772E36478F8E19B0F371205F6A9A93A273F51DF37122888");
1586 try std.testing.expectEqualSlices(u8, &expected, &output);
1587}
1588
1589test "KT128: pattern message (289 bytes), empty customization, 32 bytes" {
1590 const allocator = std.testing.allocator;
1591 const message = try generatePattern(allocator, 289);
1592 defer allocator.free(message);
1593
1594 var output: [32]u8 = undefined;
1595 try KT128.hash(message, &output, .{});
1596
1597 var expected: [32]u8 = undefined;
1598 _ = try std.fmt.hexToBytes(&expected, "0C315EBCDEDBF61426DE7DCF8FB725D1E74675D7F5327A5067F367B108ECB67C");
1599 try std.testing.expectEqualSlices(u8, &expected, &output);
1600}
1601
1602test "KT128: 0xFF message (1 byte), pattern customization (1 byte), 32 bytes" {
1603 const allocator = std.testing.allocator;
1604 const customization = try generatePattern(allocator, 1);
1605 defer allocator.free(customization);
1606
1607 const message = [_]u8{0xFF};
1608 var output: [32]u8 = undefined;
1609 try KT128.hash(&message, &output, .{ .customization = customization });
1610
1611 var expected: [32]u8 = undefined;
1612 _ = try std.fmt.hexToBytes(&expected, "A20B92B251E3D62443EC286E4B9B470A4E8315C156EEB24878B038ABE20650BE");
1613 try std.testing.expectEqualSlices(u8, &expected, &output);
1614}
1615
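// The 8191-, 8192-, and 8193-byte cases straddle the 8 KiB chunk_size
// boundary, covering the switch from single-node hashing to tree hashing.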
1616test "KT128: pattern message (8191 bytes), empty customization, 32 bytes" {
1617 const allocator = std.testing.allocator;
1618 const message = try generatePattern(allocator, 8191);
1619 defer allocator.free(message);
1620
1621 var output: [32]u8 = undefined;
1622 try KT128.hash(message, &output, .{});
1623
1624 var expected: [32]u8 = undefined;
1625 _ = try std.fmt.hexToBytes(&expected, "1B577636F723643E990CC7D6A659837436FD6A103626600EB8301CD1DBE553D6");
1626 try std.testing.expectEqualSlices(u8, &expected, &output);
1627}
1628
1629test "KT128: pattern message (8192 bytes), empty customization, 32 bytes" {
1630 const allocator = std.testing.allocator;
1631 const message = try generatePattern(allocator, 8192);
1632 defer allocator.free(message);
1633
1634 var output: [32]u8 = undefined;
1635 try KT128.hash(message, &output, .{});
1636
1637 var expected: [32]u8 = undefined;
1638 _ = try std.fmt.hexToBytes(&expected, "48F256F6772F9EDFB6A8B661EC92DC93B95EBD05A08A17B39AE3490870C926C3");
1639 try std.testing.expectEqualSlices(u8, &expected, &output);
1640}
1641
1642test "KT256: empty message, empty customization, 64 bytes" {
1643 var output: [64]u8 = undefined;
1644 try KT256.hash(&[_]u8{}, &output, .{});
1645
1646 var expected: [64]u8 = undefined;
1647 _ = try std.fmt.hexToBytes(&expected, "B23D2E9CEA9F4904E02BEC06817FC10CE38CE8E93EF4C89E6537076AF8646404E3E8B68107B8833A5D30490AA33482353FD4ADC7148ECB782855003AAEBDE4A9");
1648 try std.testing.expectEqualSlices(u8, &expected, &output);
1649}
1650
1651test "KT256: empty message, empty customization, 128 bytes" {
1652 var output: [128]u8 = undefined;
1653 try KT256.hash(&[_]u8{}, &output, .{});
1654
1655 var expected: [128]u8 = undefined;
1656 _ = try std.fmt.hexToBytes(&expected, "B23D2E9CEA9F4904E02BEC06817FC10CE38CE8E93EF4C89E6537076AF8646404E3E8B68107B8833A5D30490AA33482353FD4ADC7148ECB782855003AAEBDE4A9B0925319D8EA1E121A609821EC19EFEA89E6D08DAEE1662B69C840289F188BA860F55760B61F82114C030C97E5178449608CCD2CD2D919FC7829FF69931AC4D0");
1657 try std.testing.expectEqualSlices(u8, &expected, &output);
1658}
1659
1660test "KT256: pattern message (1 byte), empty customization, 64 bytes" {
1661 const allocator = std.testing.allocator;
1662 const message = try generatePattern(allocator, 1);
1663 defer allocator.free(message);
1664
1665 var output: [64]u8 = undefined;
1666 try KT256.hash(message, &output, .{});
1667
1668 var expected: [64]u8 = undefined;
1669 _ = try std.fmt.hexToBytes(&expected, "0D005A194085360217128CF17F91E1F71314EFA5564539D444912E3437EFA17F82DB6F6FFE76E781EAA068BCE01F2BBF81EACB983D7230F2FB02834A21B1DDD0");
1670 try std.testing.expectEqualSlices(u8, &expected, &output);
1671}
1672
1673test "KT256: pattern message (17 bytes), empty customization, 64 bytes" {
1674 const allocator = std.testing.allocator;
1675 const message = try generatePattern(allocator, 17);
1676 defer allocator.free(message);
1677
1678 var output: [64]u8 = undefined;
1679 try KT256.hash(message, &output, .{});
1680
1681 var expected: [64]u8 = undefined;
1682 _ = try std.fmt.hexToBytes(&expected, "1BA3C02B1FC514474F06C8979978A9056C8483F4A1B63D0DCCEFE3A28A2F323E1CDCCA40EBF006AC76EF0397152346837B1277D3E7FAA9C9653B19075098527B");
1683 try std.testing.expectEqualSlices(u8, &expected, &output);
1684}
1685
1686test "KT256: pattern message (8191 bytes), empty customization, 64 bytes" {
1687 const allocator = std.testing.allocator;
1688 const message = try generatePattern(allocator, 8191);
1689 defer allocator.free(message);
1690
1691 var output: [64]u8 = undefined;
1692 try KT256.hash(message, &output, .{});
1693
1694 var expected: [64]u8 = undefined;
1695 _ = try std.fmt.hexToBytes(&expected, "3081434D93A4108D8D8A3305B89682CEBEDC7CA4EA8A3CE869FBB73CBE4A58EEF6F24DE38FFC170514C70E7AB2D01F03812616E863D769AFB3753193BA045B20");
1696 try std.testing.expectEqualSlices(u8, &expected, &output);
1697}
1698
1699test "KT256: pattern message (8192 bytes), empty customization, 64 bytes" {
1700 const allocator = std.testing.allocator;
1701 const message = try generatePattern(allocator, 8192);
1702 defer allocator.free(message);
1703
1704 var output: [64]u8 = undefined;
1705 try KT256.hash(message, &output, .{});
1706
1707 var expected: [64]u8 = undefined;
1708 _ = try std.fmt.hexToBytes(&expected, "C6EE8E2AD3200C018AC87AAA031CDAC22121B412D07DC6E0DCCBB53423747E9A1C18834D99DF596CF0CF4B8DFAFB7BF02D139D0C9035725ADC1A01B7230A41FA");
1709 try std.testing.expectEqualSlices(u8, &expected, &output);
1710}
1711
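// The 8193-, 16384-, and 16385-byte cases cover one chunk plus one byte,
// exactly two chunks, and two chunks plus one byte, probing leaf accounting
// on both sides of each chunk boundary.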
1712test "KT128: pattern message (8193 bytes), empty customization, 32 bytes" {
1713 const allocator = std.testing.allocator;
1714 const message = try generatePattern(allocator, 8193);
1715 defer allocator.free(message);
1716
1717 var output: [32]u8 = undefined;
1718 try KT128.hash(message, &output, .{});
1719
1720 var expected: [32]u8 = undefined;
1721 _ = try std.fmt.hexToBytes(&expected, "BB66FE72EAEA5179418D5295EE1344854D8AD7F3FA17EFCB467EC152341284CF");
1722 try std.testing.expectEqualSlices(u8, &expected, &output);
1723}
1724
1725test "KT128: pattern message (16384 bytes), empty customization, 32 bytes" {
1726 const allocator = std.testing.allocator;
1727 const message = try generatePattern(allocator, 16384);
1728 defer allocator.free(message);
1729
1730 var output: [32]u8 = undefined;
1731 try KT128.hash(message, &output, .{});
1732
1733 var expected: [32]u8 = undefined;
1734 _ = try std.fmt.hexToBytes(&expected, "82778F7F7234C83352E76837B721FBDBB5270B88010D84FA5AB0B61EC8CE0956");
1735 try std.testing.expectEqualSlices(u8, &expected, &output);
1736}
1737
1738test "KT128: pattern message (16385 bytes), empty customization, 32 bytes" {
1739 const allocator = std.testing.allocator;
1740 const message = try generatePattern(allocator, 16385);
1741 defer allocator.free(message);
1742
1743 var output: [32]u8 = undefined;
1744 try KT128.hash(message, &output, .{});
1745
1746 var expected: [32]u8 = undefined;
1747 _ = try std.fmt.hexToBytes(&expected, "5F8D2B943922B451842B4E82740D02369E2D5F9F33C5123509A53B955FE177B2");
1748 try std.testing.expectEqualSlices(u8, &expected, &output);
1749}
1750
1751test "KT256: pattern message (8193 bytes), empty customization, 64 bytes" {
1752 const allocator = std.testing.allocator;
1753 const message = try generatePattern(allocator, 8193);
1754 defer allocator.free(message);
1755
1756 var output: [64]u8 = undefined;
1757 try KT256.hash(message, &output, .{});
1758
1759 var expected: [64]u8 = undefined;
1760 _ = try std.fmt.hexToBytes(&expected, "65FF03335900E5197ACBD5F41B797F0E7E36AD4FF7D89C09FA6F28AE58D1E8BC2DF1779B86F988C3B13690172914EA172423B23EF4057255BB0836AB3A99836E");
1761 try std.testing.expectEqualSlices(u8, &expected, &output);
1762}
1763
1764test "KT256: pattern message (16384 bytes), empty customization, 64 bytes" {
1765 const allocator = std.testing.allocator;
1766 const message = try generatePattern(allocator, 16384);
1767 defer allocator.free(message);
1768
1769 var output: [64]u8 = undefined;
1770 try KT256.hash(message, &output, .{});
1771
1772 var expected: [64]u8 = undefined;
1773 _ = try std.fmt.hexToBytes(&expected, "74604239A14847CB79069B4FF0E51070A93034C9AC4DFF4D45E0F2C5DA81D930DE6055C2134B4DF4E49F27D1B2C66E95491858B182A924BD0504DA5976BC516D");
1774 try std.testing.expectEqualSlices(u8, &expected, &output);
1775}
1776
1777test "KT256: pattern message (16385 bytes), empty customization, 64 bytes" {
1778 const allocator = std.testing.allocator;
1779 const message = try generatePattern(allocator, 16385);
1780 defer allocator.free(message);
1781
1782 var output: [64]u8 = undefined;
1783 try KT256.hash(message, &output, .{});
1784
1785 var expected: [64]u8 = undefined;
1786 _ = try std.fmt.hexToBytes(&expected, "C814F23132DADBFD55379F18CB988CB39B751F119322823FD982644A897485397B9F40EB11C6E416359B8AE695A5CE0FA79D1ADA1EEC745D82E0A5AB08A9F014");
1787 try std.testing.expectEqualSlices(u8, &expected, &output);
1788}
1789
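// Consistency tests: the incremental (init/update/final) API must produce
// the same digest as the one-shot hash() for every way of splitting the
// input.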
1790test "KT128 incremental: empty message matches one-shot" {
1791 var output_oneshot: [32]u8 = undefined;
1792 var output_incremental: [32]u8 = undefined;
1793
1794 try KT128.hash(&[_]u8{}, &output_oneshot, .{});
1795
1796 var hasher = KT128.init(.{});
1797 hasher.final(&output_incremental);
1798
1799 try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1800}
1801
1802test "KT128 incremental: small message matches one-shot" {
1803 const message = "Hello, KangarooTwelve!";
1804
1805 var output_oneshot: [32]u8 = undefined;
1806 var output_incremental: [32]u8 = undefined;
1807
1808 try KT128.hash(message, &output_oneshot, .{});
1809
1810 var hasher = KT128.init(.{});
1811 hasher.update(message);
1812 hasher.final(&output_incremental);
1813
1814 try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1815}
1816
1817test "KT128 incremental: multiple updates match single update" {
1818 const part1 = "Hello, ";
1819 const part2 = "Kangaroo";
1820 const part3 = "Twelve!";
1821
1822 var output_single: [32]u8 = undefined;
1823 var output_multi: [32]u8 = undefined;
1824
1825 // Single update
1826 var hasher1 = KT128.init(.{});
1827 hasher1.update(part1 ++ part2 ++ part3);
1828 hasher1.final(&output_single);
1829
1830 // Multiple updates
1831 var hasher2 = KT128.init(.{});
1832 hasher2.update(part1);
1833 hasher2.update(part2);
1834 hasher2.update(part3);
1835 hasher2.final(&output_multi);
1836
1837 try std.testing.expectEqualSlices(u8, &output_single, &output_multi);
1838}
1839
1840test "KT128 incremental: exactly chunk_size matches one-shot" {
1841 const allocator = std.testing.allocator;
1842 const message = try allocator.alloc(u8, 8192);
1843 defer allocator.free(message);
1844 @memset(message, 0xAB);
1845
1846 var output_oneshot: [32]u8 = undefined;
1847 var output_incremental: [32]u8 = undefined;
1848
1849 try KT128.hash(message, &output_oneshot, .{});
1850
1851 var hasher = KT128.init(.{});
1852 hasher.update(message);
1853 hasher.final(&output_incremental);
1854
1855 try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1856}
1857
1858test "KT128 incremental: larger than chunk_size matches one-shot" {
1859 const allocator = std.testing.allocator;
1860 const message = try generatePattern(allocator, 16384);
1861 defer allocator.free(message);
1862
1863 var output_oneshot: [32]u8 = undefined;
1864 var output_incremental: [32]u8 = undefined;
1865
1866 try KT128.hash(message, &output_oneshot, .{});
1867
1868 var hasher = KT128.init(.{});
1869 hasher.update(message);
1870 hasher.final(&output_incremental);
1871
1872 try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1873}
1874
1875test "KT128 incremental: with customization matches one-shot" {
1876 const message = "Test message";
1877 const customization = "my custom domain";
1878
1879 var output_oneshot: [32]u8 = undefined;
1880 var output_incremental: [32]u8 = undefined;
1881
1882 try KT128.hash(message, &output_oneshot, .{ .customization = customization });
1883
1884 var hasher = KT128.init(.{ .customization = customization });
1885 hasher.update(message);
1886 hasher.final(&output_incremental);
1887
1888 try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1889}
1890
1891test "KT128 incremental: large message with customization" {
1892 const allocator = std.testing.allocator;
1893 const message = try generatePattern(allocator, 20000);
1894 defer allocator.free(message);
1895 const customization = "test domain";
1896
1897 var output_oneshot: [48]u8 = undefined;
1898 var output_incremental: [48]u8 = undefined;
1899
1900 try KT128.hash(message, &output_oneshot, .{ .customization = customization });
1901
1902 var hasher = KT128.init(.{ .customization = customization });
1903 hasher.update(message);
1904 hasher.final(&output_incremental);
1905
1906 try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1907}
1908
1909test "KT128 incremental: streaming chunks matches one-shot" {
1910 const allocator = std.testing.allocator;
1911 const message = try generatePattern(allocator, 25000);
1912 defer allocator.free(message);
1913
1914 var output_oneshot: [32]u8 = undefined;
1915 var output_incremental: [32]u8 = undefined;
1916
1917 try KT128.hash(message, &output_oneshot, .{});
1918
1919 var hasher = KT128.init(.{});
1920
1921 // Feed in 1KB chunks
1922 var offset: usize = 0;
1923 while (offset < message.len) {
1924 const chunk_size_local = @min(1024, message.len - offset);
1925 hasher.update(message[offset..][0..chunk_size_local]);
1926 offset += chunk_size_local;
1927 }
1928 hasher.final(&output_incremental);
1929
1930 try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1931}
1932
1933test "KT256 incremental: empty message matches one-shot" {
1934 var output_oneshot: [64]u8 = undefined;
1935 var output_incremental: [64]u8 = undefined;
1936
1937 try KT256.hash(&[_]u8{}, &output_oneshot, .{});
1938
1939 var hasher = KT256.init(.{});
1940 hasher.final(&output_incremental);
1941
1942 try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1943}
1944
1945test "KT256 incremental: small message matches one-shot" {
1946 const message = "Hello, KangarooTwelve with 256-bit security!";
1947
1948 var output_oneshot: [64]u8 = undefined;
1949 var output_incremental: [64]u8 = undefined;
1950
1951 try KT256.hash(message, &output_oneshot, .{});
1952
1953 var hasher = KT256.init(.{});
1954 hasher.update(message);
1955 hasher.final(&output_incremental);
1956
1957 try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1958}
1959
1960test "KT256 incremental: large message matches one-shot" {
1961 const allocator = std.testing.allocator;
1962 const message = try generatePattern(allocator, 30000);
1963 defer allocator.free(message);
1964
1965 var output_oneshot: [64]u8 = undefined;
1966 var output_incremental: [64]u8 = undefined;
1967
1968 try KT256.hash(message, &output_oneshot, .{});
1969
1970 var hasher = KT256.init(.{});
1971 hasher.update(message);
1972 hasher.final(&output_incremental);
1973
1974 try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1975}
1976
1977test "KT256 incremental: with customization matches one-shot" {
1978 const allocator = std.testing.allocator;
1979 const message = try generatePattern(allocator, 15000);
1980 defer allocator.free(message);
1981 const customization = "KT256 custom domain";
1982
1983 var output_oneshot: [80]u8 = undefined;
1984 var output_incremental: [80]u8 = undefined;
1985
1986 try KT256.hash(message, &output_oneshot, .{ .customization = customization });
1987
1988 var hasher = KT256.init(.{ .customization = customization });
1989 hasher.update(message);
1990 hasher.final(&output_incremental);
1991
1992 try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
1993}
1994
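// Randomized differential tests: message contents and split points are drawn
// from a PRNG seeded with std.testing.random_seed, so a failure is
// reproducible by re-running with the same seed.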
1995test "KT128 incremental: random small message with random chunk sizes" {
1996 const allocator = std.testing.allocator;
1997
1998 var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
1999 const random = prng.random();
2000
2001 const test_sizes = [_]usize{ 100, 500, 2000, 5000, 10000 };
2002
2003 for (test_sizes) |total_size| {
2004 const message = try allocator.alloc(u8, total_size);
2005 defer allocator.free(message);
2006 random.bytes(message);
2007
2008 var output_oneshot: [32]u8 = undefined;
2009 var output_incremental: [32]u8 = undefined;
2010
2011 try KT128.hash(message, &output_oneshot, .{});
2012
2013 var hasher = KT128.init(.{});
2014 var offset: usize = 0;
2015
2016 while (offset < message.len) {
2017 const remaining = message.len - offset;
2018 const max_chunk = @min(1000, remaining);
2019 const chunk_size_local = if (max_chunk == 1) 1 else random.intRangeAtMost(usize, 1, max_chunk);
2020
2021 hasher.update(message[offset..][0..chunk_size_local]);
2022 offset += chunk_size_local;
2023 }
2024 hasher.final(&output_incremental);
2025
2026 try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
2027 }
2028}
2029
2030test "KT128 incremental: random large message (1MB) with random chunk sizes" {
2031 const allocator = std.testing.allocator;
2032
2033 var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
2034 const random = prng.random();
2035
2036 const total_size: usize = 1024 * 1024; // 1 MB
2037 const message = try allocator.alloc(u8, total_size);
2038 defer allocator.free(message);
2039 random.bytes(message);
2040
2041 var output_oneshot: [32]u8 = undefined;
2042 var output_incremental: [32]u8 = undefined;
2043
2044 try KT128.hash(message, &output_oneshot, .{});
2045
2046 var hasher = KT128.init(.{});
2047 var offset: usize = 0;
2048
2049 while (offset < message.len) {
2050 const remaining = message.len - offset;
2051 const max_chunk = @min(10000, remaining);
2052 const chunk_size_local = if (max_chunk == 1) 1 else random.intRangeAtMost(usize, 1, max_chunk);
2053
2054 hasher.update(message[offset..][0..chunk_size_local]);
2055 offset += chunk_size_local;
2056 }
2057 hasher.final(&output_incremental);
2058
2059 try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
2060}
2061
2062test "KT256 incremental: random small message with random chunk sizes" {
2063 const allocator = std.testing.allocator;
2064
2065 var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
2066 const random = prng.random();
2067
2068 const test_sizes = [_]usize{ 100, 500, 2000, 5000, 10000 };
2069
2070 for (test_sizes) |total_size| {
2071 // Generate random message
2072 const message = try allocator.alloc(u8, total_size);
2073 defer allocator.free(message);
2074 random.bytes(message);
2075
2076 var output_oneshot: [64]u8 = undefined;
2077 var output_incremental: [64]u8 = undefined;
2078
2079 try KT256.hash(message, &output_oneshot, .{});
2080
2081 var hasher = KT256.init(.{});
2082 var offset: usize = 0;
2083
2084 while (offset < message.len) {
2085 const remaining = message.len - offset;
2086 const max_chunk = @min(1000, remaining);
2087 const chunk_size_local = if (max_chunk == 1) 1 else random.intRangeAtMost(usize, 1, max_chunk);
2088
2089 hasher.update(message[offset..][0..chunk_size_local]);
2090 offset += chunk_size_local;
2091 }
2092 hasher.final(&output_incremental);
2093
2094 try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
2095 }
2096}
2097
2098test "KT256 incremental: random large message (1MB) with random chunk sizes" {
2099 const allocator = std.testing.allocator;
2100
2101 var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
2102 const random = prng.random();
2103
2104 const total_size: usize = 1024 * 1024; // 1 MB
2105 const message = try allocator.alloc(u8, total_size);
2106 defer allocator.free(message);
2107 random.bytes(message);
2108
2109 var output_oneshot: [64]u8 = undefined;
2110 var output_incremental: [64]u8 = undefined;
2111
2112 try KT256.hash(message, &output_oneshot, .{});
2113
2114 var hasher = KT256.init(.{});
2115 var offset: usize = 0;
2116
2117 while (offset < message.len) {
2118 const remaining = message.len - offset;
2119 const max_chunk = @min(10000, remaining);
2120 const chunk_size_local = if (max_chunk == 1) 1 else random.intRangeAtMost(usize, 1, max_chunk);
2121
2122 hasher.update(message[offset..][0..chunk_size_local]);
2123 offset += chunk_size_local;
2124 }
2125 hasher.final(&output_incremental);
2126
2127 try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
2128}
2129
2130test "KT128 incremental: random message with customization and random chunks" {
2131 const allocator = std.testing.allocator;
2132
2133 var prng = std.Random.DefaultPrng.init(std.testing.random_seed);
2134 const random = prng.random();
2135
2136 const total_size: usize = 50000;
2137 const message = try allocator.alloc(u8, total_size);
2138 defer allocator.free(message);
2139 random.bytes(message);
2140
2141 const customization = "random test domain";
2142
2143 var output_oneshot: [48]u8 = undefined;
2144 var output_incremental: [48]u8 = undefined;
2145
2146 try KT128.hash(message, &output_oneshot, .{ .customization = customization });
2147
2148 var hasher = KT128.init(.{ .customization = customization });
2149 var offset: usize = 0;
2150
2151 while (offset < message.len) {
2152 const remaining = message.len - offset;
2153 const max_chunk = @min(5000, remaining);
2154 const chunk_size_local = if (max_chunk == 1) 1 else random.intRangeAtMost(usize, 1, max_chunk);
2155
2156 hasher.update(message[offset..][0..chunk_size_local]);
2157 offset += chunk_size_local;
2158 }
2159 hasher.final(&output_incremental);
2160
2161 try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental);
2162}
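
// KangarooTwelve is an extendable-output function, so a shorter output should
// be a prefix of a longer one; the empty-message KT128 vectors above already
// show this for 32 vs. 64 bytes. A minimal sketch of that property, using
// only the hash() API exercised above:
test "KT128: shorter output is a prefix of longer output (XOF property)" {
    var out32: [32]u8 = undefined;
    var out64: [64]u8 = undefined;

    // Same message, two output lengths.
    try KT128.hash("prefix property", &out32, .{});
    try KT128.hash("prefix property", &out64, .{});

    // The 32-byte digest must equal the first 32 bytes of the 64-byte digest.
    try std.testing.expectEqualSlices(u8, &out32, out64[0..32]);
}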