master
  1//! SIMD (Single Instruction; Multiple Data) convenience functions.
  2//!
  3//! May offer a potential boost in performance on some targets by performing
  4//! the same operation on multiple elements at once.
  5//!
  6//! Some functions are known to not work on MIPS.
  7
  8const std = @import("std");
  9const builtin = @import("builtin");
 10
 11pub fn suggestVectorLengthForCpu(comptime T: type, comptime cpu: std.Target.Cpu) ?comptime_int {
 12    @setEvalBranchQuota(2_000);
 13
 14    // This is guesswork, if you have better suggestions can add it or edit the current here
 15    const element_bit_size = @max(8, std.math.ceilPowerOfTwo(u16, @bitSizeOf(T)) catch unreachable);
 16    const vector_bit_size: u16 = blk: {
 17        if (cpu.arch.isX86()) {
 18            if (T == bool and cpu.has(.x86, .prefer_mask_registers)) return 64;
 19            if (builtin.zig_backend != .stage2_x86_64 and cpu.has(.x86, .avx512f) and !cpu.hasAny(.x86, &.{ .prefer_256_bit, .prefer_128_bit })) break :blk 512;
 20            if (cpu.hasAny(.x86, &.{ .prefer_256_bit, .avx2 }) and !cpu.has(.x86, .prefer_128_bit)) break :blk 256;
 21            if (cpu.has(.x86, .sse)) break :blk 128;
 22            if (cpu.hasAny(.x86, &.{ .mmx, .@"3dnow" })) break :blk 64;
 23        } else if (cpu.arch.isArm()) {
 24            if (cpu.has(.arm, .neon)) break :blk 128;
 25        } else if (cpu.arch.isAARCH64()) {
 26            // NVIDIA Grace supports 128-bit SVE
 27            // AWS Graviton3 supports 256-bit SVE
 28            // Fujitsu A64FX supports 512-bit SVE
 29            // -> 256-bit seems like a good default for now.
 30            if (cpu.has(.aarch64, .sve)) break :blk 256;
 31            if (cpu.has(.aarch64, .neon)) break :blk 128;
 32        } else if (cpu.arch == .hexagon) {
 33            if (cpu.has(.hexagon, .hvx_length64b)) break :blk 512;
 34            if (cpu.has(.hexagon, .hvx)) break :blk 1024;
 35        } else if (cpu.arch.isLoongArch()) {
 36            if (cpu.has(.loongarch, .lasx)) break :blk 256;
 37            if (cpu.has(.loongarch, .lsx)) break :blk 128;
 38        } else if (cpu.arch.isMIPS()) {
 39            if (cpu.has(.mips, .msa)) break :blk 128;
 40            if (cpu.has(.mips, .mips3d)) break :blk 64;
 41        } else if (cpu.arch.isPowerPC()) {
 42            if (cpu.has(.powerpc, .vsx)) break :blk 128;
 43            if (cpu.has(.powerpc, .altivec)) break :blk 128;
 44        } else if (cpu.arch.isRISCV()) {
 45            // In RISC-V Vector Registers are length agnostic so there's no good way to determine the best size.
 46            // The usual vector length in most RISC-V cpus is 256 bits, however it can get to multiple kB.
 47            if (cpu.has(.riscv, .v)) {
 48                inline for (.{
 49                    .{ .zvl65536b, 65536 },
 50                    .{ .zvl32768b, 32768 },
 51                    .{ .zvl16384b, 16384 },
 52                    .{ .zvl8192b, 8192 },
 53                    .{ .zvl4096b, 4096 },
 54                    .{ .zvl2048b, 2048 },
 55                    .{ .zvl1024b, 1024 },
 56                    .{ .zvl512b, 512 },
 57                    .{ .zvl256b, 256 },
 58                    .{ .zvl128b, 128 },
 59                    .{ .zvl64b, 64 },
 60                    .{ .zvl32b, 32 },
 61                }) |mapping| {
 62                    if (cpu.has(.riscv, mapping[0])) break :blk mapping[1];
 63                }
 64
 65                break :blk 256;
 66            }
 67        } else if (cpu.arch == .s390x) {
 68            if (cpu.has(.s390x, .vector)) break :blk 128;
 69        } else if (cpu.arch.isSPARC()) {
 70            if (cpu.hasAny(.sparc, &.{ .vis, .vis2, .vis3 })) break :blk 64;
 71        } else if (cpu.arch == .kvx) {
 72            break :blk 1024;
 73        } else if (cpu.arch == .ve) {
 74            if (cpu.has(.ve, .vpu)) break :blk 2048;
 75        } else if (cpu.arch.isWasm()) {
 76            if (cpu.has(.wasm, .simd128)) break :blk 128;
 77        }
 78        return null;
 79    };
 80    if (vector_bit_size <= element_bit_size) return null;
 81
 82    return @divExact(vector_bit_size, element_bit_size);
 83}
 84
 85/// Suggests a target-dependant vector length for a given type, or null if scalars are recommended.
 86/// Not yet implemented for every CPU architecture.
 87pub fn suggestVectorLength(comptime T: type) ?comptime_int {
 88    return suggestVectorLengthForCpu(T, builtin.cpu);
 89}
 90
 91test "suggestVectorLengthForCpu works with signed and unsigned values" {
 92    comptime var cpu = std.Target.Cpu.baseline(std.Target.Cpu.Arch.x86_64, builtin.os);
 93    comptime cpu.features.addFeature(@intFromEnum(std.Target.x86.Feature.avx512f));
 94    comptime cpu.features.populateDependencies(&std.Target.x86.all_features);
 95    const expected_len: usize = switch (builtin.zig_backend) {
 96        .stage2_x86_64 => 8,
 97        else => 16,
 98    };
 99    const signed_integer_len = suggestVectorLengthForCpu(i32, cpu).?;
100    const unsigned_integer_len = suggestVectorLengthForCpu(u32, cpu).?;
101    try std.testing.expectEqual(expected_len, unsigned_integer_len);
102    try std.testing.expectEqual(expected_len, signed_integer_len);
103}
104
/// Returns the number of elements of `VectorType`, which must be a vector or an array type.
/// Any other type is rejected with a compile error.
fn vectorLength(comptime VectorType: type) comptime_int {
    switch (@typeInfo(VectorType)) {
        inline .vector, .array => |info| return info.len,
        else => @compileError("Invalid type " ++ @typeName(VectorType)),
    }
}
112
/// Returns the smallest unsigned integer type that can hold every valid element index
/// (0 through length - 1) of the given vector type.
pub fn VectorIndex(comptime VectorType: type) type {
    const last_index = vectorLength(VectorType) - 1;
    return std.math.IntFittingRange(0, last_index);
}
117
/// Returns the smallest unsigned integer type that can hold any element count
/// (0 through length, inclusive) of the given vector type.
pub fn VectorCount(comptime VectorType: type) type {
    const max_count = vectorLength(VectorType);
    return std.math.IntFittingRange(0, max_count);
}
122
/// Returns a vector of `len` elements counting upwards from 0, i.e. element `i` holds the value `i`.
/// For example, `iota(i32, 8)` will return a vector containing `.{0, 1, 2, 3, 4, 5, 6, 7}`.
/// `T` must be an integer or float type; the whole vector is computed at compile time.
pub inline fn iota(comptime T: type, comptime len: usize) @Vector(len, T) {
    comptime {
        var sequence: [len]T = undefined;
        for (0..len) |i| {
            sequence[i] = switch (@typeInfo(T)) {
                .float => @as(T, @floatFromInt(i)),
                .int => @as(T, @intCast(i)),
                else => @compileError("Can't use type " ++ @typeName(T) ++ " in iota."),
            };
        }
        const as_vector: @Vector(len, T) = sequence;
        return as_vector;
    }
}
138
/// Returns a vector of `len` elements made by cycling through the elements of `vec` from the start
/// for as long as needed.
/// For example, `repeat(8, [_]u32{1, 2, 3})` will return a vector containing `.{1, 2, 3, 1, 2, 3, 1, 2}`.
pub fn repeat(comptime len: usize, vec: anytype) @Vector(len, std.meta.Child(@TypeOf(vec))) {
    const Source = @TypeOf(vec);

    // Output element i is taken from input element (i mod input length).
    const mask = comptime blk: {
        const period: @Vector(len, i32) = @splat(@intCast(vectorLength(Source)));
        break :blk iota(i32, len) % period;
    };

    return @shuffle(std.meta.Child(Source), vec, undefined, mask);
}
146
/// Concatenates two vectors: the result holds every element of `a` at the lower indices,
/// followed by every element of `b` at the higher indices.
pub fn join(a: anytype, b: anytype) @Vector(vectorLength(@TypeOf(a)) + vectorLength(@TypeOf(b)), std.meta.Child(@TypeOf(a))) {
    const a_len = vectorLength(@TypeOf(a));
    const b_len = vectorLength(@TypeOf(b));
    const Child = std.meta.Child(@TypeOf(a));

    const mask = comptime blk: {
        const from_a: [a_len]i32 = iota(i32, a_len);
        // @shuffle selects from its second operand with bitwise-negated indices.
        const from_b: [b_len]i32 = ~iota(i32, b_len);
        break :blk from_a ++ from_b;
    };

    return @shuffle(Child, a, b, mask);
}
156
/// Returns a vector whose elements alternate between those of each input vector.
/// For example, `interlace(.{[4]u32{11, 12, 13, 14}, [4]u32{21, 22, 23, 24}})` returns a vector containing `.{11, 21, 12, 22, 13, 23, 14, 24}`.
/// Not available on MIPS, where calling it is a compile error.
pub fn interlace(vecs: anytype) @Vector(vectorLength(@TypeOf(vecs[0])) * vecs.len, std.meta.Child(@TypeOf(vecs[0]))) {
    // interlace doesn't work on MIPS, for some reason.
    // Notes from earlier debug attempt:
    //  The indices are correct. The problem seems to be with the @shuffle builtin.
    //  On MIPS, the test that interlaces small_base gives { 0, 2, 0, 0, 64, 255, 248, 200, 0, 0 }.
    //  Calling this with two inputs seems to work fine, but I'll let the compile error trigger for all inputs, just to be safe.
    if (builtin.cpu.arch.isMIPS()) @compileError("TODO: Find out why interlace() doesn't work on MIPS");

    const Input = @TypeOf(vecs[0]);
    const inputs: [vecs.len]Input = vecs;
    const Child = std.meta.Child(@TypeOf(inputs[0]));

    // End of the recursion: a single vector is returned unchanged.
    if (inputs.len == 1) return inputs[0];

    // Split the inputs into a front part (rounded up) and a back part (rounded down),
    // interlace each part recursively, then weave the two results together.
    const front_count = (inputs.len + 1) / 2;
    const back_count = inputs.len / 2;

    const front = interlace(@as(*const [front_count]Input, @ptrCast(inputs[0..front_count])).*);
    const back = interlace(@as(*const [back_count]Input, @ptrCast(inputs[front_count..])).*);

    const total_len = vectorLength(@TypeOf(front)) + vectorLength(@TypeOf(back));

    const mask = comptime blk: {
        const Indices = @Vector(total_len, i32);
        const position = iota(i32, total_len);
        // Number of complete groups of `inputs.len` output elements that precede each position.
        const group = @divFloor(position, @as(Indices, @splat(@intCast(inputs.len))));

        const front_indices = position - group * @as(Indices, @splat(@intCast(back_count)));
        const back_indices = shiftElementsRight(position - group * @as(Indices, @splat(@intCast(front_count))), front_count, 0);

        // Within each group, the first `front_count` positions read from `front`, the rest from `back`.
        const take_front = repeat(total_len, join(@as(@Vector(front_count, bool), @splat(true)), @as(@Vector(back_count, bool), @splat(false))));

        // Bitwise-negated indices make @shuffle select from its second operand.
        break :blk @select(i32, take_front, front_indices, ~back_indices);
    };

    return @shuffle(Child, front, back, mask);
}
195
/// Splits `interlaced` evenly into `vec_count` vectors, returned as an array. The outputs "take turns":
/// output `i` receives elements `i`, `i + vec_count`, `i + 2 * vec_count`, ... of `interlaced`.
pub fn deinterlace(
    comptime vec_count: usize,
    interlaced: anytype,
) [vec_count]@Vector(
    vectorLength(@TypeOf(interlaced)) / vec_count,
    std.meta.Child(@TypeOf(interlaced)),
) {
    const Child = std.meta.Child(@TypeOf(interlaced));
    const part_len = vectorLength(@TypeOf(interlaced)) / vec_count;
    const Mask = @Vector(part_len, i32);

    var parts: [vec_count]@Vector(part_len, Child) = undefined;

    inline for (0..vec_count) |which| {
        // Every `vec_count`-th element, starting at offset `which`.
        const mask = comptime blk: {
            const stride: Mask = @splat(@intCast(vec_count));
            const offset: Mask = @splat(@intCast(which));
            break :blk iota(i32, part_len) * stride + offset;
        };
        parts[which] = @shuffle(Child, interlaced, undefined, mask);
    }

    return parts;
}
218
/// Returns a vector holding the `count` consecutive elements of `vec` that start at index `first`.
/// `first + count` must not exceed the length of `vec`; since both are comptime-known,
/// a violation is reported as a compile error.
pub fn extract(
    vec: anytype,
    comptime first: VectorIndex(@TypeOf(vec)),
    comptime count: VectorCount(@TypeOf(vec)),
) @Vector(count, std.meta.Child(@TypeOf(vec))) {
    const Child = std.meta.Child(@TypeOf(vec));
    const len = vectorLength(@TypeOf(vec));

    // Widen to comptime_int before adding: `first` and `count` use the narrowest integer
    // types that fit, so their sum could overflow those types.
    const end = @as(comptime_int, @intCast(first)) + @as(comptime_int, @intCast(count));
    if (end > len) {
        @compileError("extract: first + count exceeds the length of " ++ @typeName(@TypeOf(vec)));
    }

    const mask = comptime iota(i32, count) + @as(@Vector(count, i32), @splat(@intCast(first)));
    return @shuffle(Child, vec, undefined, mask);
}
231
test "vector patterns" {
    if (builtin.cpu.arch == .hexagon) return error.SkipZigTest;

    const expectEqual = std.testing.expectEqual;

    const base: @Vector(4, u32) = .{ 10, 20, 30, 40 };
    const other_base: @Vector(4, u32) = .{ 55, 66, 77, 88 };

    const small_bases: [5]@Vector(2, u8) = .{
        .{ 0, 1 },
        .{ 2, 3 },
        .{ 4, 5 },
        .{ 6, 7 },
        .{ 8, 9 },
    };

    try expectEqual([6]u32{ 10, 20, 30, 40, 10, 20 }, repeat(6, base));
    try expectEqual([8]u32{ 10, 20, 30, 40, 55, 66, 77, 88 }, join(base, other_base));
    try expectEqual([2]u32{ 20, 30 }, extract(base, 1, 2));

    // interlace() is a compile error on MIPS, so everything that goes through it is left out there.
    if (!builtin.cpu.arch.isMIPS()) {
        const woven = interlace(.{ base, other_base });
        try expectEqual([8]u32{ 10, 55, 20, 66, 30, 77, 40, 88 }, woven);

        const small_braid = interlace(small_bases);
        try expectEqual([10]u8{ 0, 2, 4, 6, 8, 1, 3, 5, 7, 9 }, small_braid);
        try expectEqual(small_bases, deinterlace(small_bases.len, small_braid));
    }
}
258
/// Joins `a` and `b`, then returns the window of the joined vector that starts at index `shift`
/// and has the same length as `a` and `b`. In other words, the joined elements are shifted
/// leftwards (towards lower indices) by `shift` and the leftmost elements are kept.
pub fn mergeShift(a: anytype, b: anytype, comptime shift: VectorCount(@TypeOf(a, b))) @TypeOf(a, b) {
    const window_len = vectorLength(@TypeOf(a, b));
    const joined = join(a, b);

    return extract(joined, shift, window_len);
}
265
/// Shifts the elements of `vec` rightwards (towards higher indices) by `amount` positions.
/// The vacated positions on the left are filled with `shift_in`, and the rightmost elements are
/// cut off so that the length of the vector stays the same.
pub fn shiftElementsRight(vec: anytype, comptime amount: VectorCount(@TypeOf(vec)), shift_in: std.meta.Child(@TypeOf(vec))) @TypeOf(vec) {
    // It may be possible to implement shifts and rotates with a runtime-friendly slice of two joined vectors, as the length of the
    // slice would be comptime-known. This would permit vector shifts and rotates by a non-comptime-known amount.
    // However, I am unsure whether compiler optimizations would handle that well enough on all platforms.
    const V = @TypeOf(vec);
    const filler: V = @splat(shift_in);
    const window_start = vectorLength(V) - amount;

    // Place the filler in front of `vec` and take the window that ends `amount` elements before the end.
    return mergeShift(filler, vec, window_start);
}
277
/// Shifts the elements of `vec` leftwards (towards lower indices) by `amount` positions.
/// The vacated positions on the right are filled with `shift_in`, and the leftmost elements are
/// cut off so that no elements with indices below 0 remain.
pub fn shiftElementsLeft(vec: anytype, comptime amount: VectorCount(@TypeOf(vec)), shift_in: std.meta.Child(@TypeOf(vec))) @TypeOf(vec) {
    const filler: @TypeOf(vec) = @splat(shift_in);

    // Place the filler behind `vec` and take the window that starts `amount` elements in.
    return mergeShift(vec, filler, amount);
}
285
/// Rotates the elements of `vec` leftwards (towards lower indices) by `amount` positions.
/// Elements that leave on the left reappear on the right in the same order.
pub fn rotateElementsLeft(vec: anytype, comptime amount: VectorCount(@TypeOf(vec))) @TypeOf(vec) {
    // Joining the vector with itself makes the wrapped-around elements follow directly after the last one.
    const head = vec;
    const tail = vec;
    return mergeShift(head, tail, amount);
}
290
/// Rotates the elements of `vec` rightwards (towards higher indices) by `amount` positions.
/// Elements that leave on the right reappear on the left in the same order.
pub fn rotateElementsRight(vec: anytype, comptime amount: VectorCount(@TypeOf(vec))) @TypeOf(vec) {
    // Rotating right by `amount` is expressed as rotating left by (length - amount).
    const left_amount = vectorLength(@TypeOf(vec)) - amount;
    return rotateElementsLeft(vec, left_amount);
}
295
/// Returns a vector holding the elements of `vec` in reverse order: the last element comes first.
pub fn reverseOrder(vec: anytype) @TypeOf(vec) {
    const V = @TypeOf(vec);
    const len = vectorLength(V);

    // Output element i is taken from input element (len - 1 - i).
    const mask = comptime blk: {
        const last_index: @Vector(len, i32) = @splat(@as(i32, @intCast(len)) - 1);
        break :blk last_index - iota(i32, len);
    };

    return @shuffle(std.meta.Child(V), vec, undefined, mask);
}
302
test "vector shifting" {
    const expectEqual = std.testing.expectEqual;

    const base: @Vector(4, u32) = .{ 10, 20, 30, 40 };

    // Shifts fill the vacated positions with the given value.
    try expectEqual([4]u32{ 30, 40, 999, 999 }, shiftElementsLeft(base, 2, 999));
    try expectEqual([4]u32{ 999, 999, 10, 20 }, shiftElementsRight(base, 2, 999));

    // Rotations wrap elements around to the other end.
    try expectEqual([4]u32{ 20, 30, 40, 10 }, rotateElementsLeft(base, 1));
    try expectEqual([4]u32{ 40, 10, 20, 30 }, rotateElementsRight(base, 1));

    try expectEqual([4]u32{ 40, 30, 20, 10 }, reverseOrder(base));
}
312
/// Returns the lowest index at which `vec` (a vector of bools) is true, or null if no element is true.
pub fn firstTrue(vec: anytype) ?VectorIndex(@TypeOf(vec)) {
    const V = @TypeOf(vec);
    const Index = VectorIndex(V);
    const lane_count = vectorLength(V);

    const any_set = @reduce(.Or, vec);
    if (any_set) {
        // False lanes get the largest index value so they can never win the minimum,
        // unless that index is itself the only true lane.
        const losers: @Vector(lane_count, Index) = @splat(~@as(Index, 0));
        const candidates = @select(Index, vec, iota(Index, lane_count), losers);
        return @reduce(.Min, candidates);
    }
    return null;
}
324
/// Returns the highest index at which `vec` (a vector of bools) is true, or null if no element is true.
pub fn lastTrue(vec: anytype) ?VectorIndex(@TypeOf(vec)) {
    const V = @TypeOf(vec);
    const Index = VectorIndex(V);
    const lane_count = vectorLength(V);

    const any_set = @reduce(.Or, vec);
    if (any_set) {
        // False lanes get index 0 so they can never win the maximum,
        // unless index 0 is itself the only true lane.
        const losers: @Vector(lane_count, Index) = @splat(0);
        const candidates = @select(Index, vec, iota(Index, lane_count), losers);
        return @reduce(.Max, candidates);
    }
    return null;
}
337
/// Returns how many elements of `vec` (a vector of bools) are true.
pub fn countTrues(vec: anytype) VectorCount(@TypeOf(vec)) {
    const V = @TypeOf(vec);
    const Count = VectorCount(V);
    const Counts = @Vector(vectorLength(V), Count);

    // Map true -> 1 and false -> 0, then sum the lanes.
    const per_lane = @select(Count, vec, @as(Counts, @splat(1)), @as(Counts, @splat(0)));
    return @reduce(.Add, per_lane);
}
348
/// Returns the lowest index at which `vec` holds `value`, or null if `value` does not occur.
pub fn firstIndexOfValue(vec: anytype, value: std.meta.Child(@TypeOf(vec))) ?VectorIndex(@TypeOf(vec)) {
    const needle: @TypeOf(vec) = @splat(value);
    const matches = vec == needle;

    return firstTrue(matches);
}
354
/// Returns the highest index at which `vec` holds `value`, or null if `value` does not occur.
pub fn lastIndexOfValue(vec: anytype, value: std.meta.Child(@TypeOf(vec))) ?VectorIndex(@TypeOf(vec)) {
    const needle: @TypeOf(vec) = @splat(value);
    const matches = vec == needle;

    return lastTrue(matches);
}
360
/// Returns how many elements of `vec` are equal to `value`.
pub fn countElementsWithValue(vec: anytype, value: std.meta.Child(@TypeOf(vec))) VectorCount(@TypeOf(vec)) {
    const needle: @TypeOf(vec) = @splat(value);
    const matches = vec == needle;

    return countTrues(matches);
}
366
test "vector searching" {
    const expectEqual = std.testing.expectEqual;

    const haystack: @Vector(8, u32) = .{ 6, 4, 7, 4, 4, 2, 3, 7 };

    // The value 4 sits at indices 1, 3 and 4.
    try expectEqual(@as(?u3, 1), firstIndexOfValue(haystack, 4));
    try expectEqual(@as(?u3, 4), lastIndexOfValue(haystack, 4));
    try expectEqual(@as(u4, 3), countElementsWithValue(haystack, 4));

    // A value that does not occur yields null.
    try expectEqual(@as(?u3, null), lastIndexOfValue(haystack, 99));
}
375
/// Same as prefixScan, but the combining step is a user-provided function, which must be
/// mathematically associative. Errors returned by `func` are propagated to the caller.
/// Not available on MIPS, where calling it is a compile error.
pub fn prefixScanWithFunc(
    comptime hop: isize,
    vec: anytype,
    /// The error type that `func` might return. Set this to `void` if `func` doesn't return an error union.
    comptime ErrorType: type,
    comptime func: fn (@TypeOf(vec), @TypeOf(vec)) if (ErrorType == void) @TypeOf(vec) else ErrorType!@TypeOf(vec),
    /// When one operand of the operation performed by `func` is this value, the result must equal the other operand.
    /// For example, this should be 0 for addition or 1 for multiplication.
    comptime identity: std.meta.Child(@TypeOf(vec)),
) if (ErrorType == void) @TypeOf(vec) else ErrorType!@TypeOf(vec) {
    // I haven't debugged this, but it might be a cousin of sorts to what's going on with interlace.
    if (builtin.cpu.arch.isMIPS()) @compileError("TODO: Find out why prefixScan doesn't work on MIPS");

    const lane_count = vectorLength(@TypeOf(vec));

    if (hop == 0) @compileError("hop can not be 0; you'd be going nowhere forever!");
    const backwards = hop < 0;
    const magnitude = if (backwards) -hop else hop;

    // Each round combines every element with the one `distance` positions away, and the distance
    // doubles from round to round until it reaches the vector length.
    var result = vec;
    comptime var round = 0;
    inline while ((magnitude << round) < lane_count) : (round += 1) {
        const distance = magnitude << round;
        // Positions with no partner receive `identity`, which leaves them unchanged by `func`.
        const partners = if (backwards)
            shiftElementsLeft(result, distance, identity)
        else
            shiftElementsRight(result, distance, identity);

        result = if (ErrorType == void) func(result, partners) else try func(result, partners);
    }
    return result;
}
404
/// Returns a vector whose elements are the result of performing the specified operation on the corresponding
/// element of the input vector and every hop'th element that came before it (or after, if hop is negative).
/// Supports the same operations as the @reduce() builtin. Takes O(logN) to compute.
/// The scan is not linear, which may affect floating point errors. This may affect the determinism of
/// algorithms that use this function.
///
/// Booleans support `.And`, `.Or` and `.Xor`; floats support `.Min`, `.Max`, `.Add` and `.Mul`;
/// integers support every operation. Other combinations are compile errors.
pub fn prefixScan(comptime op: std.builtin.ReduceOp, comptime hop: isize, vec: anytype) @TypeOf(vec) {
    const VecType = @TypeOf(vec);
    const Child = std.meta.Child(VecType);

    // The value that is shifted in for positions with no partner element. Combining it with
    // any element through `op` must give back that element unchanged.
    const identity = comptime switch (@typeInfo(Child)) {
        .bool => switch (op) {
            .Or, .Xor => false,
            .And => true,
            else => @compileError("Invalid prefixScan operation " ++ @tagName(op) ++ " for vector of booleans."),
        },
        .int => switch (op) {
            .Max => std.math.minInt(Child),
            .Add, .Or, .Xor => 0,
            .Mul => 1,
            // All bits set. For signed integers this is -1 rather than maxInt: maxInt has a clear
            // sign bit, so using it would wipe the sign bit of negative elements.
            .And => ~@as(Child, 0),
            .Min => std.math.maxInt(Child),
        },
        .float => switch (op) {
            .Max => -std.math.inf(Child),
            .Add => 0,
            .Mul => 1,
            .Min => std.math.inf(Child),
            else => @compileError("Invalid prefixScan operation " ++ @tagName(op) ++ " for vector of floats."),
        },
        else => @compileError("Invalid type " ++ @typeName(VecType) ++ " for prefixScan."),
    };

    // Wraps `op` as an element-wise function over two vectors, as required by prefixScanWithFunc.
    const fn_container = struct {
        fn opFn(a: VecType, b: VecType) VecType {
            return if (Child == bool) switch (op) {
                // Boolean vectors have no bitwise operators, so And/Or are spelled with @select.
                .And => @select(bool, a, b, @as(VecType, @splat(false))),
                .Or => @select(bool, a, @as(VecType, @splat(true)), b),
                .Xor => a != b,
                else => unreachable,
            } else switch (op) {
                .And => a & b,
                .Or => a | b,
                .Xor => a ^ b,
                .Add => a + b,
                .Mul => a * b,
                .Min => @min(a, b),
                .Max => @max(a, b),
            };
        }
    };

    return prefixScanWithFunc(hop, vec, void, fn_container.opFn, identity);
}
457
test "vector prefix scan" {
    if (builtin.cpu.arch == .aarch64_be and builtin.zig_backend == .stage2_llvm) return error.SkipZigTest; // https://github.com/ziglang/zig/issues/21893
    if (builtin.zig_backend == .stage2_llvm and builtin.cpu.arch == .hexagon) return error.SkipZigTest;

    // prefixScan is a compile error on MIPS.
    if (builtin.cpu.arch.isMIPS()) return error.SkipZigTest;

    const expectEqual = std.testing.expectEqual;
    const Ints = @Vector(4, i32);
    const Floats = @Vector(4, f32);
    const Bools = @Vector(4, bool);

    const int_base: Ints = .{ 11, 23, 9, -21 };
    const float_base: Floats = .{ 2, 0.5, -10, 6.54321 };
    const bool_base: Bools = .{ true, false, true, false };

    // A running sum over 32 ones counts 1, 2, 3, ... 32.
    const ones: @Vector(32, u8) = @splat(1);
    try expectEqual(iota(u8, 32) + ones, prefixScan(.Add, 1, ones));

    // Every integer operation, scanning forwards one element at a time.
    try expectEqual(Ints{ 11, 3, 1, 1 }, prefixScan(.And, 1, int_base));
    try expectEqual(Ints{ 11, 31, 31, -1 }, prefixScan(.Or, 1, int_base));
    try expectEqual(Ints{ 11, 28, 21, -2 }, prefixScan(.Xor, 1, int_base));
    try expectEqual(Ints{ 11, 34, 43, 22 }, prefixScan(.Add, 1, int_base));
    try expectEqual(Ints{ 11, 253, 2277, -47817 }, prefixScan(.Mul, 1, int_base));
    try expectEqual(Ints{ 11, 11, 9, -21 }, prefixScan(.Min, 1, int_base));
    try expectEqual(Ints{ 11, 23, 23, 23 }, prefixScan(.Max, 1, int_base));

    // Trying to predict all inaccuracies when adding and multiplying floats with prefixScans would be a mess, so we don't test those.
    try expectEqual(Floats{ 2, 0.5, -10, -10 }, prefixScan(.Min, 1, float_base));
    try expectEqual(Floats{ 2, 2, 2, 6.54321 }, prefixScan(.Max, 1, float_base));

    try expectEqual(Bools{ true, true, false, false }, prefixScan(.Xor, 1, bool_base));
    try expectEqual(Bools{ true, true, true, true }, prefixScan(.Or, 1, bool_base));
    try expectEqual(Bools{ true, false, false, false }, prefixScan(.And, 1, bool_base));

    // Hops other than 1, including a negative (backwards) hop.
    try expectEqual(Ints{ 11, 23, 20, 2 }, prefixScan(.Add, 2, int_base));
    try expectEqual(Ints{ 22, 11, -12, -21 }, prefixScan(.Add, -1, int_base));
    try expectEqual(Ints{ 11, 23, 9, -10 }, prefixScan(.Add, 3, int_base));
}
490}