Commit add2976a9b

mlugg <mlugg@mlugg.co.uk>
2025-05-26 06:07:13
compiler: implement better shuffle AIR
Runtime `@shuffle` has two cases which backends generally want to handle differently for efficiency: * One runtime vector operand; some result elements may be comptime-known * Two runtime vector operands; some result elements may be undefined The latter case happens if both vectors given to `@shuffle` are runtime-known and they are both used (i.e. the mask refers to them). Otherwise, if the result is not entirely comptime-known, we are in the former case. `Sema` now diffentiates these two cases in the AIR so that backends can easily handle them however they want to. Note that this *doesn't* really involve Sema doing any more work than it would otherwise need to, so there's not really a negative here! Most existing backends have their lowerings for `@shuffle` migrated in this commit. The LLVM backend uses new lowerings suggested by Jacob as ones which it will handle effectively. The x86_64 backend has not yet been migrated; for now there's a panic in there. Jacob will implement that before this is merged anywhere.
1 parent b48d6ff
src/Air/Liveness/Verify.zig
@@ -1,6 +1,7 @@
 //! Verifies that Liveness information is valid.
 
 gpa: std.mem.Allocator,
+zcu: *Zcu,
 air: Air,
 liveness: Liveness,
 live: LiveMap = .{},
@@ -287,10 +288,13 @@ fn verifyBody(self: *Verify, body: []const Air.Inst.Index) Error!void {
                 const extra = self.air.extraData(Air.Bin, ty_pl.payload).data;
                 try self.verifyInstOperands(inst, .{ extra.lhs, extra.rhs, .none });
             },
-            .shuffle => {
-                const ty_pl = data[@intFromEnum(inst)].ty_pl;
-                const extra = self.air.extraData(Air.Shuffle, ty_pl.payload).data;
-                try self.verifyInstOperands(inst, .{ extra.a, extra.b, .none });
+            .shuffle_one => {
+                const unwrapped = self.air.unwrapShuffleOne(self.zcu, inst);
+                try self.verifyInstOperands(inst, .{ unwrapped.operand, .none, .none });
+            },
+            .shuffle_two => {
+                const unwrapped = self.air.unwrapShuffleTwo(self.zcu, inst);
+                try self.verifyInstOperands(inst, .{ unwrapped.operand_a, unwrapped.operand_b, .none });
             },
             .cmp_vector,
             .cmp_vector_optimized,
@@ -639,4 +643,5 @@ const log = std.log.scoped(.liveness_verify);
 const Air = @import("../../Air.zig");
 const Liveness = @import("../Liveness.zig");
 const InternPool = @import("../../InternPool.zig");
+const Zcu = @import("../../Zcu.zig");
 const Verify = @This();
src/Air/Legalize.zig
@@ -521,7 +521,8 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void {
                 }
             },
             .splat,
-            .shuffle,
+            .shuffle_one,
+            .shuffle_two,
             => {},
             .select,
             => if (l.features.contains(.scalarize_select)) continue :inst try l.scalarize(inst, .select_pl_op_bin),
src/Air/Liveness.zig
@@ -15,6 +15,7 @@ const Liveness = @This();
 const trace = @import("../tracy.zig").trace;
 const Air = @import("../Air.zig");
 const InternPool = @import("../InternPool.zig");
+const Zcu = @import("../Zcu.zig");
 
 pub const Verify = @import("Liveness/Verify.zig");
 
@@ -136,12 +137,15 @@ fn LivenessPassData(comptime pass: LivenessPass) type {
     };
 }
 
-pub fn analyze(gpa: Allocator, air: Air, intern_pool: *InternPool) Allocator.Error!Liveness {
+pub fn analyze(zcu: *Zcu, air: Air, intern_pool: *InternPool) Allocator.Error!Liveness {
     const tracy = trace(@src());
     defer tracy.end();
 
+    const gpa = zcu.gpa;
+
     var a: Analysis = .{
         .gpa = gpa,
+        .zcu = zcu,
         .air = air,
         .tomb_bits = try gpa.alloc(
             usize,
@@ -220,6 +224,7 @@ const OperandCategory = enum {
 pub fn categorizeOperand(
     l: Liveness,
     air: Air,
+    zcu: *Zcu,
     inst: Air.Inst.Index,
     operand: Air.Inst.Index,
     ip: *const InternPool,
@@ -511,10 +516,15 @@ pub fn categorizeOperand(
             if (extra.rhs == operand_ref) return matchOperandSmallIndex(l, inst, 2, .none);
             return .none;
         },
-        .shuffle => {
-            const extra = air.extraData(Air.Shuffle, air_datas[@intFromEnum(inst)].ty_pl.payload).data;
-            if (extra.a == operand_ref) return matchOperandSmallIndex(l, inst, 0, .none);
-            if (extra.b == operand_ref) return matchOperandSmallIndex(l, inst, 1, .none);
+        .shuffle_one => {
+            const unwrapped = air.unwrapShuffleOne(zcu, inst);
+            if (unwrapped.operand == operand_ref) return matchOperandSmallIndex(l, inst, 0, .none);
+            return .none;
+        },
+        .shuffle_two => {
+            const unwrapped = air.unwrapShuffleTwo(zcu, inst);
+            if (unwrapped.operand_a == operand_ref) return matchOperandSmallIndex(l, inst, 0, .none);
+            if (unwrapped.operand_b == operand_ref) return matchOperandSmallIndex(l, inst, 1, .none);
             return .none;
         },
         .reduce, .reduce_optimized => {
@@ -639,7 +649,7 @@ pub fn categorizeOperand(
 
                 var operand_live: bool = true;
                 for (&[_]Air.Inst.Index{ then_body[0], else_body[0] }) |cond_inst| {
-                    if (l.categorizeOperand(air, cond_inst, operand, ip) == .tomb)
+                    if (l.categorizeOperand(air, zcu, cond_inst, operand, ip) == .tomb)
                         operand_live = false;
 
                     switch (air_tags[@intFromEnum(cond_inst)]) {
@@ -824,6 +834,7 @@ pub const BigTomb = struct {
 /// In-progress data; on successful analysis converted into `Liveness`.
 const Analysis = struct {
     gpa: Allocator,
+    zcu: *Zcu,
     air: Air,
     intern_pool: *InternPool,
     tomb_bits: []usize,
@@ -1119,9 +1130,13 @@ fn analyzeInst(
             const extra = a.air.extraData(Air.Bin, pl_op.payload).data;
             return analyzeOperands(a, pass, data, inst, .{ pl_op.operand, extra.lhs, extra.rhs });
         },
-        .shuffle => {
-            const extra = a.air.extraData(Air.Shuffle, inst_datas[@intFromEnum(inst)].ty_pl.payload).data;
-            return analyzeOperands(a, pass, data, inst, .{ extra.a, extra.b, .none });
+        .shuffle_one => {
+            const unwrapped = a.air.unwrapShuffleOne(a.zcu, inst);
+            return analyzeOperands(a, pass, data, inst, .{ unwrapped.operand, .none, .none });
+        },
+        .shuffle_two => {
+            const unwrapped = a.air.unwrapShuffleTwo(a.zcu, inst);
+            return analyzeOperands(a, pass, data, inst, .{ unwrapped.operand_a, unwrapped.operand_b, .none });
         },
         .reduce, .reduce_optimized => {
             const reduce = inst_datas[@intFromEnum(inst)].reduce;
src/Air/types_resolved.zig
@@ -249,12 +249,22 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool {
                 if (!checkRef(extra.struct_operand, zcu)) return false;
             },
 
-            .shuffle => {
-                const extra = air.extraData(Air.Shuffle, data.ty_pl.payload).data;
-                if (!checkType(data.ty_pl.ty.toType(), zcu)) return false;
-                if (!checkRef(extra.a, zcu)) return false;
-                if (!checkRef(extra.b, zcu)) return false;
-                if (!checkVal(Value.fromInterned(extra.mask), zcu)) return false;
+            .shuffle_one => {
+                const unwrapped = air.unwrapShuffleOne(zcu, inst);
+                if (!checkType(unwrapped.result_ty, zcu)) return false;
+                if (!checkRef(unwrapped.operand, zcu)) return false;
+                for (unwrapped.mask) |m| switch (m.unwrap()) {
+                    .elem => {},
+                    .value => |val| if (!checkVal(.fromInterned(val), zcu)) return false,
+                };
+            },
+
+            .shuffle_two => {
+                const unwrapped = air.unwrapShuffleTwo(zcu, inst);
+                if (!checkType(unwrapped.result_ty, zcu)) return false;
+                if (!checkRef(unwrapped.operand_a, zcu)) return false;
+                if (!checkRef(unwrapped.operand_b, zcu)) return false;
+                // No values to check because there are no comptime-known values other than undef
             },
 
             .cmpxchg_weak,
src/arch/aarch64/CodeGen.zig
@@ -778,7 +778,8 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
             .error_name      => try self.airErrorName(inst),
             .splat           => try self.airSplat(inst),
             .select          => try self.airSelect(inst),
-            .shuffle         => try self.airShuffle(inst),
+            .shuffle_one     => try self.airShuffleOne(inst),
+            .shuffle_two     => try self.airShuffleTwo(inst),
             .reduce          => try self.airReduce(inst),
             .aggregate_init  => try self.airAggregateInit(inst),
             .union_init      => try self.airUnionInit(inst),
@@ -6049,11 +6050,14 @@ fn airSelect(self: *Self, inst: Air.Inst.Index) InnerError!void {
     return self.finishAir(inst, result, .{ pl_op.operand, extra.lhs, extra.rhs });
 }
 
-fn airShuffle(self: *Self, inst: Air.Inst.Index) InnerError!void {
-    const ty_pl = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl;
-    const extra = self.air.extraData(Air.Shuffle, ty_pl.payload).data;
-    const result: MCValue = if (self.liveness.isUnused(inst)) .dead else return self.fail("TODO implement airShuffle for {}", .{self.target.cpu.arch});
-    return self.finishAir(inst, result, .{ extra.a, extra.b, .none });
+fn airShuffleOne(self: *Self, inst: Air.Inst.Index) InnerError!void {
+    _ = inst;
+    return self.fail("TODO implement airShuffleOne for {}", .{self.target.cpu.arch});
+}
+
+fn airShuffleTwo(self: *Self, inst: Air.Inst.Index) InnerError!void {
+    _ = inst;
+    return self.fail("TODO implement airShuffleTwo for {}", .{self.target.cpu.arch});
 }
 
 fn airReduce(self: *Self, inst: Air.Inst.Index) InnerError!void {
src/arch/arm/CodeGen.zig
@@ -767,7 +767,8 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
             .error_name      => try self.airErrorName(inst),
             .splat           => try self.airSplat(inst),
             .select          => try self.airSelect(inst),
-            .shuffle         => try self.airShuffle(inst),
+            .shuffle_one     => try self.airShuffleOne(inst),
+            .shuffle_two     => try self.airShuffleTwo(inst),
             .reduce          => try self.airReduce(inst),
             .aggregate_init  => try self.airAggregateInit(inst),
             .union_init      => try self.airUnionInit(inst),
@@ -6021,10 +6022,14 @@ fn airSelect(self: *Self, inst: Air.Inst.Index) !void {
     return self.finishAir(inst, result, .{ pl_op.operand, extra.lhs, extra.rhs });
 }
 
-fn airShuffle(self: *Self, inst: Air.Inst.Index) !void {
-    const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
-    const result: MCValue = if (self.liveness.isUnused(inst)) .dead else return self.fail("TODO implement airShuffle for arm", .{});
-    return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
+fn airShuffleOne(self: *Self, inst: Air.Inst.Index) !void {
+    _ = inst;
+    return self.fail("TODO implement airShuffleOne for arm", .{});
+}
+
+fn airShuffleTwo(self: *Self, inst: Air.Inst.Index) !void {
+    _ = inst;
+    return self.fail("TODO implement airShuffleTwo for arm", .{});
 }
 
 fn airReduce(self: *Self, inst: Air.Inst.Index) !void {
src/arch/riscv64/CodeGen.zig
@@ -1586,7 +1586,8 @@ fn genBody(func: *Func, body: []const Air.Inst.Index) InnerError!void {
             .error_name      => try func.airErrorName(inst),
             .splat           => try func.airSplat(inst),
             .select          => try func.airSelect(inst),
-            .shuffle         => try func.airShuffle(inst),
+            .shuffle_one     => try func.airShuffleOne(inst),
+            .shuffle_two     => try func.airShuffleTwo(inst),
             .reduce          => try func.airReduce(inst),
             .aggregate_init  => try func.airAggregateInit(inst),
             .union_init      => try func.airUnionInit(inst),
@@ -8030,10 +8031,14 @@ fn airSelect(func: *Func, inst: Air.Inst.Index) !void {
     return func.finishAir(inst, result, .{ pl_op.operand, extra.lhs, extra.rhs });
 }
 
-fn airShuffle(func: *Func, inst: Air.Inst.Index) !void {
-    const ty_op = func.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
-    const result: MCValue = if (func.liveness.isUnused(inst)) .unreach else return func.fail("TODO implement airShuffle for riscv64", .{});
-    return func.finishAir(inst, result, .{ ty_op.operand, .none, .none });
+fn airShuffleOne(func: *Func, inst: Air.Inst.Index) !void {
+    _ = inst;
+    return func.fail("TODO implement airShuffleOne for riscv64", .{});
+}
+
+fn airShuffleTwo(func: *Func, inst: Air.Inst.Index) !void {
+    _ = inst;
+    return func.fail("TODO implement airShuffleTwo for riscv64", .{});
 }
 
 fn airReduce(func: *Func, inst: Air.Inst.Index) !void {
src/arch/sparc64/CodeGen.zig
@@ -621,7 +621,8 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
             .error_name      => try self.airErrorName(inst),
             .splat           => try self.airSplat(inst),
             .select          => @panic("TODO try self.airSelect(inst)"),
-            .shuffle         => @panic("TODO try self.airShuffle(inst)"),
+            .shuffle_one     => @panic("TODO try self.airShuffleOne(inst)"),
+            .shuffle_two     => @panic("TODO try self.airShuffleTwo(inst)"),
             .reduce          => @panic("TODO try self.airReduce(inst)"),
             .aggregate_init  => try self.airAggregateInit(inst),
             .union_init      => try self.airUnionInit(inst),
src/arch/wasm/CodeGen.zig
@@ -2004,7 +2004,8 @@ fn genInst(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
         .ret_load => cg.airRetLoad(inst),
         .splat => cg.airSplat(inst),
         .select => cg.airSelect(inst),
-        .shuffle => cg.airShuffle(inst),
+        .shuffle_one => cg.airShuffleOne(inst),
+        .shuffle_two => cg.airShuffleTwo(inst),
         .reduce => cg.airReduce(inst),
         .aggregate_init => cg.airAggregateInit(inst),
         .union_init => cg.airUnionInit(inst),
@@ -5177,66 +5178,100 @@ fn airSelect(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
     return cg.fail("TODO: Implement wasm airSelect", .{});
 }
 
-fn airShuffle(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
+fn airShuffleOne(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
     const pt = cg.pt;
     const zcu = pt.zcu;
-    const inst_ty = cg.typeOfIndex(inst);
-    const ty_pl = cg.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl;
-    const extra = cg.air.extraData(Air.Shuffle, ty_pl.payload).data;
-
-    const a = try cg.resolveInst(extra.a);
-    const b = try cg.resolveInst(extra.b);
-    const mask = Value.fromInterned(extra.mask);
-    const mask_len = extra.mask_len;
 
-    const child_ty = inst_ty.childType(zcu);
-    const elem_size = child_ty.abiSize(zcu);
+    const unwrapped = cg.air.unwrapShuffleOne(zcu, inst);
+    const result_ty = unwrapped.result_ty;
+    const mask = unwrapped.mask;
+    const operand = try cg.resolveInst(unwrapped.operand);
 
-    // TODO: One of them could be by ref; handle in loop
-    if (isByRef(cg.typeOf(extra.a), zcu, cg.target) or isByRef(inst_ty, zcu, cg.target)) {
-        const result = try cg.allocStack(inst_ty);
+    const elem_ty = result_ty.childType(zcu);
+    const elem_size = elem_ty.abiSize(zcu);
 
-        for (0..mask_len) |index| {
-            const value = (try mask.elemValue(pt, index)).toSignedInt(zcu);
+    // TODO: this function could have an `i8x16_shuffle` fast path like `airShuffleTwo` if we were
+    // to lower the comptime-known operands to a non-by-ref vector value.
 
-            try cg.emitWValue(result);
+    // TODO: this is incorrect if either operand or the result is *not* by-ref, which is possible.
+    // I tried to fix it, but I couldn't make much sense of how this backend handles memory.
 
-            const loaded = if (value >= 0)
-                try cg.load(a, child_ty, @as(u32, @intCast(@as(i64, @intCast(elem_size)) * value)))
-            else
-                try cg.load(b, child_ty, @as(u32, @intCast(@as(i64, @intCast(elem_size)) * ~value)));
+    const dest_alloc = try cg.allocStack(result_ty);
+    for (mask, 0..) |mask_elem, out_idx| {
+        try cg.emitWValue(dest_alloc);
+        const elem_val = switch (mask_elem.unwrap()) {
+            .elem => |idx| try cg.load(operand, elem_ty, @intCast(elem_size * idx)),
+            .value => |val| try cg.lowerConstant(.fromInterned(val), elem_ty),
+        };
+        try cg.store(.stack, elem_val, elem_ty, @intCast(dest_alloc.offset() + elem_size * out_idx));
+    }
+    return cg.finishAir(inst, dest_alloc, &.{unwrapped.operand});
+}
 
-            try cg.store(.stack, loaded, child_ty, result.stack_offset.value + @as(u32, @intCast(elem_size)) * @as(u32, @intCast(index)));
-        }
+fn airShuffleTwo(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
+    const pt = cg.pt;
+    const zcu = pt.zcu;
 
-        return cg.finishAir(inst, result, &.{ extra.a, extra.b });
-    } else {
-        var operands = [_]u32{
-            @intFromEnum(std.wasm.SimdOpcode.i8x16_shuffle),
-        } ++ [1]u32{undefined} ** 4;
+    const unwrapped = cg.air.unwrapShuffleTwo(zcu, inst);
+    const result_ty = unwrapped.result_ty;
+    const mask = unwrapped.mask;
+    const operand_a = try cg.resolveInst(unwrapped.operand_a);
+    const operand_b = try cg.resolveInst(unwrapped.operand_b);
 
-        var lanes = mem.asBytes(operands[1..]);
-        for (0..@as(usize, @intCast(mask_len))) |index| {
-            const mask_elem = (try mask.elemValue(pt, index)).toSignedInt(zcu);
-            const base_index = if (mask_elem >= 0)
-                @as(u8, @intCast(@as(i64, @intCast(elem_size)) * mask_elem))
-            else
-                16 + @as(u8, @intCast(@as(i64, @intCast(elem_size)) * ~mask_elem));
+    const a_ty = cg.typeOf(unwrapped.operand_a);
+    const b_ty = cg.typeOf(unwrapped.operand_b);
+    const elem_ty = result_ty.childType(zcu);
+    const elem_size = elem_ty.abiSize(zcu);
 
-            for (0..@as(usize, @intCast(elem_size))) |byte_offset| {
-                lanes[index * @as(usize, @intCast(elem_size)) + byte_offset] = base_index + @as(u8, @intCast(byte_offset));
+    // WASM has `i8x16_shuffle`, which we can apply if the element type bit size is a multiple of 8
+    // and the input and output vectors have a bit size of 128 (and are hence not by-ref). Otherwise,
+    // we fall back to a naive loop lowering.
+    if (!isByRef(a_ty, zcu, cg.target) and
+        !isByRef(b_ty, zcu, cg.target) and
+        !isByRef(result_ty, zcu, cg.target) and
+        elem_ty.bitSize(zcu) % 8 == 0)
+    {
+        var lane_map: [16]u8 align(4) = undefined;
+        const lanes_per_elem = elem_ty.bitSize(zcu) / 8;
+        for (mask, 0..) |mask_elem, out_idx| {
+            const out_first_lane = out_idx * lanes_per_elem;
+            const in_first_lane = switch (mask_elem.unwrap()) {
+                .a_elem => |i| i * lanes_per_elem,
+                .b_elem => |i| i * lanes_per_elem + 16,
+                .undef => 0, // doesn't matter
+            };
+            for (lane_map[out_first_lane..][0..lanes_per_elem], in_first_lane..) |*out, in| {
+                out.* = @intCast(in);
             }
         }
-
-        try cg.emitWValue(a);
-        try cg.emitWValue(b);
-
+        try cg.emitWValue(operand_a);
+        try cg.emitWValue(operand_b);
         const extra_index = cg.extraLen();
-        try cg.mir_extra.appendSlice(cg.gpa, &operands);
+        try cg.mir_extra.appendSlice(cg.gpa, &.{
+            @intFromEnum(std.wasm.SimdOpcode.i8x16_shuffle),
+            @bitCast(lane_map[0..4].*),
+            @bitCast(lane_map[4..8].*),
+            @bitCast(lane_map[8..12].*),
+            @bitCast(lane_map[12..].*),
+        });
         try cg.addInst(.{ .tag = .simd_prefix, .data = .{ .payload = extra_index } });
+        return cg.finishAir(inst, .stack, &.{ unwrapped.operand_a, unwrapped.operand_b });
+    }
+
+    // TODO: this is incorrect if either operand or the result is *not* by-ref, which is possible.
+    // I tried to fix it, but I couldn't make much sense of how this backend handles memory.
 
-        return cg.finishAir(inst, .stack, &.{ extra.a, extra.b });
+    const dest_alloc = try cg.allocStack(result_ty);
+    for (mask, 0..) |mask_elem, out_idx| {
+        try cg.emitWValue(dest_alloc);
+        const elem_val = switch (mask_elem.unwrap()) {
+            .a_elem => |idx| try cg.load(operand_a, elem_ty, @intCast(elem_size * idx)),
+            .b_elem => |idx| try cg.load(operand_b, elem_ty, @intCast(elem_size * idx)),
+            .undef => try cg.emitUndefined(elem_ty),
+        };
+        try cg.store(.stack, elem_val, elem_ty, @intCast(dest_alloc.offset() + elem_size * out_idx));
     }
+    return cg.finishAir(inst, dest_alloc, &.{ unwrapped.operand_a, unwrapped.operand_b });
 }
 
 fn airReduce(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
src/arch/x86_64/CodeGen.zig
@@ -2490,7 +2490,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
         switch (air_tags[@intFromEnum(inst)]) {
             // zig fmt: off
             .select           => try cg.airSelect(inst),
-            .shuffle          => try cg.airShuffle(inst),
+            .shuffle_one, .shuffle_two => @panic("x86_64 TODO: shuffle_one/shuffle_two"),
             // zig fmt: on
 
             .arg => if (cg.debug_output != .none) {
src/codegen/c.zig
@@ -3374,7 +3374,8 @@ fn genBodyInner(f: *Function, body: []const Air.Inst.Index) error{ AnalysisFail,
             .error_name       => try airErrorName(f, inst),
             .splat            => try airSplat(f, inst),
             .select           => try airSelect(f, inst),
-            .shuffle          => try airShuffle(f, inst),
+            .shuffle_one      => try airShuffleOne(f, inst),
+            .shuffle_two      => try airShuffleTwo(f, inst),
             .reduce           => try airReduce(f, inst),
             .aggregate_init   => try airAggregateInit(f, inst),
             .union_init       => try airUnionInit(f, inst),
@@ -7163,34 +7164,73 @@ fn airSelect(f: *Function, inst: Air.Inst.Index) !CValue {
     return local;
 }
 
-fn airShuffle(f: *Function, inst: Air.Inst.Index) !CValue {
+fn airShuffleOne(f: *Function, inst: Air.Inst.Index) !CValue {
     const pt = f.object.dg.pt;
     const zcu = pt.zcu;
-    const ty_pl = f.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl;
-    const extra = f.air.extraData(Air.Shuffle, ty_pl.payload).data;
-
-    const mask = Value.fromInterned(extra.mask);
-    const lhs = try f.resolveInst(extra.a);
-    const rhs = try f.resolveInst(extra.b);
 
-    const inst_ty = f.typeOfIndex(inst);
+    const unwrapped = f.air.unwrapShuffleOne(zcu, inst);
+    const mask = unwrapped.mask;
+    const operand = try f.resolveInst(unwrapped.operand);
+    const inst_ty = unwrapped.result_ty;
 
     const writer = f.object.writer();
     const local = try f.allocLocal(inst, inst_ty);
-    try reap(f, inst, &.{ extra.a, extra.b }); // local cannot alias operands
-    for (0..extra.mask_len) |index| {
+    try reap(f, inst, &.{unwrapped.operand}); // local cannot alias operand
+    for (mask, 0..) |mask_elem, out_idx| {
         try f.writeCValue(writer, local, .Other);
         try writer.writeByte('[');
-        try f.object.dg.renderValue(writer, try pt.intValue(.usize, index), .Other);
+        try f.object.dg.renderValue(writer, try pt.intValue(.usize, out_idx), .Other);
         try writer.writeAll("] = ");
+        switch (mask_elem.unwrap()) {
+            .elem => |src_idx| {
+                try f.writeCValue(writer, operand, .Other);
+                try writer.writeByte('[');
+                try f.object.dg.renderValue(writer, try pt.intValue(.usize, src_idx), .Other);
+                try writer.writeByte(']');
+            },
+            .value => |val| try f.object.dg.renderValue(writer, .fromInterned(val), .Other),
+        }
+        try writer.writeAll(";\n");
+    }
 
-        const mask_elem = (try mask.elemValue(pt, index)).toSignedInt(zcu);
-        const src_val = try pt.intValue(.usize, @as(u64, @intCast(mask_elem ^ mask_elem >> 63)));
+    return local;
+}
 
-        try f.writeCValue(writer, if (mask_elem >= 0) lhs else rhs, .Other);
+fn airShuffleTwo(f: *Function, inst: Air.Inst.Index) !CValue {
+    const pt = f.object.dg.pt;
+    const zcu = pt.zcu;
+
+    const unwrapped = f.air.unwrapShuffleTwo(zcu, inst);
+    const mask = unwrapped.mask;
+    const operand_a = try f.resolveInst(unwrapped.operand_a);
+    const operand_b = try f.resolveInst(unwrapped.operand_b);
+    const inst_ty = unwrapped.result_ty;
+    const elem_ty = inst_ty.childType(zcu);
+
+    const writer = f.object.writer();
+    const local = try f.allocLocal(inst, inst_ty);
+    try reap(f, inst, &.{ unwrapped.operand_a, unwrapped.operand_b }); // local cannot alias operands
+    for (mask, 0..) |mask_elem, out_idx| {
+        try f.writeCValue(writer, local, .Other);
         try writer.writeByte('[');
-        try f.object.dg.renderValue(writer, src_val, .Other);
-        try writer.writeAll("];\n");
+        try f.object.dg.renderValue(writer, try pt.intValue(.usize, out_idx), .Other);
+        try writer.writeAll("] = ");
+        switch (mask_elem.unwrap()) {
+            .a_elem => |src_idx| {
+                try f.writeCValue(writer, operand_a, .Other);
+                try writer.writeByte('[');
+                try f.object.dg.renderValue(writer, try pt.intValue(.usize, src_idx), .Other);
+                try writer.writeByte(']');
+            },
+            .b_elem => |src_idx| {
+                try f.writeCValue(writer, operand_b, .Other);
+                try writer.writeByte('[');
+                try f.object.dg.renderValue(writer, try pt.intValue(.usize, src_idx), .Other);
+                try writer.writeByte(']');
+            },
+            .undef => try f.object.dg.renderUndefValue(writer, elem_ty, .Other),
+        }
+        try writer.writeAll(";\n");
     }
 
     return local;
src/codegen/llvm.zig
@@ -4969,7 +4969,8 @@ pub const FuncGen = struct {
                 .error_name     => try self.airErrorName(inst),
                 .splat          => try self.airSplat(inst),
                 .select         => try self.airSelect(inst),
-                .shuffle        => try self.airShuffle(inst),
+                .shuffle_one    => try self.airShuffleOne(inst),
+                .shuffle_two    => try self.airShuffleTwo(inst),
                 .aggregate_init => try self.airAggregateInit(inst),
                 .union_init     => try self.airUnionInit(inst),
                 .prefetch       => try self.airPrefetch(inst),
@@ -9666,7 +9667,7 @@ pub const FuncGen = struct {
         const zcu = o.pt.zcu;
         const ip = &zcu.intern_pool;
         for (body_tail[1..]) |body_inst| {
-            switch (fg.liveness.categorizeOperand(fg.air, body_inst, body_tail[0], ip)) {
+            switch (fg.liveness.categorizeOperand(fg.air, zcu, body_inst, body_tail[0], ip)) {
                 .none => continue,
                 .write, .noret, .complex => return false,
                 .tomb => return true,
@@ -10421,42 +10422,192 @@ pub const FuncGen = struct {
         return self.wip.select(.normal, pred, a, b, "");
     }
 
-    fn airShuffle(self: *FuncGen, inst: Air.Inst.Index) !Builder.Value {
-        const o = self.ng.object;
+    fn airShuffleOne(fg: *FuncGen, inst: Air.Inst.Index) !Builder.Value {
+        const o = fg.ng.object;
         const pt = o.pt;
         const zcu = pt.zcu;
-        const ty_pl = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl;
-        const extra = self.air.extraData(Air.Shuffle, ty_pl.payload).data;
-        const a = try self.resolveInst(extra.a);
-        const b = try self.resolveInst(extra.b);
-        const mask = Value.fromInterned(extra.mask);
-        const mask_len = extra.mask_len;
-        const a_len = self.typeOf(extra.a).vectorLen(zcu);
-
-        // LLVM uses integers larger than the length of the first array to
-        // index into the second array. This was deemed unnecessarily fragile
-        // when changing code, so Zig uses negative numbers to index the
-        // second vector. These start at -1 and go down, and are easiest to use
-        // with the ~ operator. Here we convert between the two formats.
-        const values = try self.gpa.alloc(Builder.Constant, mask_len);
-        defer self.gpa.free(values);
-
-        for (values, 0..) |*val, i| {
-            const elem = try mask.elemValue(pt, i);
-            if (elem.isUndef(zcu)) {
-                val.* = try o.builder.undefConst(.i32);
-            } else {
-                const int = elem.toSignedInt(zcu);
-                const unsigned: u32 = @intCast(if (int >= 0) int else ~int + a_len);
-                val.* = try o.builder.intConst(.i32, unsigned);
+        const gpa = zcu.gpa;
+
+        const unwrapped = fg.air.unwrapShuffleOne(zcu, inst);
+
+        const operand = try fg.resolveInst(unwrapped.operand);
+        const mask = unwrapped.mask;
+        const operand_ty = fg.typeOf(unwrapped.operand);
+        const llvm_operand_ty = try o.lowerType(operand_ty);
+        const llvm_result_ty = try o.lowerType(unwrapped.result_ty);
+        const llvm_elem_ty = try o.lowerType(unwrapped.result_ty.childType(zcu));
+        const llvm_poison_elem = try o.builder.poisonConst(llvm_elem_ty);
+        const llvm_poison_mask_elem = try o.builder.poisonConst(.i32);
+        const llvm_mask_ty = try o.builder.vectorType(.normal, @intCast(mask.len), .i32);
+
+        // LLVM requires that the two input vectors have the same length, so lowering isn't trivial.
+        // And, in the words of jacobly0: "llvm sucks at shuffles so we do have to hold its hand at
+        // least a bit". So, there are two cases here.
+        //
+        // If the operand length equals the mask length, we do just the one `shufflevector`, where
+        // the second operand is a constant vector with comptime-known elements at the right indices
+        // and poison values elsewhere (in the indices which won't be selected).
+        //
+        // Otherwise, we lower to *two* `shufflevector` instructions. The first shuffles the runtime
+        // operand with an all-poison vector to extract and correctly position all of the runtime
+        // elements. We also make a constant vector with all of the comptime elements correctly
+        // positioned. Then, our second instruction selects elements from those "runtime-or-poison"
+        // and "comptime-or-poison" vectors to compute the result.
+
+        // This buffer is used primarily for the mask constants.
+        const llvm_elem_buf = try gpa.alloc(Builder.Constant, mask.len);
+        defer gpa.free(llvm_elem_buf);
+
+        // ...but first, we'll collect all of the comptime-known values.
+        var any_defined_comptime_value = false;
+        for (mask, llvm_elem_buf) |mask_elem, *llvm_elem| {
+            llvm_elem.* = switch (mask_elem.unwrap()) {
+                .elem => llvm_poison_elem,
+                .value => |val| if (!Value.fromInterned(val).isUndef(zcu)) elem: {
+                    any_defined_comptime_value = true;
+                    break :elem try o.lowerValue(val);
+                } else llvm_poison_elem,
+            };
+        }
+        // This vector is like the result, but runtime elements are replaced with poison.
+        const comptime_and_poison: Builder.Value = if (any_defined_comptime_value) vec: {
+            break :vec try o.builder.vectorValue(llvm_result_ty, llvm_elem_buf);
+        } else try o.builder.poisonValue(llvm_result_ty);
+
+        if (operand_ty.vectorLen(zcu) == mask.len) {
+            // input length equals mask/output length, so we lower to one instruction
+            for (mask, llvm_elem_buf, 0..) |mask_elem, *llvm_elem, elem_idx| {
+                llvm_elem.* = switch (mask_elem.unwrap()) {
+                    .elem => |idx| try o.builder.intConst(.i32, idx),
+                    .value => |val| if (!Value.fromInterned(val).isUndef(zcu)) mask_val: {
+                        break :mask_val try o.builder.intConst(.i32, mask.len + elem_idx);
+                    } else llvm_poison_mask_elem,
+                };
             }
+            return fg.wip.shuffleVector(
+                operand,
+                comptime_and_poison,
+                try o.builder.vectorValue(llvm_mask_ty, llvm_elem_buf),
+                "",
+            );
+        }
+
+        for (mask, llvm_elem_buf) |mask_elem, *llvm_elem| {
+            llvm_elem.* = switch (mask_elem.unwrap()) {
+                .elem => |idx| try o.builder.intConst(.i32, idx),
+                .value => llvm_poison_mask_elem,
+            };
+        }
+        // This vector is like our result, but all comptime-known elements are poison.
+        const runtime_and_poison = try fg.wip.shuffleVector(
+            operand,
+            try o.builder.poisonValue(llvm_operand_ty),
+            try o.builder.vectorValue(llvm_mask_ty, llvm_elem_buf),
+            "",
+        );
+
+        if (!any_defined_comptime_value) {
+            // `comptime_and_poison` is just poison; a second shuffle would be a nop.
+            return runtime_and_poison;
+        }
+
+        // In this second shuffle, the inputs, the mask, and the output all have the same length.
+        for (mask, llvm_elem_buf, 0..) |mask_elem, *llvm_elem, elem_idx| {
+            llvm_elem.* = switch (mask_elem.unwrap()) {
+                .elem => try o.builder.intConst(.i32, elem_idx),
+                .value => |val| if (!Value.fromInterned(val).isUndef(zcu)) mask_val: {
+                    break :mask_val try o.builder.intConst(.i32, mask.len + elem_idx);
+                } else llvm_poison_mask_elem,
+            };
         }
+        // Merge the runtime and comptime elements with the mask we just built.
+        return fg.wip.shuffleVector(
+            runtime_and_poison,
+            comptime_and_poison,
+            try o.builder.vectorValue(llvm_mask_ty, llvm_elem_buf),
+            "",
+        );
+    }
+
+    fn airShuffleTwo(fg: *FuncGen, inst: Air.Inst.Index) !Builder.Value {
+        const o = fg.ng.object;
+        const pt = o.pt;
+        const zcu = pt.zcu;
+        const gpa = zcu.gpa;
+
+        const unwrapped = fg.air.unwrapShuffleTwo(zcu, inst);
+
+        const mask = unwrapped.mask;
+        const llvm_elem_ty = try o.lowerType(unwrapped.result_ty.childType(zcu));
+        const llvm_mask_ty = try o.builder.vectorType(.normal, @intCast(mask.len), .i32);
+        const llvm_poison_mask_elem = try o.builder.poisonConst(.i32);
+
+        // This is kind of simpler than in `airShuffleOne`. We extend the shorter vector to the
+        // length of the longer one with an initial `shufflevector` if necessary, and then do the
+        // actual computation with a second `shufflevector`.
+
+        const operand_a_len = fg.typeOf(unwrapped.operand_a).vectorLen(zcu);
+        const operand_b_len = fg.typeOf(unwrapped.operand_b).vectorLen(zcu);
+        const operand_len: u32 = @max(operand_a_len, operand_b_len);
+
+        // If we need to extend an operand, this is the type that mask will have.
+        const llvm_operand_mask_ty = try o.builder.vectorType(.normal, operand_len, .i32);
+
+        const llvm_elem_buf = try gpa.alloc(Builder.Constant, @max(mask.len, operand_len));
+        defer gpa.free(llvm_elem_buf);
 
-        const llvm_mask_value = try o.builder.vectorValue(
-            try o.builder.vectorType(.normal, mask_len, .i32),
-            values,
+        const operand_a: Builder.Value = extend: {
+            const raw = try fg.resolveInst(unwrapped.operand_a);
+            if (operand_a_len == operand_len) break :extend raw;
+            // Extend with a `shufflevector`, with a mask `<0, 1, ..., n, poison, poison, ..., poison>`
+            const mask_elems = llvm_elem_buf[0..operand_len];
+            for (mask_elems[0..operand_a_len], 0..) |*llvm_elem, elem_idx| {
+                llvm_elem.* = try o.builder.intConst(.i32, elem_idx);
+            }
+            @memset(mask_elems[operand_a_len..], llvm_poison_mask_elem);
+            const llvm_this_operand_ty = try o.builder.vectorType(.normal, operand_a_len, llvm_elem_ty);
+            break :extend try fg.wip.shuffleVector(
+                raw,
+                try o.builder.poisonValue(llvm_this_operand_ty),
+                try o.builder.vectorValue(llvm_operand_mask_ty, mask_elems),
+                "",
+            );
+        };
+        const operand_b: Builder.Value = extend: {
+            const raw = try fg.resolveInst(unwrapped.operand_b);
+            if (operand_b_len == operand_len) break :extend raw;
+            // Extend with a `shufflevector`, with a mask `<0, 1, ..., n, poison, poison, ..., poison>`
+            const mask_elems = llvm_elem_buf[0..operand_len];
+            for (mask_elems[0..operand_b_len], 0..) |*llvm_elem, elem_idx| {
+                llvm_elem.* = try o.builder.intConst(.i32, elem_idx);
+            }
+            @memset(mask_elems[operand_b_len..], llvm_poison_mask_elem);
+            const llvm_this_operand_ty = try o.builder.vectorType(.normal, operand_b_len, llvm_elem_ty);
+            break :extend try fg.wip.shuffleVector(
+                raw,
+                try o.builder.poisonValue(llvm_this_operand_ty),
+                try o.builder.vectorValue(llvm_operand_mask_ty, mask_elems),
+                "",
+            );
+        };
+
+        // `operand_a` and `operand_b` now have the same length (we've extended the shorter one with
+        // an initial shuffle if necessary). Now for the easy bit.
+
+        const mask_elems = llvm_elem_buf[0..mask.len];
+        for (mask, mask_elems) |mask_elem, *llvm_mask_elem| {
+            llvm_mask_elem.* = switch (mask_elem.unwrap()) {
+                .a_elem => |idx| try o.builder.intConst(.i32, idx),
+                .b_elem => |idx| try o.builder.intConst(.i32, operand_len + idx),
+                .undef => llvm_poison_mask_elem,
+            };
+        }
+        return fg.wip.shuffleVector(
+            operand_a,
+            operand_b,
+            try o.builder.vectorValue(llvm_mask_ty, mask_elems),
+            "",
         );
-        return self.wip.shuffleVector(a, b, llvm_mask_value, "");
     }
 
     /// Reduce a vector by repeatedly applying `llvm_fn` to produce an accumulated result.
src/codegen/spirv.zig
@@ -3252,7 +3252,8 @@ const NavGen = struct {
 
             .splat => try self.airSplat(inst),
             .reduce, .reduce_optimized => try self.airReduce(inst),
-            .shuffle                   => try self.airShuffle(inst),
+            .shuffle_one               => try self.airShuffleOne(inst),
+            .shuffle_two               => try self.airShuffleTwo(inst),
 
             .ptr_add => try self.airPtrAdd(inst),
             .ptr_sub => try self.airPtrSub(inst),
@@ -4047,40 +4048,57 @@ const NavGen = struct {
         return result_id;
     }
 
-    fn airShuffle(self: *NavGen, inst: Air.Inst.Index) !?IdRef {
-        const pt = self.pt;
+    fn airShuffleOne(ng: *NavGen, inst: Air.Inst.Index) !?IdRef {
+        const pt = ng.pt;
         const zcu = pt.zcu;
-        const ty_pl = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl;
-        const extra = self.air.extraData(Air.Shuffle, ty_pl.payload).data;
-        const a = try self.resolve(extra.a);
-        const b = try self.resolve(extra.b);
-        const mask = Value.fromInterned(extra.mask);
+        const gpa = zcu.gpa;
 
-        // Note: number of components in the result, a, and b may differ.
-        const result_ty = self.typeOfIndex(inst);
-        const scalar_ty = result_ty.scalarType(zcu);
-        const scalar_ty_id = try self.resolveType(scalar_ty, .direct);
+        const unwrapped = ng.air.unwrapShuffleOne(zcu, inst);
+        const mask = unwrapped.mask;
+        const result_ty = unwrapped.result_ty;
+        const elem_ty = result_ty.childType(zcu);
+        const operand = try ng.resolve(unwrapped.operand);
 
-        const constituents = try self.gpa.alloc(IdRef, result_ty.vectorLen(zcu));
-        defer self.gpa.free(constituents);
+        const constituents = try gpa.alloc(IdRef, mask.len);
+        defer gpa.free(constituents);
 
-        for (constituents, 0..) |*id, i| {
-            const elem = try mask.elemValue(pt, i);
-            if (elem.isUndef(zcu)) {
-                id.* = try self.spv.constUndef(scalar_ty_id);
-                continue;
-            }
+        for (constituents, mask) |*id, mask_elem| {
+            id.* = switch (mask_elem.unwrap()) {
+                .elem => |idx| try ng.extractVectorComponent(elem_ty, operand, idx),
+                .value => |val| try ng.constant(elem_ty, .fromInterned(val), .direct),
+            };
+        }
 
-            const index = elem.toSignedInt(zcu);
-            if (index >= 0) {
-                id.* = try self.extractVectorComponent(scalar_ty, a, @intCast(index));
-            } else {
-                id.* = try self.extractVectorComponent(scalar_ty, b, @intCast(~index));
-            }
+        const result_ty_id = try ng.resolveType(result_ty, .direct);
+        return try ng.constructComposite(result_ty_id, constituents);
+    }
+
+    fn airShuffleTwo(ng: *NavGen, inst: Air.Inst.Index) !?IdRef {
+        const pt = ng.pt;
+        const zcu = pt.zcu;
+        const gpa = zcu.gpa;
+
+        const unwrapped = ng.air.unwrapShuffleTwo(zcu, inst);
+        const mask = unwrapped.mask;
+        const result_ty = unwrapped.result_ty;
+        const elem_ty = result_ty.childType(zcu);
+        const elem_ty_id = try ng.resolveType(elem_ty, .direct);
+        const operand_a = try ng.resolve(unwrapped.operand_a);
+        const operand_b = try ng.resolve(unwrapped.operand_b);
+
+        const constituents = try gpa.alloc(IdRef, mask.len);
+        defer gpa.free(constituents);
+
+        for (constituents, mask) |*id, mask_elem| {
+            id.* = switch (mask_elem.unwrap()) {
+                .a_elem => |idx| try ng.extractVectorComponent(elem_ty, operand_a, idx),
+                .b_elem => |idx| try ng.extractVectorComponent(elem_ty, operand_b, idx),
+                .undef => try ng.spv.constUndef(elem_ty_id),
+            };
         }
 
-        const result_ty_id = try self.resolveType(result_ty, .direct);
-        return try self.constructComposite(result_ty_id, constituents);
+        const result_ty_id = try ng.resolveType(result_ty, .direct);
+        return try ng.constructComposite(result_ty_id, constituents);
     }
 
     fn indicesToIds(self: *NavGen, indices: []const u32) ![]IdRef {
src/Zcu/PerThread.zig
@@ -1745,7 +1745,7 @@ pub fn linkerUpdateFunc(pt: Zcu.PerThread, func_index: InternPool.Index, air: *A
         try air.legalize(pt, @import("../codegen.zig").legalizeFeatures(pt, nav_index) orelse break :legalize);
     }
 
-    var liveness = try Air.Liveness.analyze(gpa, air.*, ip);
+    var liveness = try Air.Liveness.analyze(zcu, air.*, ip);
     defer liveness.deinit(gpa);
 
     if (build_options.enable_debug_extensions and comp.verbose_air) {
@@ -1757,6 +1757,7 @@ pub fn linkerUpdateFunc(pt: Zcu.PerThread, func_index: InternPool.Index, air: *A
     if (std.debug.runtime_safety) {
         var verify: Air.Liveness.Verify = .{
             .gpa = gpa,
+            .zcu = zcu,
             .air = air.*,
             .liveness = liveness,
             .intern_pool = ip,
src/Air.zig
@@ -699,9 +699,21 @@ pub const Inst = struct {
         /// equal to the scalar value.
         /// Uses the `ty_op` field.
         splat,
-        /// Constructs a vector by selecting elements from `a` and `b` based on `mask`.
-        /// Uses the `ty_pl` field with payload `Shuffle`.
-        shuffle,
+        /// Constructs a vector by selecting elements from a single vector based on a mask. Each
+        /// mask element is either an index into the vector, or a comptime-known value, or "undef".
+        /// Uses the `ty_pl` field, where the payload index points to:
+        /// 1. mask_elem: ShuffleOneMask  // for each `mask_len`, which comes from `ty_pl.ty`
+        /// 2. operand: Ref               // guaranteed not to be an interned value
+        /// See `unwrapShufleOne`.
+        shuffle_one,
+        /// Constructs a vector by selecting elements from two vectors based on a mask. Each mask
+        /// element is either an index into one of the vectors, or "undef".
+        /// Uses the `ty_pl` field, where the payload index points to:
+        /// 1. mask_elem: ShuffleOneMask  // for each `mask_len`, which comes from `ty_pl.ty`
+        /// 2. operand_a: Ref             // guaranteed not to be an interned value
+        /// 3. operand_b: Ref             // guaranteed not to be an interned value
+        /// See `unwrapShufleTwo`.
+        shuffle_two,
         /// Constructs a vector element-wise from `a` or `b` based on `pred`.
         /// Uses the `pl_op` field with `pred` as operand, and payload `Bin`.
         select,
@@ -1299,13 +1311,6 @@ pub const FieldParentPtr = struct {
     field_index: u32,
 };
 
-pub const Shuffle = struct {
-    a: Inst.Ref,
-    b: Inst.Ref,
-    mask: InternPool.Index,
-    mask_len: u32,
-};
-
 pub const VectorCmp = struct {
     lhs: Inst.Ref,
     rhs: Inst.Ref,
@@ -1320,6 +1325,64 @@ pub const VectorCmp = struct {
     }
 };
 
+/// Used by `Inst.Tag.shuffle_one`. Represents a mask element which either indexes into a
+/// runtime-known vector, or is a comptime-known value.
+pub const ShuffleOneMask = packed struct(u32) {
+    index: u31,
+    kind: enum(u1) { elem, value },
+    pub fn elem(idx: u32) ShuffleOneMask {
+        return .{ .index = @intCast(idx), .kind = .elem };
+    }
+    pub fn value(val: Value) ShuffleOneMask {
+        return .{ .index = @intCast(@intFromEnum(val.toIntern())), .kind = .value };
+    }
+    pub const Unwrapped = union(enum) {
+        /// The resulting element is this index into the runtime vector.
+        elem: u32,
+        /// The resulting element is this comptime-known value.
+        /// It is correctly typed. It might be `undefined`.
+        value: InternPool.Index,
+    };
+    pub fn unwrap(raw: ShuffleOneMask) Unwrapped {
+        return switch (raw.kind) {
+            .elem => .{ .elem = raw.index },
+            .value => .{ .value = @enumFromInt(raw.index) },
+        };
+    }
+};
+
+/// Used by `Inst.Tag.shuffle_two`. Represents a mask element which either indexes into one
+/// of two runtime-known vectors, or is undefined.
+pub const ShuffleTwoMask = enum(u32) {
+    undef = std.math.maxInt(u32),
+    _,
+    pub fn aElem(idx: u32) ShuffleTwoMask {
+        return @enumFromInt(idx << 1);
+    }
+    pub fn bElem(idx: u32) ShuffleTwoMask {
+        return @enumFromInt(idx << 1 | 1);
+    }
+    pub const Unwrapped = union(enum) {
+        /// The resulting element is this index into the first runtime vector.
+        a_elem: u32,
+        /// The resulting element is this index into the second runtime vector.
+        b_elem: u32,
+        /// The resulting element is `undefined`.
+        undef,
+    };
+    pub fn unwrap(raw: ShuffleTwoMask) Unwrapped {
+        switch (raw) {
+            .undef => return .undef,
+            _ => {},
+        }
+        const x = @intFromEnum(raw);
+        return switch (@as(u1, @truncate(x))) {
+            0 => .{ .a_elem = x >> 1 },
+            1 => .{ .b_elem = x >> 1 },
+        };
+    }
+};
+
 /// Trailing:
 /// 0. `Inst.Ref` for every outputs_len
 /// 1. `Inst.Ref` for every inputs_len
@@ -1503,7 +1566,6 @@ pub fn typeOfIndex(air: *const Air, inst: Air.Inst.Index, ip: *const InternPool)
         .cmpxchg_weak,
         .cmpxchg_strong,
         .slice,
-        .shuffle,
         .aggregate_init,
         .union_init,
         .field_parent_ptr,
@@ -1517,6 +1579,8 @@ pub fn typeOfIndex(air: *const Air, inst: Air.Inst.Index, ip: *const InternPool)
         .ptr_sub,
         .try_ptr,
         .try_ptr_cold,
+        .shuffle_one,
+        .shuffle_two,
         => return datas[@intFromEnum(inst)].ty_pl.ty.toType(),
 
         .not,
@@ -1903,7 +1967,8 @@ pub fn mustLower(air: Air, inst: Air.Inst.Index, ip: *const InternPool) bool {
         .reduce,
         .reduce_optimized,
         .splat,
-        .shuffle,
+        .shuffle_one,
+        .shuffle_two,
         .select,
         .is_named_enum_value,
         .tag_name,
@@ -2030,6 +2095,48 @@ pub fn unwrapSwitch(air: *const Air, switch_inst: Inst.Index) UnwrappedSwitch {
     };
 }
 
+pub fn unwrapShuffleOne(air: *const Air, zcu: *const Zcu, inst_index: Inst.Index) struct {
+    result_ty: Type,
+    operand: Inst.Ref,
+    mask: []const ShuffleOneMask,
+} {
+    const inst = air.instructions.get(@intFromEnum(inst_index));
+    switch (inst.tag) {
+        .shuffle_one => {},
+        else => unreachable, // assertion failure
+    }
+    const result_ty: Type = .fromInterned(inst.data.ty_pl.ty.toInterned().?);
+    const mask_len: u32 = result_ty.vectorLen(zcu);
+    const extra_idx = inst.data.ty_pl.payload;
+    return .{
+        .result_ty = result_ty,
+        .operand = @enumFromInt(air.extra.items[extra_idx + mask_len]),
+        .mask = @ptrCast(air.extra.items[extra_idx..][0..mask_len]),
+    };
+}
+
+pub fn unwrapShuffleTwo(air: *const Air, zcu: *const Zcu, inst_index: Inst.Index) struct {
+    result_ty: Type,
+    operand_a: Inst.Ref,
+    operand_b: Inst.Ref,
+    mask: []const ShuffleTwoMask,
+} {
+    const inst = air.instructions.get(@intFromEnum(inst_index));
+    switch (inst.tag) {
+        .shuffle_two => {},
+        else => unreachable, // assertion failure
+    }
+    const result_ty: Type = .fromInterned(inst.data.ty_pl.ty.toInterned().?);
+    const mask_len: u32 = result_ty.vectorLen(zcu);
+    const extra_idx = inst.data.ty_pl.payload;
+    return .{
+        .result_ty = result_ty,
+        .operand_a = @enumFromInt(air.extra.items[extra_idx + mask_len + 0]),
+        .operand_b = @enumFromInt(air.extra.items[extra_idx + mask_len + 1]),
+        .mask = @ptrCast(air.extra.items[extra_idx..][0..mask_len]),
+    };
+}
+
 pub const typesFullyResolved = types_resolved.typesFullyResolved;
 pub const typeFullyResolved = types_resolved.checkType;
 pub const valFullyResolved = types_resolved.checkVal;
src/print_air.zig
@@ -315,7 +315,8 @@ const Writer = struct {
             .wasm_memory_grow => try w.writeWasmMemoryGrow(s, inst),
             .mul_add => try w.writeMulAdd(s, inst),
             .select => try w.writeSelect(s, inst),
-            .shuffle => try w.writeShuffle(s, inst),
+            .shuffle_one => try w.writeShuffleOne(s, inst),
+            .shuffle_two => try w.writeShuffleTwo(s, inst),
             .reduce, .reduce_optimized => try w.writeReduce(s, inst),
             .cmp_vector, .cmp_vector_optimized => try w.writeCmpVector(s, inst),
             .vector_store_elem => try w.writeVectorStoreElem(s, inst),
@@ -499,14 +500,39 @@ const Writer = struct {
         try w.writeOperand(s, inst, 2, pl_op.operand);
     }
 
-    fn writeShuffle(w: *Writer, s: anytype, inst: Air.Inst.Index) @TypeOf(s).Error!void {
-        const ty_pl = w.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl;
-        const extra = w.air.extraData(Air.Shuffle, ty_pl.payload).data;
+    fn writeShuffleOne(w: *Writer, s: anytype, inst: Air.Inst.Index) @TypeOf(s).Error!void {
+        const unwrapped = w.air.unwrapShuffleOne(w.pt.zcu, inst);
+        try w.writeType(s, unwrapped.result_ty);
+        try s.writeAll(", ");
+        try w.writeOperand(s, inst, 0, unwrapped.operand);
+        try s.writeAll(", [");
+        for (unwrapped.mask, 0..) |mask_elem, mask_idx| {
+            if (mask_idx > 0) try s.writeAll(", ");
+            switch (mask_elem.unwrap()) {
+                .elem => |idx| try s.print("elem {d}", .{idx}),
+                .value => |val| try s.print("val {}", .{Value.fromInterned(val).fmtValue(w.pt)}),
+            }
+        }
+        try s.writeByte(']');
+    }
 
-        try w.writeOperand(s, inst, 0, extra.a);
+    fn writeShuffleTwo(w: *Writer, s: anytype, inst: Air.Inst.Index) @TypeOf(s).Error!void {
+        const unwrapped = w.air.unwrapShuffleTwo(w.pt.zcu, inst);
+        try w.writeType(s, unwrapped.result_ty);
+        try s.writeAll(", ");
+        try w.writeOperand(s, inst, 0, unwrapped.operand_a);
         try s.writeAll(", ");
-        try w.writeOperand(s, inst, 1, extra.b);
-        try s.print(", mask {d}, len {d}", .{ extra.mask, extra.mask_len });
+        try w.writeOperand(s, inst, 1, unwrapped.operand_b);
+        try s.writeAll(", [");
+        for (unwrapped.mask, 0..) |mask_elem, mask_idx| {
+            if (mask_idx > 0) try s.writeAll(", ");
+            switch (mask_elem.unwrap()) {
+                .a_elem => |idx| try s.print("a_elem {d}", .{idx}),
+                .b_elem => |idx| try s.print("b_elem {d}", .{idx}),
+                .undef => try s.writeAll("undef"),
+            }
+        }
+        try s.writeByte(']');
     }
 
     fn writeSelect(w: *Writer, s: anytype, inst: Air.Inst.Index) @TypeOf(s).Error!void {
src/Sema.zig
@@ -24256,8 +24256,8 @@ fn analyzeShuffle(
     block: *Block,
     src_node: std.zig.Ast.Node.Offset,
     elem_ty: Type,
-    a_arg: Air.Inst.Ref,
-    b_arg: Air.Inst.Ref,
+    a_uncoerced: Air.Inst.Ref,
+    b_uncoerced: Air.Inst.Ref,
     mask: Value,
     mask_len: u32,
 ) CompileError!Air.Inst.Ref {
@@ -24266,150 +24266,154 @@ fn analyzeShuffle(
     const a_src = block.builtinCallArgSrc(src_node, 1);
     const b_src = block.builtinCallArgSrc(src_node, 2);
     const mask_src = block.builtinCallArgSrc(src_node, 3);
-    var a = a_arg;
-    var b = b_arg;
 
-    const res_ty = try pt.vectorType(.{
-        .len = mask_len,
-        .child = elem_ty.toIntern(),
-    });
-
-    const maybe_a_len = switch (sema.typeOf(a).zigTypeTag(zcu)) {
-        .array, .vector => sema.typeOf(a).arrayLen(zcu),
-        .undefined => null,
-        else => return sema.fail(block, a_src, "expected vector or array with element type '{}', found '{}'", .{
-            elem_ty.fmt(pt),
-            sema.typeOf(a).fmt(pt),
-        }),
-    };
-    const maybe_b_len = switch (sema.typeOf(b).zigTypeTag(zcu)) {
-        .array, .vector => sema.typeOf(b).arrayLen(zcu),
-        .undefined => null,
-        else => return sema.fail(block, b_src, "expected vector or array with element type '{}', found '{}'", .{
-            elem_ty.fmt(pt),
-            sema.typeOf(b).fmt(pt),
-        }),
-    };
-    if (maybe_a_len == null and maybe_b_len == null) {
-        return pt.undefRef(res_ty);
-    }
-    const a_len: u32 = @intCast(maybe_a_len orelse maybe_b_len.?);
-    const b_len: u32 = @intCast(maybe_b_len orelse a_len);
-
-    const a_ty = try pt.vectorType(.{
-        .len = a_len,
-        .child = elem_ty.toIntern(),
-    });
-    const b_ty = try pt.vectorType(.{
-        .len = b_len,
-        .child = elem_ty.toIntern(),
-    });
-
-    if (maybe_a_len == null) a = try pt.undefRef(a_ty) else a = try sema.coerce(block, a_ty, a, a_src);
-    if (maybe_b_len == null) b = try pt.undefRef(b_ty) else b = try sema.coerce(block, b_ty, b, b_src);
-
-    const operand_info = [2]std.meta.Tuple(&.{ u64, LazySrcLoc, Type }){
-        .{ a_len, a_src, a_ty },
-        .{ b_len, b_src, b_ty },
-    };
-
-    for (0..@intCast(mask_len)) |i| {
-        const elem = try mask.elemValue(pt, i);
-        if (elem.isUndef(zcu)) continue;
-        const elem_resolved = try sema.resolveLazyValue(elem);
-        const int = elem_resolved.toSignedInt(zcu);
-        var unsigned: u32 = undefined;
-        var chosen: u32 = undefined;
-        if (int >= 0) {
-            unsigned = @intCast(int);
-            chosen = 0;
-        } else {
-            unsigned = @intCast(~int);
-            chosen = 1;
+    // If the type of `a` is `@Type(.undefined)`, i.e. the argument is untyped, this is 0, because it is an error to index into this vector.
+    const a_len: u32 = switch (sema.typeOf(a_uncoerced).zigTypeTag(zcu)) {
+        .array, .vector => @intCast(sema.typeOf(a_uncoerced).arrayLen(zcu)),
+        .undefined => 0,
+        else => return sema.fail(block, a_src, "expected vector of '{}', found '{}'", .{ elem_ty.fmt(pt), sema.typeOf(a_uncoerced).fmt(pt) }),
+    };
+    const a_ty = try pt.vectorType(.{ .len = a_len, .child = elem_ty.toIntern() });
+    const a_coerced = try sema.coerce(block, a_ty, a_uncoerced, a_src);
+
+    // If the type of `b` is `@Type(.undefined)`, i.e. the argument is untyped, this is 0, because it is an error to index into this vector.
+    const b_len: u32 = switch (sema.typeOf(b_uncoerced).zigTypeTag(zcu)) {
+        .array, .vector => @intCast(sema.typeOf(b_uncoerced).arrayLen(zcu)),
+        .undefined => 0,
+        else => return sema.fail(block, b_src, "expected vector of '{}', found '{}'", .{ elem_ty.fmt(pt), sema.typeOf(b_uncoerced).fmt(pt) }),
+    };
+    const b_ty = try pt.vectorType(.{ .len = b_len, .child = elem_ty.toIntern() });
+    const b_coerced = try sema.coerce(block, b_ty, b_uncoerced, b_src);
+
+    const result_ty = try pt.vectorType(.{ .len = mask_len, .child = elem_ty.toIntern() });
+
+    // We're going to pre-emptively reserve space in `sema.air_extra`. The reason for this is we need
+    // a `u32` buffer of length `mask_len` anyway, and putting it in `sema.air_extra` avoids a copy
+    // in the runtime case. If the result is comptime-known, we'll shrink `air_extra` back.
+    const air_extra_idx: u32 = @intCast(sema.air_extra.items.len);
+    const air_mask_buf = try sema.air_extra.addManyAsSlice(sema.gpa, mask_len);
+
+    // We want to interpret that buffer in `air_extra` in a few ways. Initially, we'll consider its
+    // elements as `Air.Inst.ShuffleTwoMask`, essentially representing the raw mask values; then, we'll
+    // convert it to `InternPool.Index` or `Air.Inst.ShuffleOneMask` if there are comptime-known operands.
+    const mask_ip_index: []InternPool.Index = @ptrCast(air_mask_buf);
+    const mask_shuffle_one: []Air.ShuffleOneMask = @ptrCast(air_mask_buf);
+    const mask_shuffle_two: []Air.ShuffleTwoMask = @ptrCast(air_mask_buf);
+
+    // Initial loop: check mask elements, populate `mask_shuffle_two`.
+    var a_used = false;
+    var b_used = false;
+    for (mask_shuffle_two, 0..mask_len) |*out, mask_idx| {
+        const mask_val = try mask.elemValue(pt, mask_idx);
+        if (mask_val.isUndef(zcu)) {
+            out.* = .undef;
+            continue;
         }
-        if (unsigned >= operand_info[chosen][0]) {
-            const msg = msg: {
-                const msg = try sema.errMsg(mask_src, "mask index '{d}' has out-of-bounds selection", .{i});
+        // Safe because mask elements are `i32` and we already checked for undef:
+        const raw = (try sema.resolveLazyValue(mask_val)).toSignedInt(zcu);
+        if (raw >= 0) {
+            const idx: u32 = @intCast(raw);
+            a_used = true;
+            out.* = .aElem(idx);
+            if (idx >= a_len) return sema.failWithOwnedErrorMsg(block, msg: {
+                const msg = try sema.errMsg(mask_src, "mask element at index '{d}' selects out-of-bounds index", .{mask_idx});
                 errdefer msg.destroy(sema.gpa);
-
-                try sema.errNote(operand_info[chosen][1], msg, "selected index '{d}' out of bounds of '{}'", .{
-                    unsigned,
-                    operand_info[chosen][2].fmt(pt),
-                });
-
-                if (chosen == 0) {
-                    try sema.errNote(b_src, msg, "selections from the second vector are specified with negative numbers", .{});
+                try sema.errNote(a_src, msg, "index '{d}' exceeds bounds of '{}' given here", .{ idx, a_ty.fmt(pt) });
+                if (idx < b_len) {
+                    try sema.errNote(b_src, msg, "use '~@as(u32, {d})' to index into second vector given here", .{idx});
                 }
-
                 break :msg msg;
-            };
-            return sema.failWithOwnedErrorMsg(block, msg);
+            });
+        } else {
+            const idx: u32 = @intCast(~raw);
+            b_used = true;
+            out.* = .bElem(idx);
+            if (idx >= b_len) return sema.failWithOwnedErrorMsg(block, msg: {
+                const msg = try sema.errMsg(mask_src, "mask element at index '{d}' selects out-of-bounds index", .{mask_idx});
+                errdefer msg.destroy(sema.gpa);
+                try sema.errNote(b_src, msg, "index '{d}' exceeds bounds of '{}' given here", .{ idx, b_ty.fmt(pt) });
+                break :msg msg;
+            });
         }
     }
 
-    if (try sema.resolveValue(a)) |a_val| {
-        if (try sema.resolveValue(b)) |b_val| {
-            const values = try sema.arena.alloc(InternPool.Index, mask_len);
-            for (values, 0..) |*value, i| {
-                const mask_elem_val = try mask.elemValue(pt, i);
-                if (mask_elem_val.isUndef(zcu)) {
-                    value.* = try pt.intern(.{ .undef = elem_ty.toIntern() });
-                    continue;
-                }
-                const int = mask_elem_val.toSignedInt(zcu);
-                const unsigned: u32 = @intCast(if (int >= 0) int else ~int);
-                values[i] = (try (if (int >= 0) a_val else b_val).elemValue(pt, unsigned)).toIntern();
-            }
-            return Air.internedToRef((try pt.intern(.{ .aggregate = .{
-                .ty = res_ty.toIntern(),
-                .storage = .{ .elems = values },
-            } })));
-        }
-    }
+    const maybe_a_val = try sema.resolveValue(a_coerced);
+    const maybe_b_val = try sema.resolveValue(b_coerced);
 
-    // All static analysis passed, and not comptime.
-    // For runtime codegen, vectors a and b must be the same length. Here we
-    // recursively @shuffle the smaller vector to append undefined elements
-    // to it up to the length of the longer vector. This recursion terminates
-    // in 1 call because these calls to analyzeShuffle guarantee a_len == b_len.
-    if (a_len != b_len) {
-        const min_len = @min(a_len, b_len);
-        const max_src = if (a_len > b_len) a_src else b_src;
-        const max_len = try sema.usizeCast(block, max_src, @max(a_len, b_len));
+    const a_rt = a_used and maybe_a_val == null;
+    const b_rt = b_used and maybe_b_val == null;
 
-        const expand_mask_values = try sema.arena.alloc(InternPool.Index, max_len);
-        for (@intCast(0)..@intCast(min_len)) |i| {
-            expand_mask_values[i] = (try pt.intValue(.comptime_int, i)).toIntern();
+    if (a_rt and b_rt) {
+        // Both operands are needed and runtime-known. We need a `[]ShuffleTwomask`... which is
+        // exactly what we already have in `mask_shuffle_two`! So, we're basically done already.
+        // We just need to append the two operands.
+        try sema.air_extra.ensureUnusedCapacity(sema.gpa, 2);
+        sema.appendRefsAssumeCapacity(&.{ a_coerced, b_coerced });
+        return block.addInst(.{
+            .tag = .shuffle_two,
+            .data = .{ .ty_pl = .{
+                .ty = Air.internedToRef(result_ty.toIntern()),
+                .payload = air_extra_idx,
+            } },
+        });
+    } else if (a_rt) {
+        // We need to convert the `ShuffleTwoMask` values to `ShuffleOneMask`.
+        for (mask_shuffle_two, mask_shuffle_one) |in, *out| {
+            out.* = switch (in.unwrap()) {
+                .undef => .value(try pt.undefValue(elem_ty)),
+                .a_elem => |idx| .elem(idx),
+                .b_elem => |idx| .value(try maybe_b_val.?.elemValue(pt, idx)),
+            };
         }
-        for (@intCast(min_len)..@intCast(max_len)) |i| {
-            expand_mask_values[i] = .negative_one;
+        // Now just append our single runtime operand, and we're done.
+        try sema.air_extra.ensureUnusedCapacity(sema.gpa, 1);
+        sema.appendRefsAssumeCapacity(&.{a_coerced});
+        return block.addInst(.{
+            .tag = .shuffle_one,
+            .data = .{ .ty_pl = .{
+                .ty = Air.internedToRef(result_ty.toIntern()),
+                .payload = air_extra_idx,
+            } },
+        });
+    } else if (b_rt) {
+        // We need to convert the `ShuffleTwoMask` values to `ShuffleOneMask`.
+        for (mask_shuffle_two, mask_shuffle_one) |in, *out| {
+            out.* = switch (in.unwrap()) {
+                .undef => .value(try pt.undefValue(elem_ty)),
+                .a_elem => |idx| .value(try maybe_a_val.?.elemValue(pt, idx)),
+                .b_elem => |idx| .elem(idx),
+            };
         }
-        const expand_mask = try pt.intern(.{ .aggregate = .{
-            .ty = (try pt.vectorType(.{ .len = @intCast(max_len), .child = .comptime_int_type })).toIntern(),
-            .storage = .{ .elems = expand_mask_values },
-        } });
-
-        if (a_len < b_len) {
-            const undef = try pt.undefRef(a_ty);
-            a = try sema.analyzeShuffle(block, src_node, elem_ty, a, undef, Value.fromInterned(expand_mask), @intCast(max_len));
-        } else {
-            const undef = try pt.undefRef(b_ty);
-            b = try sema.analyzeShuffle(block, src_node, elem_ty, b, undef, Value.fromInterned(expand_mask), @intCast(max_len));
+        // Now just append our single runtime operand, and we're done.
+        try sema.air_extra.ensureUnusedCapacity(sema.gpa, 1);
+        sema.appendRefsAssumeCapacity(&.{b_coerced});
+        return block.addInst(.{
+            .tag = .shuffle_one,
+            .data = .{ .ty_pl = .{
+                .ty = Air.internedToRef(result_ty.toIntern()),
+                .payload = air_extra_idx,
+            } },
+        });
+    } else {
+        // The result will be comptime-known. We must convert the `ShuffleTwoMask` values to
+        // `InternPool.Index` values using the known operands.
+        for (mask_shuffle_two, mask_ip_index) |in, *out| {
+            const val: Value = switch (in.unwrap()) {
+                .undef => try pt.undefValue(elem_ty),
+                .a_elem => |idx| try maybe_a_val.?.elemValue(pt, idx),
+                .b_elem => |idx| try maybe_b_val.?.elemValue(pt, idx),
+            };
+            out.* = val.toIntern();
         }
+        const res = try pt.intern(.{ .aggregate = .{
+            .ty = result_ty.toIntern(),
+            .storage = .{ .elems = mask_ip_index },
+        } });
+        // We have a comptime-known result, so didn't need `air_mask_buf` -- remove it from `sema.air_extra`.
+        assert(sema.air_extra.items.len == air_extra_idx + air_mask_buf.len);
+        sema.air_extra.shrinkRetainingCapacity(air_extra_idx);
+        return Air.internedToRef(res);
     }
-
-    return block.addInst(.{
-        .tag = .shuffle,
-        .data = .{ .ty_pl = .{
-            .ty = Air.internedToRef(res_ty.toIntern()),
-            .payload = try block.sema.addExtra(Air.Shuffle{
-                .a = a,
-                .b = b,
-                .mask = mask.toIntern(),
-                .mask_len = mask_len,
-            }),
-        } },
-    });
 }
 
 fn zirSelect(sema: *Sema, block: *Block, extended: Zir.Inst.Extended.InstData) CompileError!Air.Inst.Ref {
test/cases/compile_errors/shuffle_with_selected_index_past_first_vector_length.zig
@@ -1,14 +1,20 @@
-export fn entry() void {
-    const v: @Vector(4, u32) = [4]u32{ 10, 11, 12, 13 };
-    const x: @Vector(4, u32) = [4]u32{ 14, 15, 16, 17 };
-    const z = @shuffle(u32, v, x, [8]i32{ 0, 1, 2, 3, 7, 6, 5, 4 });
-    _ = z;
+export fn foo() void {
+    // Here, the bad index ('7') is not less than 'b.len', so the error shouldn't have a note suggesting a negative index.
+    const a: @Vector(4, u32) = .{ 10, 11, 12, 13 };
+    const b: @Vector(4, u32) = .{ 14, 15, 16, 17 };
+    _ = @shuffle(u32, a, b, [8]i32{ 0, 1, 2, 3, 7, 6, 5, 4 });
+}
+export fn bar() void {
+    // Here, the bad index ('7') *is* less than 'b.len', so the error *should* have a note suggesting a negative index.
+    const a: @Vector(4, u32) = .{ 10, 11, 12, 13 };
+    const b: @Vector(9, u32) = .{ 14, 15, 16, 17, 18, 19, 20, 21, 22 };
+    _ = @shuffle(u32, a, b, [8]i32{ 0, 1, 2, 3, 7, 6, 5, 4 });
 }
 
 // error
-// backend=stage2
-// target=native
 //
-// :4:41: error: mask index '4' has out-of-bounds selection
-// :4:29: note: selected index '7' out of bounds of '@Vector(4, u32)'
-// :4:32: note: selections from the second vector are specified with negative numbers
+// :5:35: error: mask element at index '4' selects out-of-bounds index
+// :5:23: note: index '7' exceeds bounds of '@Vector(4, u32)' given here
+// :11:35: error: mask element at index '4' selects out-of-bounds index
+// :11:23: note: index '7' exceeds bounds of '@Vector(4, u32)' given here
+// :11:26: note: use '~@as(u32, 7)' to index into second vector given here