Commit 4fcc750ba5

Jacob Young <jacobly0@users.noreply.github.com>
2024-02-17 09:24:08
x86_64: implement more shuffles
1 parent e5c439a
Changed files (5)
lib/std/crypto/blake3.zig
@@ -200,8 +200,7 @@ const CompressGeneric = struct {
     }
 };
 
-const compress = if (builtin.cpu.arch == .x86_64 and
-    (builtin.zig_backend != .stage2_x86_64 or std.Target.x86.featureSetHas(builtin.cpu.features, .ssse3)))
+const compress = if (builtin.cpu.arch == .x86_64)
     CompressVectorized.compress
 else
     CompressGeneric.compress;
lib/std/crypto/salsa20.zig
@@ -302,8 +302,7 @@ fn SalsaNonVecImpl(comptime rounds: comptime_int) type {
     };
 }
 
-const SalsaImpl = if (builtin.cpu.arch == .x86_64 and
-    (builtin.zig_backend != .stage2_x86_64 or std.Target.x86.featureSetHas(builtin.cpu.features, .ssse3)))
+const SalsaImpl = if (builtin.cpu.arch == .x86_64)
     SalsaVecImpl
 else
     SalsaNonVecImpl;
src/arch/x86_64/CodeGen.zig
@@ -4799,7 +4799,7 @@ fn airShlShrBinOp(self: *Self, inst: Air.Inst.Index) !void {
         switch (lhs_ty.zigTypeTag(mod)) {
             .Int => {
                 try self.spillRegisters(&.{.rcx});
-                try self.register_manager.getReg(.rcx, null);
+                try self.register_manager.getKnownReg(.rcx, null);
                 const lhs_mcv = try self.resolveInst(bin_op.lhs);
                 const rhs_mcv = try self.resolveInst(bin_op.rhs);
 
@@ -6825,7 +6825,7 @@ fn floatSign(self: *Self, inst: Air.Inst.Index, operand: Air.Inst.Ref, ty: Type)
 
             const dst_mcv: MCValue = .{ .register = .st0 };
             if (!std.meta.eql(src_mcv, dst_mcv) or !self.reuseOperand(inst, operand, 0, src_mcv))
-                try self.register_manager.getReg(.st0, inst);
+                try self.register_manager.getKnownReg(.st0, inst);
 
             try self.genCopy(ty, dst_mcv, src_mcv, .{});
             switch (tag) {
@@ -9059,7 +9059,7 @@ fn genShiftBinOp(
         lhs_ty.fmt(mod),
     });
 
-    try self.register_manager.getReg(.rcx, null);
+    try self.register_manager.getKnownReg(.rcx, null);
     const rcx_lock = self.register_manager.lockReg(.rcx);
     defer if (rcx_lock) |lock| self.register_manager.unlockReg(lock);
 
@@ -9382,8 +9382,8 @@ fn genMulDivBinOp(
                 .rem => maybe_inst,
                 else => null,
             };
-            try self.register_manager.getReg(.rax, track_inst_rax);
-            try self.register_manager.getReg(.rdx, track_inst_rdx);
+            try self.register_manager.getKnownReg(.rax, track_inst_rax);
+            try self.register_manager.getKnownReg(.rdx, track_inst_rdx);
 
             try self.genIntMulDivOpMir(switch (signedness) {
                 .signed => switch (tag) {
@@ -9423,8 +9423,11 @@ fn genMulDivBinOp(
         },
 
         .mod => {
-            try self.register_manager.getReg(.rax, null);
-            try self.register_manager.getReg(.rdx, if (signedness == .unsigned) maybe_inst else null);
+            try self.register_manager.getKnownReg(.rax, null);
+            try self.register_manager.getKnownReg(
+                .rdx,
+                if (signedness == .unsigned) maybe_inst else null,
+            );
 
             switch (signedness) {
                 .signed => {
@@ -9465,8 +9468,11 @@ fn genMulDivBinOp(
         },
 
         .div_floor => {
-            try self.register_manager.getReg(.rax, if (signedness == .unsigned) maybe_inst else null);
-            try self.register_manager.getReg(.rdx, null);
+            try self.register_manager.getKnownReg(
+                .rax,
+                if (signedness == .unsigned) maybe_inst else null,
+            );
+            try self.register_manager.getKnownReg(.rdx, null);
 
             const lhs_lock: ?RegisterLock = switch (lhs_mcv) {
                 .register => |reg| self.register_manager.lockRegAssumeUnused(reg),
@@ -9710,7 +9716,7 @@ fn genBinOp(
         .rem, .mod => unreachable,
         .max, .min => if (lhs_ty.scalarType(mod).isRuntimeFloat()) registerAlias(
             if (!self.hasFeature(.avx) and self.hasFeature(.sse4_1)) mask: {
-                try self.register_manager.getReg(.xmm0, null);
+                try self.register_manager.getKnownReg(.xmm0, null);
                 break :mask .xmm0;
             } else try self.register_manager.allocReg(null, abi.RegisterClass.sse),
             abi_size,
@@ -16769,7 +16775,7 @@ fn airSelect(self: *Self, inst: Air.Inst.Index) !void {
                 .register => |pred_reg| switch (pred_reg.class()) {
                     .general_purpose => {},
                     .sse => if (need_xmm0 and pred_reg.id() != comptime Register.xmm0.id()) {
-                        try self.register_manager.getReg(.xmm0, null);
+                        try self.register_manager.getKnownReg(.xmm0, null);
                         try self.genSetReg(.xmm0, pred_ty, pred_mcv, .{});
                         break :mask .xmm0;
                     } else break :mask if (has_blend)
@@ -16781,7 +16787,7 @@ fn airSelect(self: *Self, inst: Air.Inst.Index) !void {
                 else => {},
             }
             const mask_reg: Register = if (need_xmm0) mask_reg: {
-                try self.register_manager.getReg(.xmm0, null);
+                try self.register_manager.getKnownReg(.xmm0, null);
                 break :mask_reg .xmm0;
             } else try self.register_manager.allocReg(null, abi.RegisterClass.sse);
             const mask_alias = registerAlias(mask_reg, abi_size);
@@ -17093,7 +17099,7 @@ fn airShuffle(self: *Self, inst: Air.Inst.Index) !void {
 
     const dst_ty = self.typeOfIndex(inst);
     const elem_ty = dst_ty.childType(mod);
-    const elem_abi_size: u32 = @intCast(elem_ty.abiSize(mod));
+    const elem_abi_size: u16 = @intCast(elem_ty.abiSize(mod));
     const dst_abi_size: u32 = @intCast(dst_ty.abiSize(mod));
     const lhs_ty = self.typeOf(extra.a);
     const lhs_abi_size: u32 = @intCast(lhs_ty.abiSize(mod));
@@ -17109,8 +17115,8 @@ fn airShuffle(self: *Self, inst: Air.Inst.Index) !void {
     const mask_elems = try allocator.alloc(?i32, extra.mask_len);
     defer allocator.free(mask_elems);
     for (mask_elems, 0..) |*mask_elem, elem_index| {
-        const mask_elem_val = Value
-            .fromInterned(extra.mask).elemValue(mod, elem_index) catch unreachable;
+        const mask_elem_val =
+            Value.fromInterned(extra.mask).elemValue(mod, elem_index) catch unreachable;
         mask_elem.* = if (mask_elem_val.isUndef(mod))
             null
         else
@@ -17124,7 +17130,7 @@ fn airShuffle(self: *Self, inst: Air.Inst.Index) !void {
         } else break :result try self.allocRegOrMem(inst, true);
 
         for (mask_elems, 0..) |mask_elem, elem_index| {
-            if (mask_elem orelse continue != @as(i32, @intCast(elem_index))) break;
+            if (mask_elem orelse continue != elem_index) break;
         } else {
             const lhs_mcv = try self.resolveInst(extra.a);
             if (self.reuseOperand(inst, extra.a, 0, lhs_mcv)) break :result lhs_mcv;
@@ -17134,7 +17140,7 @@ fn airShuffle(self: *Self, inst: Air.Inst.Index) !void {
         }
 
         for (mask_elems, 0..) |mask_elem, elem_index| {
-            if (mask_elem orelse continue != ~@as(i32, @intCast(elem_index))) break;
+            if (~(mask_elem orelse continue) != elem_index) break;
         } else {
             const rhs_mcv = try self.resolveInst(extra.b);
             if (self.reuseOperand(inst, extra.b, 1, rhs_mcv)) break :result rhs_mcv;
@@ -17143,6 +17149,219 @@ fn airShuffle(self: *Self, inst: Air.Inst.Index) !void {
             break :result dst_mcv;
         }
 
+        for ([_]Mir.Inst.Tag{ .unpckl, .unpckh }) |variant| unpck: {
+            if (elem_abi_size > 8) break :unpck;
+            if (dst_abi_size > @as(u32, if (if (elem_abi_size >= 4)
+                has_avx
+            else
+                self.hasFeature(.avx2)) 32 else 16)) break :unpck;
+
+            var sources = [1]?u1{null} ** 2;
+            for (mask_elems, 0..) |maybe_mask_elem, elem_index| {
+                const mask_elem = maybe_mask_elem orelse continue;
+                const mask_elem_index =
+                    math.cast(u5, if (mask_elem < 0) ~mask_elem else mask_elem) orelse break :unpck;
+                const elem_byte = (elem_index >> 1) * elem_abi_size;
+                if (mask_elem_index * elem_abi_size != (elem_byte & 0b0111) | @as(u4, switch (variant) {
+                    .unpckl => 0b0000,
+                    .unpckh => 0b1000,
+                    else => unreachable,
+                }) | (elem_byte << 1 & 0b10000)) break :unpck;
+
+                const source = @intFromBool(mask_elem < 0);
+                if (sources[elem_index & 0b00001]) |prev_source| {
+                    if (source != prev_source) break :unpck;
+                } else sources[elem_index & 0b00001] = source;
+            }
+            if (sources[0] orelse break :unpck == sources[1] orelse break :unpck) break :unpck;
+
+            const operands = [2]Air.Inst.Ref{ extra.a, extra.b };
+            const operand_tys = [2]Type{ lhs_ty, rhs_ty };
+            const lhs_mcv = try self.resolveInst(operands[sources[0].?]);
+            const rhs_mcv = try self.resolveInst(operands[sources[1].?]);
+
+            const dst_mcv: MCValue = if (lhs_mcv.isRegister() and
+                self.reuseOperand(inst, operands[sources[0].?], sources[0].?, lhs_mcv))
+                lhs_mcv
+            else if (has_avx and lhs_mcv.isRegister())
+                .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse) }
+            else
+                try self.copyToRegisterWithInstTracking(inst, operand_tys[sources[0].?], lhs_mcv);
+            const dst_reg = dst_mcv.getReg().?;
+            const dst_alias = registerAlias(dst_reg, max_abi_size);
+
+            const mir_tag: Mir.Inst.FixedTag = if ((elem_abi_size >= 4 and elem_ty.isRuntimeFloat()) or
+                (dst_abi_size > 16 and !self.hasFeature(.avx2))) .{ switch (elem_abi_size) {
+                4 => if (has_avx) .v_ps else ._ps,
+                8 => if (has_avx) .v_pd else ._pd,
+                else => unreachable,
+            }, variant } else .{ if (has_avx) .vp_ else .p_, switch (variant) {
+                .unpckl => switch (elem_abi_size) {
+                    1 => .unpcklbw,
+                    2 => .unpcklwd,
+                    4 => .unpckldq,
+                    8 => .unpcklqdq,
+                    else => unreachable,
+                },
+                .unpckh => switch (elem_abi_size) {
+                    1 => .unpckhbw,
+                    2 => .unpckhwd,
+                    4 => .unpckhdq,
+                    8 => .unpckhqdq,
+                    else => unreachable,
+                },
+                else => unreachable,
+            } };
+            if (has_avx) if (rhs_mcv.isMemory()) try self.asmRegisterRegisterMemory(
+                mir_tag,
+                dst_alias,
+                registerAlias(lhs_mcv.getReg() orelse dst_reg, max_abi_size),
+                try rhs_mcv.mem(self, Memory.Size.fromSize(max_abi_size)),
+            ) else try self.asmRegisterRegisterRegister(
+                mir_tag,
+                dst_alias,
+                registerAlias(lhs_mcv.getReg() orelse dst_reg, max_abi_size),
+                registerAlias(if (rhs_mcv.isRegister())
+                    rhs_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(operand_tys[sources[1].?], rhs_mcv), max_abi_size),
+            ) else if (rhs_mcv.isMemory()) try self.asmRegisterMemory(
+                mir_tag,
+                dst_alias,
+                try rhs_mcv.mem(self, Memory.Size.fromSize(max_abi_size)),
+            ) else try self.asmRegisterRegister(
+                mir_tag,
+                dst_alias,
+                registerAlias(if (rhs_mcv.isRegister())
+                    rhs_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(operand_tys[sources[1].?], rhs_mcv), max_abi_size),
+            );
+            break :result dst_mcv;
+        }
+
+        pshufd: { // Lower a single-source shuffle of 4-byte elements to one (v)pshufd with an imm8 control.
+            if (elem_abi_size != 4) break :pshufd;
+            if (max_abi_size > @as(u32, if (has_avx) 32 else 16)) break :pshufd; // NOTE(review): 256-bit vpshufd is an AVX2 encoding - confirm has_avx is a sufficient gate
+
+            var control: u8 = 0b00_00_00_00; // two selector bits per destination element 0..3
+            var sources = [1]?u1{null} ** 1; // the one operand every defined element must read (0 = a, 1 = b)
+            for (mask_elems, 0..) |maybe_mask_elem, elem_index| {
+                const mask_elem = maybe_mask_elem orelse continue; // undef elements place no constraint
+                const mask_elem_index: u3 = @intCast(if (mask_elem < 0) ~mask_elem else mask_elem); // negative mask values encode ~index into b
+                if (mask_elem_index & 0b100 != elem_index & 0b100) break :pshufd; // source and destination must share bit 2 of their element index
+
+                const source = @intFromBool(mask_elem < 0);
+                if (sources[0]) |prev_source| {
+                    if (source != prev_source) break :pshufd; // mixes both operands: not a single-source shuffle
+                } else sources[0] = source; // only one slot exists; indexing by elem_index (as shufps does) could write past the end
+
+                const select_bit: u3 = @intCast((elem_index & 0b011) << 1);
+                const select = @as(u8, @intCast(mask_elem_index & 0b011)) << select_bit;
+                if (elem_index & 0b100 == 0)
+                    control |= select
+                else if (control & @as(u8, 0b11) << select_bit != select) break :pshufd; // elements 4..7 must agree with the selectors recorded for 0..3
+            }
+
+            const operands = [2]Air.Inst.Ref{ extra.a, extra.b };
+            const operand_tys = [2]Type{ lhs_ty, rhs_ty };
+            const src_mcv = try self.resolveInst(operands[sources[0] orelse break :pshufd]); // every element undef: nothing to pick a source from
+
+            const dst_reg = if (src_mcv.isRegister() and
+                self.reuseOperand(inst, operands[sources[0].?], sources[0].?, src_mcv))
+                src_mcv.getReg().?
+            else
+                try self.register_manager.allocReg(inst, abi.RegisterClass.sse);
+            const dst_alias = registerAlias(dst_reg, max_abi_size);
+
+            if (src_mcv.isMemory()) try self.asmRegisterMemoryImmediate(
+                .{ if (has_avx) .vp_d else .p_d, .shuf },
+                dst_alias,
+                try src_mcv.mem(self, Memory.Size.fromSize(max_abi_size)),
+                Immediate.u(control),
+            ) else try self.asmRegisterRegisterImmediate(
+                .{ if (has_avx) .vp_d else .p_d, .shuf },
+                dst_alias,
+                registerAlias(if (src_mcv.isRegister())
+                    src_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(operand_tys[sources[0].?], src_mcv), max_abi_size), // neither register nor memory: copy into a temporary register first
+                Immediate.u(control),
+            );
+            break :result .{ .register = dst_reg };
+        }
+
+        shufps: {
+            if (elem_abi_size != 4) break :shufps;
+            if (max_abi_size > @as(u32, if (has_avx) 32 else 16)) break :shufps;
+
+            var control: u8 = 0b00_00_00_00;
+            var sources = [1]?u1{null} ** 2;
+            for (mask_elems, 0..) |maybe_mask_elem, elem_index| {
+                const mask_elem = maybe_mask_elem orelse continue;
+                const mask_elem_index: u3 = @intCast(if (mask_elem < 0) ~mask_elem else mask_elem);
+                if (mask_elem_index & 0b100 != elem_index & 0b100) break :shufps;
+
+                const source = @intFromBool(mask_elem < 0);
+                if (sources[(elem_index & 0b010) >> 1]) |prev_source| {
+                    if (source != prev_source) break :shufps;
+                } else sources[(elem_index & 0b010) >> 1] = source;
+
+                const select_bit: u3 = @intCast((elem_index & 0b011) << 1);
+                const select = @as(u8, @intCast(mask_elem_index & 0b011)) << select_bit;
+                if (elem_index & 0b100 == 0)
+                    control |= select
+                else if (control & @as(u8, 0b11) << select_bit != select) break :shufps;
+            }
+            if (sources[0] orelse break :shufps == sources[1] orelse break :shufps) break :shufps;
+
+            const operands = [2]Air.Inst.Ref{ extra.a, extra.b };
+            const operand_tys = [2]Type{ lhs_ty, rhs_ty };
+            const lhs_mcv = try self.resolveInst(operands[sources[0].?]);
+            const rhs_mcv = try self.resolveInst(operands[sources[1].?]);
+
+            const dst_mcv: MCValue = if (lhs_mcv.isRegister() and
+                self.reuseOperand(inst, operands[sources[0].?], sources[0].?, lhs_mcv))
+                lhs_mcv
+            else if (has_avx and lhs_mcv.isRegister())
+                .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse) }
+            else
+                try self.copyToRegisterWithInstTracking(inst, operand_tys[sources[0].?], lhs_mcv);
+            const dst_reg = dst_mcv.getReg().?;
+            const dst_alias = registerAlias(dst_reg, max_abi_size);
+
+            if (has_avx) if (rhs_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
+                .{ .v_ps, .shuf },
+                dst_alias,
+                registerAlias(lhs_mcv.getReg() orelse dst_reg, max_abi_size),
+                try rhs_mcv.mem(self, Memory.Size.fromSize(max_abi_size)),
+                Immediate.u(control),
+            ) else try self.asmRegisterRegisterRegisterImmediate(
+                .{ .v_ps, .shuf },
+                dst_alias,
+                registerAlias(lhs_mcv.getReg() orelse dst_reg, max_abi_size),
+                registerAlias(if (rhs_mcv.isRegister())
+                    rhs_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(operand_tys[sources[1].?], rhs_mcv), max_abi_size),
+                Immediate.u(control),
+            ) else if (rhs_mcv.isMemory()) try self.asmRegisterMemoryImmediate(
+                .{ ._ps, .shuf },
+                dst_alias,
+                try rhs_mcv.mem(self, Memory.Size.fromSize(max_abi_size)),
+                Immediate.u(control),
+            ) else try self.asmRegisterRegisterImmediate(
+                .{ ._ps, .shuf },
+                dst_alias,
+                registerAlias(if (rhs_mcv.isRegister())
+                    rhs_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(operand_tys[sources[1].?], rhs_mcv), max_abi_size),
+                Immediate.u(control),
+            );
+            break :result dst_mcv;
+        }
+
         shufpd: {
             if (elem_abi_size != 8) break :shufpd;
             if (max_abi_size > @as(u32, if (has_avx) 32 else 16)) break :shufpd;
@@ -17163,8 +17382,8 @@ fn airShuffle(self: *Self, inst: Air.Inst.Index) !void {
             }
             if (sources[0] orelse break :shufpd == sources[1] orelse break :shufpd) break :shufpd;
 
-            const operands = [2]Air.Inst.Ref{ extra.a, extra.b };
-            const operand_tys = [2]Type{ lhs_ty, rhs_ty };
+            const operands: [2]Air.Inst.Ref = .{ extra.a, extra.b };
+            const operand_tys: [2]Type = .{ lhs_ty, rhs_ty };
             const lhs_mcv = try self.resolveInst(operands[sources[0].?]);
             const rhs_mcv = try self.resolveInst(operands[sources[1].?]);
 
@@ -17210,6 +17429,355 @@ fn airShuffle(self: *Self, inst: Air.Inst.Index) !void {
             break :result dst_mcv;
         }
 
+        blend: {
+            if (elem_abi_size < 2) break :blend;
+            if (dst_abi_size > @as(u32, if (has_avx) 32 else 16)) break :blend;
+            if (!self.hasFeature(.sse4_1)) break :blend;
+
+            var control: u8 = 0b0_0_0_0_0_0_0_0;
+            for (mask_elems, 0..) |maybe_mask_elem, elem_index| {
+                const mask_elem = maybe_mask_elem orelse continue;
+                const mask_elem_index =
+                    math.cast(u4, if (mask_elem < 0) ~mask_elem else mask_elem) orelse break :blend;
+                if (mask_elem_index != elem_index) break :blend;
+
+                const select = @as(u8, @intFromBool(mask_elem < 0)) << @truncate(elem_index);
+                if (elem_index & 0b1000 == 0)
+                    control |= select
+                else if (control & @as(u8, 0b1) << @truncate(elem_index) != select) break :blend;
+            }
+
+            if (!elem_ty.isRuntimeFloat() and self.hasFeature(.avx2)) vpblendd: {
+                const expanded_control = switch (elem_abi_size) {
+                    4 => control,
+                    8 => @as(u8, if (control & 0b0001 != 0) 0b00_00_00_11 else 0b00_00_00_00) |
+                        @as(u8, if (control & 0b0010 != 0) 0b00_00_11_00 else 0b00_00_00_00) |
+                        @as(u8, if (control & 0b0100 != 0) 0b00_11_00_00 else 0b00_00_00_00) |
+                        @as(u8, if (control & 0b1000 != 0) 0b11_00_00_00 else 0b00_00_00_00),
+                    else => break :vpblendd,
+                };
+
+                const lhs_mcv = try self.resolveInst(extra.a);
+                const lhs_reg = if (lhs_mcv.isRegister())
+                    lhs_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(dst_ty, lhs_mcv);
+                const lhs_lock = self.register_manager.lockReg(lhs_reg);
+                defer if (lhs_lock) |lock| self.register_manager.unlockReg(lock);
+
+                const rhs_mcv = try self.resolveInst(extra.b);
+                const dst_reg = try self.register_manager.allocReg(inst, abi.RegisterClass.sse);
+                if (rhs_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
+                    .{ .vp_d, .blend },
+                    registerAlias(dst_reg, dst_abi_size),
+                    registerAlias(lhs_reg, dst_abi_size),
+                    try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)),
+                    Immediate.u(expanded_control),
+                ) else try self.asmRegisterRegisterRegisterImmediate(
+                    .{ .vp_d, .blend },
+                    registerAlias(dst_reg, dst_abi_size),
+                    registerAlias(lhs_reg, dst_abi_size),
+                    registerAlias(if (rhs_mcv.isRegister())
+                        rhs_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(dst_ty, rhs_mcv), dst_abi_size),
+                    Immediate.u(expanded_control),
+                );
+                break :result .{ .register = dst_reg };
+            }
+
+            if (!elem_ty.isRuntimeFloat() or elem_abi_size == 2) pblendw: {
+                const expanded_control = switch (elem_abi_size) {
+                    2 => control,
+                    4 => if (dst_abi_size <= 16 or
+                        @as(u4, @intCast(control >> 4)) == @as(u4, @truncate(control >> 0)))
+                        @as(u8, if (control & 0b0001 != 0) 0b00_00_00_11 else 0b00_00_00_00) |
+                            @as(u8, if (control & 0b0010 != 0) 0b00_00_11_00 else 0b00_00_00_00) |
+                            @as(u8, if (control & 0b0100 != 0) 0b00_11_00_00 else 0b00_00_00_00) |
+                            @as(u8, if (control & 0b1000 != 0) 0b11_00_00_00 else 0b00_00_00_00)
+                    else
+                        break :pblendw,
+                    8 => if (dst_abi_size <= 16 or
+                        @as(u2, @intCast(control >> 2)) == @as(u2, @truncate(control >> 0)))
+                        @as(u8, if (control & 0b01 != 0) 0b0000_1111 else 0b0000_0000) |
+                            @as(u8, if (control & 0b10 != 0) 0b1111_0000 else 0b0000_0000)
+                    else
+                        break :pblendw,
+                    16 => break :pblendw,
+                    else => unreachable,
+                };
+
+                const lhs_mcv = try self.resolveInst(extra.a);
+                const rhs_mcv = try self.resolveInst(extra.b);
+
+                const dst_mcv: MCValue = if (lhs_mcv.isRegister() and
+                    self.reuseOperand(inst, extra.a, 0, lhs_mcv))
+                    lhs_mcv
+                else if (has_avx and lhs_mcv.isRegister())
+                    .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse) }
+                else
+                    try self.copyToRegisterWithInstTracking(inst, dst_ty, lhs_mcv);
+                const dst_reg = dst_mcv.getReg().?;
+
+                if (has_avx) if (rhs_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
+                    .{ .vp_w, .blend },
+                    registerAlias(dst_reg, dst_abi_size),
+                    registerAlias(if (lhs_mcv.isRegister())
+                        lhs_mcv.getReg().?
+                    else
+                        dst_reg, dst_abi_size),
+                    try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)),
+                    Immediate.u(expanded_control),
+                ) else try self.asmRegisterRegisterRegisterImmediate(
+                    .{ .vp_w, .blend },
+                    registerAlias(dst_reg, dst_abi_size),
+                    registerAlias(if (lhs_mcv.isRegister())
+                        lhs_mcv.getReg().?
+                    else
+                        dst_reg, dst_abi_size),
+                    registerAlias(if (rhs_mcv.isRegister())
+                        rhs_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(dst_ty, rhs_mcv), dst_abi_size),
+                    Immediate.u(expanded_control),
+                ) else if (rhs_mcv.isMemory()) try self.asmRegisterMemoryImmediate(
+                    .{ .p_w, .blend },
+                    registerAlias(dst_reg, dst_abi_size),
+                    try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)),
+                    Immediate.u(expanded_control),
+                ) else try self.asmRegisterRegisterImmediate(
+                    .{ .p_w, .blend },
+                    registerAlias(dst_reg, dst_abi_size),
+                    registerAlias(if (rhs_mcv.isRegister())
+                        rhs_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(dst_ty, rhs_mcv), dst_abi_size),
+                    Immediate.u(expanded_control),
+                );
+                break :result .{ .register = dst_reg };
+            }
+
+            const expanded_control = switch (elem_abi_size) {
+                4, 8 => control,
+                16 => @as(u4, if (control & 0b01 != 0) 0b00_11 else 0b00_00) |
+                    @as(u4, if (control & 0b10 != 0) 0b11_00 else 0b00_00),
+                else => unreachable,
+            };
+
+            const lhs_mcv = try self.resolveInst(extra.a);
+            const rhs_mcv = try self.resolveInst(extra.b);
+
+            const dst_mcv: MCValue = if (lhs_mcv.isRegister() and
+                self.reuseOperand(inst, extra.a, 0, lhs_mcv))
+                lhs_mcv
+            else if (has_avx and lhs_mcv.isRegister())
+                .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse) }
+            else
+                try self.copyToRegisterWithInstTracking(inst, dst_ty, lhs_mcv);
+            const dst_reg = dst_mcv.getReg().?;
+
+            if (has_avx) if (rhs_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
+                switch (elem_abi_size) {
+                    4 => .{ .v_ps, .blend },
+                    8, 16 => .{ .v_pd, .blend },
+                    else => unreachable,
+                },
+                registerAlias(dst_reg, dst_abi_size),
+                registerAlias(if (lhs_mcv.isRegister())
+                    lhs_mcv.getReg().?
+                else
+                    dst_reg, dst_abi_size),
+                try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)),
+                Immediate.u(expanded_control),
+            ) else try self.asmRegisterRegisterRegisterImmediate(
+                switch (elem_abi_size) {
+                    4 => .{ .v_ps, .blend },
+                    8, 16 => .{ .v_pd, .blend },
+                    else => unreachable,
+                },
+                registerAlias(dst_reg, dst_abi_size),
+                registerAlias(if (lhs_mcv.isRegister())
+                    lhs_mcv.getReg().?
+                else
+                    dst_reg, dst_abi_size),
+                registerAlias(if (rhs_mcv.isRegister())
+                    rhs_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(dst_ty, rhs_mcv), dst_abi_size),
+                Immediate.u(expanded_control),
+            ) else if (rhs_mcv.isMemory()) try self.asmRegisterMemoryImmediate(
+                switch (elem_abi_size) {
+                    4 => .{ ._ps, .blend },
+                    8, 16 => .{ ._pd, .blend },
+                    else => unreachable,
+                },
+                registerAlias(dst_reg, dst_abi_size),
+                try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)),
+                Immediate.u(expanded_control),
+            ) else try self.asmRegisterRegisterImmediate(
+                switch (elem_abi_size) {
+                    4 => .{ ._ps, .blend },
+                    8, 16 => .{ ._pd, .blend },
+                    else => unreachable,
+                },
+                registerAlias(dst_reg, dst_abi_size),
+                registerAlias(if (rhs_mcv.isRegister())
+                    rhs_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(dst_ty, rhs_mcv), dst_abi_size),
+                Immediate.u(expanded_control),
+            );
+            break :result .{ .register = dst_reg };
+        }
+
+        blendv: {
+            if (dst_abi_size > @as(u32, if (if (elem_abi_size >= 4)
+                has_avx
+            else
+                self.hasFeature(.avx2)) 32 else 16)) break :blendv;
+
+            const select_mask_elem_ty = try mod.intType(.unsigned, elem_abi_size * 8);
+            const select_mask_ty = try mod.vectorType(.{
+                .len = @intCast(mask_elems.len),
+                .child = select_mask_elem_ty.toIntern(),
+            });
+            var select_mask_elems: [32]InternPool.Index = undefined;
+            for (
+                select_mask_elems[0..mask_elems.len],
+                mask_elems,
+                0..,
+            ) |*select_mask_elem, maybe_mask_elem, elem_index| {
+                const mask_elem = maybe_mask_elem orelse continue;
+                const mask_elem_index =
+                    math.cast(u5, if (mask_elem < 0) ~mask_elem else mask_elem) orelse break :blendv;
+                if (mask_elem_index != elem_index) break :blendv;
+
+                select_mask_elem.* = (if (mask_elem < 0)
+                    try select_mask_elem_ty.maxIntScalar(mod, select_mask_elem_ty)
+                else
+                    try select_mask_elem_ty.minIntScalar(mod, select_mask_elem_ty)).toIntern();
+            }
+            const select_mask_mcv = try self.genTypedValue(.{
+                .ty = select_mask_ty,
+                .val = Value.fromInterned(try mod.intern(.{ .aggregate = .{
+                    .ty = select_mask_ty.toIntern(),
+                    .storage = .{ .elems = select_mask_elems[0..mask_elems.len] },
+                } })),
+            });
+
+            if (self.hasFeature(.sse4_1)) {
+                const mir_tag: Mir.Inst.FixedTag = .{
+                    if ((elem_abi_size >= 4 and elem_ty.isRuntimeFloat()) or
+                        (dst_abi_size > 16 and !self.hasFeature(.avx2))) switch (elem_abi_size) {
+                        4 => if (has_avx) .v_ps else ._ps,
+                        8 => if (has_avx) .v_pd else ._pd,
+                        else => unreachable,
+                    } else if (has_avx) .vp_b else .p_b,
+                    .blendv,
+                };
+
+                const select_mask_reg = if (!has_avx) reg: {
+                    try self.register_manager.getKnownReg(.xmm0, null);
+                    try self.genSetReg(.xmm0, select_mask_elem_ty, select_mask_mcv, .{});
+                    break :reg .xmm0;
+                } else try self.copyToTmpRegister(select_mask_ty, select_mask_mcv);
+                const select_mask_alias = registerAlias(select_mask_reg, dst_abi_size);
+                const select_mask_lock = self.register_manager.lockRegAssumeUnused(select_mask_reg);
+                defer self.register_manager.unlockReg(select_mask_lock);
+
+                const lhs_mcv = try self.resolveInst(extra.a);
+                const rhs_mcv = try self.resolveInst(extra.b);
+
+                const dst_mcv: MCValue = if (lhs_mcv.isRegister() and
+                    self.reuseOperand(inst, extra.a, 0, lhs_mcv))
+                    lhs_mcv
+                else if (has_avx and lhs_mcv.isRegister())
+                    .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse) }
+                else
+                    try self.copyToRegisterWithInstTracking(inst, dst_ty, lhs_mcv);
+                const dst_reg = dst_mcv.getReg().?;
+                const dst_alias = registerAlias(dst_reg, dst_abi_size);
+
+                if (has_avx) if (rhs_mcv.isMemory()) try self.asmRegisterRegisterMemoryRegister(
+                    mir_tag,
+                    dst_alias,
+                    if (lhs_mcv.isRegister())
+                        registerAlias(lhs_mcv.getReg().?, dst_abi_size)
+                    else
+                        dst_alias,
+                    try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)),
+                    select_mask_alias,
+                ) else try self.asmRegisterRegisterRegisterRegister(
+                    mir_tag,
+                    dst_alias,
+                    if (lhs_mcv.isRegister())
+                        registerAlias(lhs_mcv.getReg().?, dst_abi_size)
+                    else
+                        dst_alias,
+                    registerAlias(if (rhs_mcv.isRegister())
+                        rhs_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(dst_ty, rhs_mcv), dst_abi_size),
+                    select_mask_alias,
+                ) else if (rhs_mcv.isMemory()) try self.asmRegisterMemoryRegister(
+                    mir_tag,
+                    dst_alias,
+                    try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)),
+                    select_mask_alias,
+                ) else try self.asmRegisterRegisterRegister(
+                    mir_tag,
+                    dst_alias,
+                    registerAlias(if (rhs_mcv.isRegister())
+                        rhs_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(dst_ty, rhs_mcv), dst_abi_size),
+                    select_mask_alias,
+                );
+                break :result dst_mcv;
+            }
+
+            const lhs_mcv = try self.resolveInst(extra.a);
+            const rhs_mcv = try self.resolveInst(extra.b);
+
+            const dst_mcv: MCValue = if (rhs_mcv.isRegister() and
+                self.reuseOperand(inst, extra.b, 1, rhs_mcv))
+                rhs_mcv
+            else
+                try self.copyToRegisterWithInstTracking(inst, dst_ty, rhs_mcv);
+            const dst_reg = dst_mcv.getReg().?;
+            const dst_alias = registerAlias(dst_reg, dst_abi_size);
+
+            const mask_reg = try self.copyToTmpRegister(select_mask_ty, select_mask_mcv);
+            const mask_alias = registerAlias(mask_reg, dst_abi_size);
+            const mask_lock = self.register_manager.lockRegAssumeUnused(mask_reg);
+            defer self.register_manager.unlockReg(mask_lock);
+
+            const mir_fixes: Mir.Inst.Fixes = if (elem_ty.isRuntimeFloat())
+                switch (elem_ty.floatBits(self.target.*)) {
+                    16, 80, 128 => .p_,
+                    32 => ._ps,
+                    64 => ._pd,
+                    else => unreachable,
+                }
+            else
+                .p_;
+            try self.asmRegisterRegister(.{ mir_fixes, .@"and" }, dst_alias, mask_alias);
+            if (lhs_mcv.isMemory()) try self.asmRegisterMemory(
+                .{ mir_fixes, .andn },
+                mask_alias,
+                try lhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)),
+            ) else try self.asmRegisterRegister(
+                .{ mir_fixes, .andn },
+                mask_alias,
+                if (lhs_mcv.isRegister())
+                    lhs_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(dst_ty, lhs_mcv),
+            );
+            try self.asmRegisterRegister(.{ mir_fixes, .@"or" }, dst_alias, mask_alias);
+            break :result dst_mcv;
+        }
+
         pshufb: {
             if (max_abi_size > 16) break :pshufb;
             if (!self.hasFeature(.ssse3)) break :pshufb;
@@ -17330,38 +17898,13 @@ fn airShuffle(self: *Self, inst: Air.Inst.Index) !void {
             break :result .{ .register = temp_regs[0] };
         }
 
-        if (max_abi_size <= 16) {
-            const lhs_mcv = try self.resolveInst(extra.a);
-            const lhs_reg = if (lhs_mcv.isRegister())
-                lhs_mcv.getReg().?
-            else
-                try self.copyToTmpRegister(lhs_ty, lhs_mcv);
-            const lhs_lock = self.register_manager.lockRegAssumeUnused(lhs_reg);
-            defer self.register_manager.unlockReg(lhs_lock);
-
-            const rhs_mcv = try self.resolveInst(extra.b);
-            const rhs_reg = if (rhs_mcv.isRegister())
-                rhs_mcv.getReg().?
-            else
-                try self.copyToTmpRegister(rhs_ty, rhs_mcv);
-            const rhs_lock = self.register_manager.lockReg(rhs_reg);
-            defer if (rhs_lock) |lock| self.register_manager.unlockReg(lock);
-
-            //const dst_mcv = try self.register_manager.allocReg(inst, abi.RegisterClass.sse);
-            switch (elem_ty.zigTypeTag(mod)) {
-                .Float => switch (elem_ty.floatBits(self.target.*)) {
-                    16, 32 => {},
-                    64 => unreachable, // fully handled by shufpd
-                    80, 128 => unreachable, // all possible masks already handled
-                    else => unreachable,
-                },
-                else => {},
-            }
-        }
-
         break :result null;
-    }) orelse return self.fail("TODO implement airShuffle from {} and {} to {}", .{
+    }) orelse return self.fail("TODO implement airShuffle from {} and {} to {} with {}", .{
         lhs_ty.fmt(mod), rhs_ty.fmt(mod), dst_ty.fmt(mod),
+        Value.fromInterned(extra.mask).fmtValue(
+            Type.fromInterned(mod.intern_pool.typeOf(extra.mask)),
+            mod,
+        ),
     });
     return self.finishAir(inst, result, .{ extra.a, extra.b, .none });
 }
src/arch/x86_64/Encoding.zig
@@ -416,7 +416,7 @@ pub const Mnemonic = enum {
     vfmadd132ss, vfmadd213ss, vfmadd231ss,
     // AVX2
     vbroadcasti128, vpbroadcastb, vpbroadcastd, vpbroadcastq, vpbroadcastw,
-    vextracti128, vinserti128,
+    vextracti128, vinserti128, vpblendd,
     // zig fmt: on
 };
 
src/arch/x86_64/encodings.zig
@@ -1801,6 +1801,9 @@ pub const table = [_]Entry{
 
     .{ .vpandn, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_256_wig, .avx2 },
 
+    .{ .vpblendd, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x02 }, 0, .vex_128_w0, .avx2 },
+    .{ .vpblendd, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x02 }, 0, .vex_256_w0, .avx2 },
+
     .{ .vpblendvb, .rvmr, &.{ .ymm, .ymm, .ymm_m256, .ymm }, &.{ 0x66, 0x0f, 0x3a, 0x4c }, 0, .vex_256_w0, .avx2 },
 
     .{ .vpblendw, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0e }, 0, .vex_256_wig, .avx2 },