Commit 88d0fef92d

Jacob Young <jacobly0@users.noreply.github.com>
2024-02-13 08:12:56
x86_64: implement `@select`
1 parent ab6f9e3
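
For context, `@select` builds a result vector by taking each lane from one of two operands according to a `@Vector(len, bool)` predicate. A minimal usage example of the builtin this commit lowers (standard Zig, not part of the diff):

```zig
const std = @import("std");

test "@select picks lanes by predicate" {
    const pred: @Vector(4, bool) = .{ true, false, true, false };
    const a: @Vector(4, u32) = .{ 1, 2, 3, 4 };
    const b: @Vector(4, u32) = .{ 10, 20, 30, 40 };
    // Lane i of the result is a[i] where pred[i] is true, b[i] otherwise.
    const r = @select(u32, pred, a, b);
    try std.testing.expect(@reduce(.And, r == @Vector(4, u32){ 1, 20, 3, 40 }));
}
```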
Changed files (6)
src/arch/x86_64/CodeGen.zig
@@ -1547,6 +1547,27 @@ fn asmRegisterRegisterMemory(
     });
 }
 
+fn asmRegisterRegisterMemoryRegister(
+    self: *Self,
+    tag: Mir.Inst.FixedTag,
+    reg1: Register,
+    reg2: Register,
+    m: Memory,
+    reg3: Register,
+) !void {
+    _ = try self.addInst(.{
+        .tag = tag[1],
+        .ops = .rrmr,
+        .data = .{ .rrrx = .{
+            .fixes = tag[0],
+            .r1 = reg1,
+            .r2 = reg2,
+            .r3 = reg3,
+            .payload = try self.addExtra(Mir.Memory.encode(m)),
+        } },
+    });
+}
+
 fn asmMemory(self: *Self, tag: Mir.Inst.FixedTag, m: Memory) !void {
     _ = try self.addInst(.{
         .tag = tag[1],
@@ -1570,6 +1591,25 @@ fn asmRegisterMemory(self: *Self, tag: Mir.Inst.FixedTag, reg: Register, m: Memo
     });
 }
 
+fn asmRegisterMemoryRegister(
+    self: *Self,
+    tag: Mir.Inst.FixedTag,
+    reg1: Register,
+    m: Memory,
+    reg2: Register,
+) !void {
+    _ = try self.addInst(.{
+        .tag = tag[1],
+        .ops = .rmr,
+        .data = .{ .rrx = .{
+            .fixes = tag[0],
+            .r1 = reg1,
+            .r2 = reg2,
+            .payload = try self.addExtra(Mir.Memory.encode(m)),
+        } },
+    });
+}
+
 fn asmRegisterMemoryImmediate(
     self: *Self,
     tag: Mir.Inst.FixedTag,
@@ -10820,96 +10860,35 @@ fn genBinOp(
                 lhs_copy_reg.?,
                 mask_reg,
             ) else {
-                try self.asmRegisterRegister(
-                    @as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag(mod)) {
-                        .Float => switch (lhs_ty.floatBits(self.target.*)) {
-                            32 => .{ ._ps, .@"and" },
-                            64 => .{ ._pd, .@"and" },
-                            16, 80, 128 => null,
-                            else => unreachable,
-                        },
-                        .Vector => switch (lhs_ty.childType(mod).zigTypeTag(mod)) {
-                            .Float => switch (lhs_ty.childType(mod).floatBits(self.target.*)) {
-                                32 => switch (lhs_ty.vectorLen(mod)) {
-                                    1...4 => .{ ._ps, .@"and" },
-                                    else => null,
-                                },
-                                64 => switch (lhs_ty.vectorLen(mod)) {
-                                    1...2 => .{ ._pd, .@"and" },
-                                    else => null,
-                                },
-                                16, 80, 128 => null,
-                                else => unreachable,
-                            },
-                            else => unreachable,
-                        },
+                const mir_fixes = @as(?Mir.Inst.Fixes, switch (lhs_ty.zigTypeTag(mod)) {
+                    .Float => switch (lhs_ty.floatBits(self.target.*)) {
+                        32 => ._ps,
+                        64 => ._pd,
+                        16, 80, 128 => null,
                         else => unreachable,
-                    }) orelse return self.fail("TODO implement genBinOp for {s} {}", .{
-                        @tagName(air_tag), lhs_ty.fmt(mod),
-                    }),
-                    dst_reg,
-                    mask_reg,
-                );
-                try self.asmRegisterRegister(
-                    @as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag(mod)) {
-                        .Float => switch (lhs_ty.floatBits(self.target.*)) {
-                            32 => .{ ._ps, .andn },
-                            64 => .{ ._pd, .andn },
-                            16, 80, 128 => null,
-                            else => unreachable,
-                        },
-                        .Vector => switch (lhs_ty.childType(mod).zigTypeTag(mod)) {
-                            .Float => switch (lhs_ty.childType(mod).floatBits(self.target.*)) {
-                                32 => switch (lhs_ty.vectorLen(mod)) {
-                                    1...4 => .{ ._ps, .andn },
-                                    else => null,
-                                },
-                                64 => switch (lhs_ty.vectorLen(mod)) {
-                                    1...2 => .{ ._pd, .andn },
-                                    else => null,
-                                },
-                                16, 80, 128 => null,
-                                else => unreachable,
+                    },
+                    .Vector => switch (lhs_ty.childType(mod).zigTypeTag(mod)) {
+                        .Float => switch (lhs_ty.childType(mod).floatBits(self.target.*)) {
+                            32 => switch (lhs_ty.vectorLen(mod)) {
+                                1...4 => ._ps,
+                                else => null,
                             },
-                            else => unreachable,
-                        },
-                        else => unreachable,
-                    }) orelse return self.fail("TODO implement genBinOp for {s} {}", .{
-                        @tagName(air_tag), lhs_ty.fmt(mod),
-                    }),
-                    mask_reg,
-                    lhs_copy_reg.?,
-                );
-                try self.asmRegisterRegister(
-                    @as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag(mod)) {
-                        .Float => switch (lhs_ty.floatBits(self.target.*)) {
-                            32 => .{ ._ps, .@"or" },
-                            64 => .{ ._pd, .@"or" },
-                            16, 80, 128 => null,
-                            else => unreachable,
-                        },
-                        .Vector => switch (lhs_ty.childType(mod).zigTypeTag(mod)) {
-                            .Float => switch (lhs_ty.childType(mod).floatBits(self.target.*)) {
-                                32 => switch (lhs_ty.vectorLen(mod)) {
-                                    1...4 => .{ ._ps, .@"or" },
-                                    else => null,
-                                },
-                                64 => switch (lhs_ty.vectorLen(mod)) {
-                                    1...2 => .{ ._pd, .@"or" },
-                                    else => null,
-                                },
-                                16, 80, 128 => null,
-                                else => unreachable,
+                            64 => switch (lhs_ty.vectorLen(mod)) {
+                                1...2 => ._pd,
+                                else => null,
                             },
+                            16, 80, 128 => null,
                             else => unreachable,
                         },
                         else => unreachable,
-                    }) orelse return self.fail("TODO implement genBinOp for {s} {}", .{
-                        @tagName(air_tag), lhs_ty.fmt(mod),
-                    }),
-                    dst_reg,
-                    mask_reg,
-                );
+                    },
+                    else => unreachable,
+                }) orelse return self.fail("TODO implement genBinOp for {s} {}", .{
+                    @tagName(air_tag), lhs_ty.fmt(mod),
+                });
+                try self.asmRegisterRegister(.{ mir_fixes, .@"and" }, dst_reg, mask_reg);
+                try self.asmRegisterRegister(.{ mir_fixes, .andn }, mask_reg, lhs_copy_reg.?);
+                try self.asmRegisterRegister(.{ mir_fixes, .@"or" }, dst_reg, mask_reg);
             }
         },
         .cmp_lt, .cmp_lte, .cmp_eq, .cmp_gte, .cmp_gt, .cmp_neq => {
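
This hunk only deduplicates the operand dispatch: the three `asmRegisterRegister` calls previously each repeated the full type switch, and now a single `mir_fixes` lookup feeds all three. The emitted `and`/`andn`/`or` sequence is the classic branchless blend for targets without SSE4.1 blend instructions. A scalar model of the sequence (a sketch; the real code operates on whole XMM registers):

```zig
const std = @import("std");

// Models: and dst, mask / andn mask, lhs / or dst, mask.
// PANDN computes ~first & second, so after the second instruction the
// mask register holds the lanes taken from lhs.
fn blendBits(dst: u32, lhs: u32, mask: u32) u32 {
    const from_dst = dst & mask;
    const from_lhs = lhs & ~mask;
    return from_dst | from_lhs;
}

test blendBits {
    try std.testing.expectEqual(@as(u32, 0xAABB3344), blendBits(0xAABBCCDD, 0x11223344, 0xFFFF0000));
}
```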
@@ -16353,7 +16332,7 @@ fn airSplat(self: *Self, inst: Air.Inst.Index) !void {
                     .{ if (self.hasFeature(.avx)) .vp_w else .p_w, .shufl },
                     dst_alias,
                     dst_alias,
-                    Immediate.u(0),
+                    Immediate.u(0b00_00_00_00),
                 );
                 if (switch (scalar_bits) {
                     1...8 => vector_len > 4,
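
The only change here is spelling the `pshuflw` immediate as `0b00_00_00_00` so its four 2-bit source-lane fields are visible; the same style is used by the new `airSelect` code below. A sketch of how such an immediate is composed (hypothetical helper, not in the tree):

```zig
const std = @import("std");

// imm8 bits 1:0 pick the source lane for destination lane 0, bits 3:2
// for lane 1, and so on; an all-zero immediate therefore broadcasts lane 0.
fn shufImm(l3: u2, l2: u2, l1: u2, l0: u2) u8 {
    return @as(u8, l3) << 6 | @as(u8, l2) << 4 | @as(u8, l1) << 2 | l0;
}

comptime {
    std.debug.assert(shufImm(0, 0, 0, 0) == 0b00_00_00_00); // broadcast lane 0
    std.debug.assert(shufImm(1, 0, 1, 0) == 0b01_00_01_00); // repeat lanes {0,1}
}
```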
@@ -16564,11 +16543,341 @@ fn airSplat(self: *Self, inst: Air.Inst.Index) !void {
 }
 
 fn airSelect(self: *Self, inst: Air.Inst.Index) !void {
+    const mod = self.bin_file.comp.module.?;
     const pl_op = self.air.instructions.items(.data)[@intFromEnum(inst)].pl_op;
     const extra = self.air.extraData(Air.Bin, pl_op.payload).data;
-    _ = extra;
-    return self.fail("TODO implement airSelect for x86_64", .{});
-    //return self.finishAir(inst, result, .{ pl_op.operand, extra.lhs, extra.rhs });
+    const ty = self.typeOfIndex(inst);
+    const vec_len = ty.vectorLen(mod);
+    const elem_ty = ty.childType(mod);
+    const elem_abi_size: u32 = @intCast(elem_ty.abiSize(mod));
+    const abi_size = elem_abi_size * vec_len;
+    const pred_ty = self.typeOf(pl_op.operand);
+
+    const result = result: {
+        const has_blend = self.hasFeature(.sse4_1);
+        const has_avx = self.hasFeature(.avx);
+        const need_xmm0 = has_blend and !has_avx;
+        const pred_mcv = try self.resolveInst(pl_op.operand);
+        const mask_reg = mask: {
+            switch (pred_mcv) {
+                .register => |pred_reg| switch (pred_reg.class()) {
+                    .general_purpose => {},
+                    .sse => if (need_xmm0 and pred_reg.id() != comptime Register.xmm0.id()) {
+                        try self.register_manager.getReg(.xmm0, null);
+                        try self.genSetReg(.xmm0, pred_ty, pred_mcv, .{});
+                        break :mask .xmm0;
+                    } else break :mask if (has_blend)
+                        pred_reg
+                    else
+                        try self.copyToTmpRegister(pred_ty, pred_mcv),
+                    else => unreachable,
+                },
+                else => {},
+            }
+            const mask_reg: Register = if (need_xmm0) mask_reg: {
+                try self.register_manager.getReg(.xmm0, null);
+                break :mask_reg .xmm0;
+            } else try self.register_manager.allocReg(null, abi.RegisterClass.sse);
+            const mask_alias = registerAlias(mask_reg, abi_size);
+            const mask_lock = self.register_manager.lockRegAssumeUnused(mask_reg);
+            defer self.register_manager.unlockReg(mask_lock);
+
+            const pred_fits_in_elem = vec_len <= elem_abi_size;
+            if (self.hasFeature(.avx2) and abi_size <= 32) {
+                if (pred_mcv.isRegister()) broadcast: {
+                    try self.asmRegisterRegister(
+                        .{ .v_d, .mov },
+                        mask_reg.to128(),
+                        pred_mcv.getReg().?.to32(),
+                    );
+                    if (pred_fits_in_elem and vec_len > 1) try self.asmRegisterRegister(
+                        .{ switch (elem_abi_size) {
+                            1 => .vp_b,
+                            2 => .vp_w,
+                            3...4 => .vp_d,
+                            5...8 => .vp_q,
+                            9...16 => {
+                                try self.asmRegisterRegisterRegisterImmediate(
+                                    .{ .v_f128, .insert },
+                                    mask_alias,
+                                    mask_alias,
+                                    mask_reg.to128(),
+                                    Immediate.u(1),
+                                );
+                                break :broadcast;
+                            },
+                            17...32 => break :broadcast,
+                            else => unreachable,
+                        }, .broadcast },
+                        mask_alias,
+                        mask_reg.to128(),
+                    );
+                } else try self.asmRegisterMemory(
+                    .{ switch (vec_len) {
+                        1...8 => .vp_b,
+                        9...16 => .vp_w,
+                        17...32 => .vp_d,
+                        else => unreachable,
+                    }, .broadcast },
+                    mask_alias,
+                    if (pred_mcv.isMemory()) try pred_mcv.mem(self, .byte) else .{
+                        .base = .{ .reg = (try self.copyToTmpRegister(
+                            Type.usize,
+                            pred_mcv.address(),
+                        )).to64() },
+                        .mod = .{ .rm = .{ .size = .byte } },
+                    },
+                );
+            } else if (abi_size <= 16) broadcast: {
+                try self.asmRegisterRegister(
+                    .{ if (has_avx) .v_d else ._d, .mov },
+                    mask_alias,
+                    (if (pred_mcv.isRegister())
+                        pred_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(pred_ty, pred_mcv.address())).to32(),
+                );
+                if (!pred_fits_in_elem or vec_len == 1) break :broadcast;
+                if (elem_abi_size <= 1) {
+                    if (has_avx) try self.asmRegisterRegisterRegister(
+                        .{ .vp_, .unpcklbw },
+                        mask_alias,
+                        mask_alias,
+                        mask_alias,
+                    ) else try self.asmRegisterRegister(
+                        .{ .p_, .unpcklbw },
+                        mask_alias,
+                        mask_alias,
+                    );
+                    if (abi_size <= 2) break :broadcast;
+                }
+                if (elem_abi_size <= 2) {
+                    try self.asmRegisterRegisterImmediate(
+                        .{ if (has_avx) .vp_w else .p_w, .shufl },
+                        mask_alias,
+                        mask_alias,
+                        Immediate.u(0b00_00_00_00),
+                    );
+                    if (abi_size <= 8) break :broadcast;
+                }
+                try self.asmRegisterRegisterImmediate(
+                    .{ if (has_avx) .vp_d else .p_d, .shuf },
+                    mask_alias,
+                    mask_alias,
+                    Immediate.u(switch (elem_abi_size) {
+                        1...2, 5...8 => 0b01_00_01_00,
+                        3...4 => 0b00_00_00_00,
+                        else => unreachable,
+                    }),
+                );
+            } else return self.fail("TODO implement airSelect for {}", .{ty.fmt(mod)});
+            const elem_bits: u16 = @intCast(elem_abi_size * 8);
+            const mask_elem_ty = try mod.intType(.unsigned, elem_bits);
+            const mask_ty = try mod.vectorType(.{ .len = vec_len, .child = mask_elem_ty.toIntern() });
+            if (!pred_fits_in_elem) if (self.hasFeature(.ssse3)) {
+                var mask_elems: [32]InternPool.Index = undefined;
+                for (mask_elems[0..vec_len], 0..) |*elem, bit| elem.* = try mod.intern(.{ .int = .{
+                    .ty = mask_elem_ty.toIntern(),
+                    .storage = .{ .u64 = bit / elem_bits },
+                } });
+                const mask_mcv = try self.genTypedValue(.{
+                    .ty = mask_ty,
+                    .val = Value.fromInterned(try mod.intern(.{ .aggregate = .{
+                        .ty = mask_ty.toIntern(),
+                        .storage = .{ .elems = mask_elems[0..vec_len] },
+                    } })),
+                });
+                const mask_mem: Memory = .{
+                    .base = .{ .reg = try self.copyToTmpRegister(Type.usize, mask_mcv.address()) },
+                    .mod = .{ .rm = .{ .size = self.memSize(ty) } },
+                };
+                if (has_avx) try self.asmRegisterRegisterMemory(
+                    .{ .vp_b, .shuf },
+                    mask_alias,
+                    mask_alias,
+                    mask_mem,
+                ) else try self.asmRegisterMemory(
+                    .{ .p_b, .shuf },
+                    mask_alias,
+                    mask_mem,
+                );
+            } else return self.fail("TODO implement airSelect for {}", .{ty.fmt(mod)});
+            {
+                var mask_elems: [32]InternPool.Index = undefined;
+                for (mask_elems[0..vec_len], 0..) |*elem, bit| elem.* = try mod.intern(.{ .int = .{
+                    .ty = mask_elem_ty.toIntern(),
+                    .storage = .{ .u64 = @as(u32, 1) << @intCast(bit & (elem_bits - 1)) },
+                } });
+                const mask_mcv = try self.genTypedValue(.{
+                    .ty = mask_ty,
+                    .val = Value.fromInterned(try mod.intern(.{ .aggregate = .{
+                        .ty = mask_ty.toIntern(),
+                        .storage = .{ .elems = mask_elems[0..vec_len] },
+                    } })),
+                });
+                const mask_mem: Memory = .{
+                    .base = .{ .reg = try self.copyToTmpRegister(Type.usize, mask_mcv.address()) },
+                    .mod = .{ .rm = .{ .size = self.memSize(ty) } },
+                };
+                if (has_avx) {
+                    try self.asmRegisterRegisterMemory(
+                        .{ .vp_, .@"and" },
+                        mask_alias,
+                        mask_alias,
+                        mask_mem,
+                    );
+                    try self.asmRegisterRegisterMemory(
+                        .{ .vp_d, .cmpeq },
+                        mask_alias,
+                        mask_alias,
+                        mask_mem,
+                    );
+                } else {
+                    try self.asmRegisterMemory(
+                        .{ .p_, .@"and" },
+                        mask_alias,
+                        mask_mem,
+                    );
+                    try self.asmRegisterMemory(
+                        .{ .p_d, .cmpeq },
+                        mask_alias,
+                        mask_mem,
+                    );
+                }
+            }
+            break :mask mask_reg;
+        };
+        const mask_alias = registerAlias(mask_reg, abi_size);
+        const mask_lock = self.register_manager.lockRegAssumeUnused(mask_reg);
+        defer self.register_manager.unlockReg(mask_lock);
+
+        const lhs_mcv = try self.resolveInst(extra.lhs);
+        const lhs_lock = switch (lhs_mcv) {
+            .register => |lhs_reg| self.register_manager.lockRegAssumeUnused(lhs_reg),
+            else => null,
+        };
+        defer if (lhs_lock) |lock| self.register_manager.unlockReg(lock);
+
+        const rhs_mcv = try self.resolveInst(extra.rhs);
+        const rhs_lock = switch (rhs_mcv) {
+            .register => |rhs_reg| self.register_manager.lockReg(rhs_reg),
+            else => null,
+        };
+        defer if (rhs_lock) |lock| self.register_manager.unlockReg(lock);
+
+        const reuse_mcv = if (has_blend) rhs_mcv else lhs_mcv;
+        const dst_mcv: MCValue = if (reuse_mcv.isRegister() and self.reuseOperand(
+            inst,
+            if (has_blend) extra.rhs else extra.lhs,
+            @intFromBool(has_blend),
+            reuse_mcv,
+        )) reuse_mcv else if (has_avx)
+            .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse) }
+        else
+            try self.copyToRegisterWithInstTracking(inst, ty, reuse_mcv);
+        const dst_reg = dst_mcv.getReg().?;
+        const dst_alias = registerAlias(dst_reg, abi_size);
+        const dst_lock = self.register_manager.lockReg(dst_reg);
+        defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
+
+        const mir_tag = @as(?Mir.Inst.FixedTag, switch (ty.childType(mod).zigTypeTag(mod)) {
+            else => null,
+            .Int => switch (abi_size) {
+                0 => unreachable,
+                1...16 => if (has_avx)
+                    .{ .vp_b, .blendv }
+                else if (has_blend)
+                    .{ .p_b, .blendv }
+                else
+                    .{ .p_, undefined },
+                17...32 => if (self.hasFeature(.avx2))
+                    .{ .vp_b, .blendv }
+                else
+                    null,
+                else => null,
+            },
+            .Float => switch (ty.childType(mod).floatBits(self.target.*)) {
+                else => unreachable,
+                16, 80, 128 => null,
+                32 => switch (vec_len) {
+                    0 => unreachable,
+                    1...4 => if (has_avx) .{ .v_ps, .blendv } else .{ ._ps, .blendv },
+                    5...8 => if (has_avx) .{ .v_ps, .blendv } else null,
+                    else => null,
+                },
+                64 => switch (vec_len) {
+                    0 => unreachable,
+                    1...2 => if (has_avx) .{ .v_pd, .blendv } else .{ ._pd, .blendv },
+                    3...4 => if (has_avx) .{ .v_pd, .blendv } else null,
+                    else => null,
+                },
+            },
+        }) orelse return self.fail("TODO implement airSelect for {}", .{ty.fmt(mod)});
+        if (has_avx) {
+            const rhs_alias = if (rhs_mcv.isRegister())
+                registerAlias(rhs_mcv.getReg().?, abi_size)
+            else rhs: {
+                try self.genSetReg(dst_reg, ty, rhs_mcv, .{});
+                break :rhs dst_alias;
+            };
+            if (lhs_mcv.isMemory()) try self.asmRegisterRegisterMemoryRegister(
+                mir_tag,
+                dst_alias,
+                rhs_alias,
+                try lhs_mcv.mem(self, self.memSize(ty)),
+                mask_alias,
+            ) else try self.asmRegisterRegisterRegisterRegister(
+                mir_tag,
+                dst_alias,
+                rhs_alias,
+                registerAlias(if (lhs_mcv.isRegister())
+                    lhs_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(ty, lhs_mcv), abi_size),
+                mask_alias,
+            );
+        } else if (has_blend) if (lhs_mcv.isMemory()) try self.asmRegisterMemoryRegister(
+            mir_tag,
+            dst_alias,
+            try lhs_mcv.mem(self, self.memSize(ty)),
+            mask_alias,
+        ) else try self.asmRegisterRegisterRegister(
+            mir_tag,
+            dst_alias,
+            registerAlias(if (lhs_mcv.isRegister())
+                lhs_mcv.getReg().?
+            else
+                try self.copyToTmpRegister(ty, lhs_mcv), abi_size),
+            mask_alias,
+        ) else {
+            const mir_fixes = @as(?Mir.Inst.Fixes, switch (elem_ty.zigTypeTag(mod)) {
+                else => null,
+                .Int => .p_,
+                .Float => switch (elem_ty.floatBits(self.target.*)) {
+                    32 => ._ps,
+                    64 => ._pd,
+                    16, 80, 128 => null,
+                    else => unreachable,
+                },
+            }) orelse return self.fail("TODO implement airSelect for {}", .{ty.fmt(mod)});
+            try self.asmRegisterRegister(.{ mir_fixes, .@"and" }, dst_alias, mask_alias);
+            if (rhs_mcv.isMemory()) try self.asmRegisterMemory(
+                .{ mir_fixes, .andn },
+                mask_alias,
+                try rhs_mcv.mem(self, Memory.Size.fromSize(abi_size)),
+            ) else try self.asmRegisterRegister(
+                .{ mir_fixes, .andn },
+                mask_alias,
+                if (rhs_mcv.isRegister())
+                    rhs_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(ty, rhs_mcv),
+            );
+            try self.asmRegisterRegister(.{ mir_fixes, .@"or" }, dst_alias, mask_alias);
+        }
+        break :result dst_mcv;
+    };
+    return self.finishAir(inst, result, .{ pl_op.operand, extra.lhs, extra.rhs });
 }
 
 fn airShuffle(self: *Self, inst: Air.Inst.Index) !void {
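
The lowering strategy in `airSelect` above: move the packed predicate into an SSE register, broadcast it so every lane sees its own predicate bit (`vpbroadcast`, `punpcklbw`/`pshuflw`/`pshufd`, or `pshufb` when the predicate spans multiple elements), `and` with a per-lane bit constant, `pcmpeq` against the same constant to widen each bit into an all-ones/all-zeros lane, then blend via `(v)pblendvb`/`blendvps`/`blendvpd`, falling back to `and`/`andn`/`or` without SSE4.1. A scalar reference model of the lane semantics (a sketch, assuming unsigned integer lanes):

```zig
const std = @import("std");

fn selectRef(comptime T: type, comptime n: usize, pred: [n]bool, a: [n]T, b: [n]T) [n]T {
    var r: [n]T = undefined;
    for (&r, pred, a, b) |*lane, p, x, y| {
        // After the and + pcmpeq steps each lane's mask is all-ones or
        // all-zeros, which both the blend instructions and the
        // and/andn/or fallback consume.
        const mask: T = if (p) ~@as(T, 0) else 0;
        lane.* = (x & mask) | (y & ~mask);
    }
    return r;
}

test selectRef {
    try std.testing.expectEqual(
        [4]u8{ 1, 20, 3, 40 },
        selectRef(u8, 4, .{ true, false, true, false }, .{ 1, 2, 3, 4 }, .{ 10, 20, 30, 40 }),
    );
}
```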
src/arch/x86_64/Encoding.zig
@@ -330,6 +330,7 @@ pub const Mnemonic = enum {
     extractps,
     insertps,
     packusdw,
+    pblendvb, pblendw,
     pcmpeqq,
     pextrb, pextrd, pextrq,
     pinsrb, pinsrd, pinsrq,
@@ -377,7 +378,8 @@ pub const Mnemonic = enum {
     vpabsb, vpabsd, vpabsw,
     vpackssdw, vpacksswb, vpackusdw, vpackuswb,
     vpaddb, vpaddd, vpaddq, vpaddsb, vpaddsw, vpaddusb, vpaddusw, vpaddw,
-    vpalignr, vpand, vpandn, vpclmulqdq,
+    vpalignr, vpand, vpandn,
+    vpblendvb, vpblendw, vpclmulqdq,
     vpcmpeqb, vpcmpeqd, vpcmpeqq, vpcmpeqw,
     vpcmpgtb, vpcmpgtd, vpcmpgtq, vpcmpgtw,
     vpextrb, vpextrd, vpextrq, vpextrw,
src/arch/x86_64/encodings.zig
@@ -1202,6 +1202,11 @@ pub const table = [_]Entry{
 
     .{ .packusdw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x2b }, 0, .none, .sse4_1 },
 
+    .{ .pblendvb, .rm, &.{ .xmm, .xmm_m128        }, &.{ 0x66, 0x0f, 0x38, 0x10 }, 0, .none, .sse4_1 },
+    .{ .pblendvb, .rm, &.{ .xmm, .xmm_m128, .xmm0 }, &.{ 0x66, 0x0f, 0x38, 0x10 }, 0, .none, .sse4_1 },
+
+    .{ .pblendw, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0e }, 0, .none, .sse4_1 },
+
     .{ .pcmpeqq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x29 }, 0, .none, .sse4_1 },
 
     .{ .pextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .none, .sse4_1 },
@@ -1528,6 +1533,10 @@ pub const table = [_]Entry{
 
     .{ .vpandn, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_128_wig, .avx },
 
+    .{ .vpblendvb, .rvmr, &.{ .xmm, .xmm, .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x3a, 0x4c }, 0, .vex_128_w0, .avx },
+
+    .{ .vpblendw, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0e }, 0, .vex_128_wig, .avx },
+
     .{ .vpclmulqdq, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x44 }, 0, .vex_128_wig, .@"pclmul avx" },
 
     .{ .vpcmpeqb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x74 }, 0, .vex_128_wig, .avx },
@@ -1756,6 +1765,10 @@ pub const table = [_]Entry{
 
     .{ .vpandn, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_256_wig, .avx2 },
 
+    .{ .vpblendvb, .rvmr, &.{ .ymm, .ymm, .ymm_m256, .ymm }, &.{ 0x66, 0x0f, 0x3a, 0x4c }, 0, .vex_256_w0, .avx2 },
+
+    .{ .vpblendw, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0e }, 0, .vex_256_wig, .avx2 },
+
     .{ .vpbroadcastb,    .rm, &.{ .xmm, .xmm_m8  }, &.{ 0x66, 0x0f, 0x38, 0x78 }, 0, .vex_128_w0, .avx2 },
     .{ .vpbroadcastb,    .rm, &.{ .ymm, .xmm_m8  }, &.{ 0x66, 0x0f, 0x38, 0x78 }, 0, .vex_256_w0, .avx2 },
     .{ .vpbroadcastw,    .rm, &.{ .xmm, .xmm_m16 }, &.{ 0x66, 0x0f, 0x38, 0x79 }, 0, .vex_128_w0, .avx2 },
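
These table entries capture the ISA quirk that drives the register handling in `airSelect`: legacy SSE4.1 `pblendvb` reads its mask implicitly from `xmm0` (hence `need_xmm0` in CodeGen.zig), while the VEX-encoded `vpblendvb` names the mask register explicitly in an immediate byte, the /is4 form behind the new `.rvmr` operand kind. A sketch of that immediate (hypothetical helper, not in the tree):

```zig
const std = @import("std");

// In the /is4 encoding, imm8[7:4] holds the id of the fourth (mask)
// register operand; the low bits are zero.
fn is4(mask_reg_id: u4) u8 {
    return @as(u8, mask_reg_id) << 4;
}

comptime {
    std.debug.assert(is4(3) == 0x30); // xmm3 as the blend mask
}
```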
src/arch/x86_64/Lower.zig
@@ -477,8 +477,9 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
         .rri_s, .rri_u => inst.data.rri.fixes,
         .ri_s, .ri_u => inst.data.ri.fixes,
         .ri64, .rm, .rmi_s, .mr => inst.data.rx.fixes,
-        .mrr, .rrm => inst.data.rrx.fixes,
+        .mrr, .rrm, .rmr => inst.data.rrx.fixes,
         .rmi, .mri => inst.data.rix.fixes,
+        .rrmr => inst.data.rrrx.fixes,
         .rrmi => inst.data.rrix.fixes,
         .mi_u, .mi_s => inst.data.x.fixes,
         .m => inst.data.x.fixes,
@@ -565,6 +566,11 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
             .{ .reg = inst.data.rx.r1 },
             .{ .mem = lower.mem(inst.data.rx.payload) },
         },
+        .rmr => &.{
+            .{ .reg = inst.data.rrx.r1 },
+            .{ .mem = lower.mem(inst.data.rrx.payload) },
+            .{ .reg = inst.data.rrx.r2 },
+        },
         .rmi => &.{
             .{ .reg = inst.data.rix.r1 },
             .{ .mem = lower.mem(inst.data.rix.payload) },
@@ -597,6 +603,12 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
             .{ .reg = inst.data.rrx.r2 },
             .{ .mem = lower.mem(inst.data.rrx.payload) },
         },
+        .rrmr => &.{
+            .{ .reg = inst.data.rrrx.r1 },
+            .{ .reg = inst.data.rrrx.r2 },
+            .{ .mem = lower.mem(inst.data.rrrx.payload) },
+            .{ .reg = inst.data.rrrx.r3 },
+        },
         .rrmi => &.{
             .{ .reg = inst.data.rrix.r1 },
             .{ .reg = inst.data.rrix.r2 },
src/arch/x86_64/Mir.zig
@@ -762,8 +762,11 @@ pub const Inst = struct {
         /// Uses `imm` payload.
         rel,
         /// Register, memory operands.
-        /// Uses `rx` payload.
+        /// Uses `rx` payload with extra data of type `Memory`.
         rm,
+        /// Register, memory, register operands.
+        /// Uses `rrx` payload with extra data of type `Memory`.
+        rmr,
         /// Register, memory, immediate (word) operands.
         /// Uses `rix` payload with extra data of type `Memory`.
         rmi,
@@ -776,6 +779,9 @@ pub const Inst = struct {
         /// Register, register, memory.
        /// Uses `rrx` payload with extra data of type `Memory`.
         rrm,
+        /// Register, register, memory, register.
+        /// Uses `rrrx` payload with extra data of type `Memory`.
+        rrmr,
         /// Register, register, memory, immediate (byte) operands.
         /// Uses `rrix` payload with extra data of type `Memory`.
         rrmi,
@@ -953,6 +959,14 @@ pub const Inst = struct {
             r2: Register,
             payload: u32,
         },
+        /// Register, register, register, followed by Custom payload found in extra.
+        rrrx: struct {
+            fixes: Fixes = ._,
+            r1: Register,
+            r2: Register,
+            r3: Register,
+            payload: u32,
+        },
         /// Register, byte immediate, followed by Custom payload found in extra.
         rix: struct {
             fixes: Fixes = ._,
test/behavior/select.zig
@@ -5,7 +5,6 @@ const expect = std.testing.expect;
 
 test "@select vectors" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -36,11 +35,12 @@ fn selectVectors() !void {
 
 test "@select arrays" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64 and
+        !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) return error.SkipZigTest;
 
     try comptime selectArrays();
     try selectArrays();
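
Note the arrays test is re-gated rather than unconditionally enabled: on the self-hosted x86_64 backend it still skips unless AVX2 is present, matching the AVX2-only 32-byte paths added above. The same `std.Target` probe works standalone (a sketch; only meaningful when compiling for an x86 target):

```zig
const std = @import("std");
const builtin = @import("builtin");

test "probe avx2" {
    const has_avx2 = comptime std.Target.x86.featureSetHas(builtin.cpu.features, .avx2);
    // Skip rather than fail on machines without AVX2.
    if (!has_avx2) return error.SkipZigTest;
}
```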