Commit 403c2d91be

Jacob Young <jacobly0@users.noreply.github.com>
2023-05-16 02:17:06
x86_64: fix float min/max behavior
1 parent b9d2e0e
src/arch/x86_64/CodeGen.zig
@@ -1271,6 +1271,27 @@ fn asmRegisterRegisterRegister(
     });
 }
 
+fn asmRegisterRegisterRegisterRegister( // Adds one instruction with four register operands via `self.addInst`.
+    self: *Self,
+    tag: Mir.Inst.FixedTag, // indexed below: `tag[0]` becomes `fixes`, `tag[1]` becomes the instruction tag
+    reg1: Register,
+    reg2: Register,
+    reg3: Register,
+    reg4: Register,
+) !void { // propagates any error from `addInst`
+    _ = try self.addInst(.{ // the value returned by `addInst` is discarded
+        .tag = tag[1],
+        .ops = .rrrr, // matches the `rrrr` payload filled in below
+        .data = .{ .rrrr = .{
+            .fixes = tag[0],
+            .r1 = reg1, // registers are stored in parameter order, r1 through r4
+            .r2 = reg2,
+            .r3 = reg3,
+            .r4 = reg4,
+        } },
+    });
+}
+
 fn asmRegisterRegisterRegisterImmediate(
     self: *Self,
     tag: Mir.Inst.FixedTag,
@@ -6224,12 +6245,26 @@ fn genBinOp(
     lhs_air: Air.Inst.Ref,
     rhs_air: Air.Inst.Ref,
 ) !MCValue {
-    const lhs_mcv = try self.resolveInst(lhs_air);
-    const rhs_mcv = try self.resolveInst(rhs_air);
     const lhs_ty = self.air.typeOf(lhs_air);
     const rhs_ty = self.air.typeOf(rhs_air);
     const abi_size = @intCast(u32, lhs_ty.abiSize(self.target.*));
 
+    const maybe_mask_reg = switch (air_tag) {
+        else => null,
+        .max, .min => if (lhs_ty.scalarType().isRuntimeFloat()) registerAlias(
+            if (!self.hasFeature(.avx) and self.hasFeature(.sse4_1)) mask: {
+                try self.register_manager.getReg(.xmm0, null);
+                break :mask .xmm0;
+            } else try self.register_manager.allocReg(null, sse),
+            abi_size,
+        ) else null,
+    };
+    const mask_lock =
+        if (maybe_mask_reg) |mask_reg| self.register_manager.lockRegAssumeUnused(mask_reg) else null;
+    defer if (mask_lock) |lock| self.register_manager.unlockReg(lock);
+
+    const lhs_mcv = try self.resolveInst(lhs_air);
+    const rhs_mcv = try self.resolveInst(rhs_air);
     switch (lhs_mcv) {
         .immediate => |imm| switch (imm) {
             0 => switch (air_tag) {
@@ -6300,7 +6335,16 @@ fn genBinOp(
     };
     defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
 
-    const src_mcv = if (flipped) lhs_mcv else rhs_mcv;
+    const unmat_src_mcv = if (flipped) lhs_mcv else rhs_mcv;
+    const src_mcv: MCValue = if (maybe_mask_reg) |mask_reg|
+        if (self.hasFeature(.avx) and unmat_src_mcv.isRegister() and maybe_inst != null and
+            self.liveness.operandDies(maybe_inst.?, if (flipped) 0 else 1)) unmat_src_mcv else src: {
+            try self.genSetReg(mask_reg, rhs_ty, unmat_src_mcv);
+            break :src .{ .register = mask_reg };
+        }
+    else
+        unmat_src_mcv;
+
     if (!vec_op) {
         switch (air_tag) {
             .add,
@@ -7009,18 +7053,26 @@ fn genBinOp(
     })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{
         @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
     });
+
+    const lhs_copy_reg = if (maybe_mask_reg) |_| registerAlias(
+        if (copied_to_dst) try self.copyToTmpRegister(lhs_ty, dst_mcv) else lhs_mcv.getReg().?,
+        abi_size,
+    ) else null;
+    const lhs_copy_lock = if (lhs_copy_reg) |reg| self.register_manager.lockReg(reg) else null;
+    defer if (lhs_copy_lock) |lock| self.register_manager.unlockReg(lock);
+
     if (self.hasFeature(.avx)) {
-        const src1_alias =
+        const lhs_reg =
             if (copied_to_dst) dst_reg else registerAlias(lhs_mcv.getReg().?, abi_size);
         if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory(
             mir_tag,
             dst_reg,
-            src1_alias,
+            lhs_reg,
             src_mcv.mem(Memory.PtrSize.fromSize(abi_size)),
         ) else try self.asmRegisterRegisterRegister(
             mir_tag,
             dst_reg,
-            src1_alias,
+            lhs_reg,
             registerAlias(if (src_mcv.isRegister())
                 src_mcv.getReg().?
             else
@@ -7041,9 +7093,10 @@ fn genBinOp(
                 try self.copyToTmpRegister(rhs_ty, src_mcv), abi_size),
         );
     }
+
     switch (air_tag) {
         .add, .addwrap, .sub, .subwrap, .mul, .mulwrap, .div_float, .div_exact => {},
-        .div_trunc, .div_floor => try self.genRound(
+        .div_trunc, .div_floor => if (self.hasFeature(.sse4_1)) try self.genRound(
             lhs_ty,
             dst_reg,
             .{ .register = dst_reg },
@@ -7052,11 +7105,240 @@ fn genBinOp(
                 .div_floor => 0b1_0_01,
                 else => unreachable,
             },
-        ),
+        ) else return self.fail("TODO implement genBinOp for {s} {} without sse4_1 feature", .{
+            @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
+        }),
         .bit_and, .bit_or, .xor => {},
-        .max, .min => {}, // TODO: unordered select
+        .max, .min => if (maybe_mask_reg) |mask_reg| if (self.hasFeature(.avx)) {
+            const rhs_copy_reg = registerAlias(src_mcv.getReg().?, abi_size);
+
+            try self.asmRegisterRegisterRegisterImmediate(
+                if (@as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag()) {
+                    .Float => switch (lhs_ty.floatBits(self.target.*)) {
+                        32 => .{ .v_ss, .cmp },
+                        64 => .{ .v_sd, .cmp },
+                        16, 80, 128 => null,
+                        else => unreachable,
+                    },
+                    .Vector => switch (lhs_ty.childType().zigTypeTag()) {
+                        .Float => switch (lhs_ty.childType().floatBits(self.target.*)) {
+                            32 => switch (lhs_ty.vectorLen()) {
+                                1 => .{ .v_ss, .cmp },
+                                2...8 => .{ .v_ps, .cmp },
+                                else => null,
+                            },
+                            64 => switch (lhs_ty.vectorLen()) {
+                                1 => .{ .v_sd, .cmp },
+                                2...4 => .{ .v_pd, .cmp },
+                                else => null,
+                            },
+                            16, 80, 128 => null,
+                            else => unreachable,
+                        },
+                        else => unreachable,
+                    },
+                    else => unreachable,
+                })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{
+                    @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                }),
+                mask_reg,
+                rhs_copy_reg,
+                rhs_copy_reg,
+                Immediate.u(3), // unord
+            );
+            try self.asmRegisterRegisterRegisterRegister(
+                if (@as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag()) {
+                    .Float => switch (lhs_ty.floatBits(self.target.*)) {
+                        32 => .{ .v_ps, .blendv },
+                        64 => .{ .v_pd, .blendv },
+                        16, 80, 128 => null,
+                        else => unreachable,
+                    },
+                    .Vector => switch (lhs_ty.childType().zigTypeTag()) {
+                        .Float => switch (lhs_ty.childType().floatBits(self.target.*)) {
+                            32 => switch (lhs_ty.vectorLen()) {
+                                1...8 => .{ .v_ps, .blendv },
+                                else => null,
+                            },
+                            64 => switch (lhs_ty.vectorLen()) {
+                                1...4 => .{ .v_pd, .blendv },
+                                else => null,
+                            },
+                            16, 80, 128 => null,
+                            else => unreachable,
+                        },
+                        else => unreachable,
+                    },
+                    else => unreachable,
+                })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{
+                    @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                }),
+                dst_reg,
+                dst_reg,
+                lhs_copy_reg.?,
+                mask_reg,
+            );
+        } else {
+            const has_blend = self.hasFeature(.sse4_1);
+            try self.asmRegisterRegisterImmediate(
+                if (@as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag()) {
+                    .Float => switch (lhs_ty.floatBits(self.target.*)) {
+                        32 => .{ ._ss, .cmp },
+                        64 => .{ ._sd, .cmp },
+                        16, 80, 128 => null,
+                        else => unreachable,
+                    },
+                    .Vector => switch (lhs_ty.childType().zigTypeTag()) {
+                        .Float => switch (lhs_ty.childType().floatBits(self.target.*)) {
+                            32 => switch (lhs_ty.vectorLen()) {
+                                1 => .{ ._ss, .cmp },
+                                2...4 => .{ ._ps, .cmp },
+                                else => null,
+                            },
+                            64 => switch (lhs_ty.vectorLen()) {
+                                1 => .{ ._sd, .cmp },
+                                2 => .{ ._pd, .cmp },
+                                else => null,
+                            },
+                            16, 80, 128 => null,
+                            else => unreachable,
+                        },
+                        else => unreachable,
+                    },
+                    else => unreachable,
+                })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{
+                    @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                }),
+                mask_reg,
+                mask_reg,
+                Immediate.u(if (has_blend) 3 else 7), // unord, ord
+            );
+            if (has_blend) try self.asmRegisterRegisterRegister(
+                if (@as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag()) {
+                    .Float => switch (lhs_ty.floatBits(self.target.*)) {
+                        32 => .{ ._ps, .blendv },
+                        64 => .{ ._pd, .blendv },
+                        16, 80, 128 => null,
+                        else => unreachable,
+                    },
+                    .Vector => switch (lhs_ty.childType().zigTypeTag()) {
+                        .Float => switch (lhs_ty.childType().floatBits(self.target.*)) {
+                            32 => switch (lhs_ty.vectorLen()) {
+                                1...4 => .{ ._ps, .blendv },
+                                else => null,
+                            },
+                            64 => switch (lhs_ty.vectorLen()) {
+                                1...2 => .{ ._pd, .blendv },
+                                else => null,
+                            },
+                            16, 80, 128 => null,
+                            else => unreachable,
+                        },
+                        else => unreachable,
+                    },
+                    else => unreachable,
+                })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{
+                    @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                }),
+                dst_reg,
+                lhs_copy_reg.?,
+                mask_reg,
+            ) else {
+                try self.asmRegisterRegister(
+                    if (@as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag()) {
+                        .Float => switch (lhs_ty.floatBits(self.target.*)) {
+                            32 => .{ ._ps, .@"and" },
+                            64 => .{ ._pd, .@"and" },
+                            16, 80, 128 => null,
+                            else => unreachable,
+                        },
+                        .Vector => switch (lhs_ty.childType().zigTypeTag()) {
+                            .Float => switch (lhs_ty.childType().floatBits(self.target.*)) {
+                                32 => switch (lhs_ty.vectorLen()) {
+                                    1...4 => .{ ._ps, .@"and" },
+                                    else => null,
+                                },
+                                64 => switch (lhs_ty.vectorLen()) {
+                                    1...2 => .{ ._pd, .@"and" },
+                                    else => null,
+                                },
+                                16, 80, 128 => null,
+                                else => unreachable,
+                            },
+                            else => unreachable,
+                        },
+                        else => unreachable,
+                    })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{
+                        @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                    }),
+                    dst_reg,
+                    mask_reg,
+                );
+                try self.asmRegisterRegister(
+                    if (@as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag()) {
+                        .Float => switch (lhs_ty.floatBits(self.target.*)) {
+                            32 => .{ ._ps, .andn },
+                            64 => .{ ._pd, .andn },
+                            16, 80, 128 => null,
+                            else => unreachable,
+                        },
+                        .Vector => switch (lhs_ty.childType().zigTypeTag()) {
+                            .Float => switch (lhs_ty.childType().floatBits(self.target.*)) {
+                                32 => switch (lhs_ty.vectorLen()) {
+                                    1...4 => .{ ._ps, .andn },
+                                    else => null,
+                                },
+                                64 => switch (lhs_ty.vectorLen()) {
+                                    1...2 => .{ ._pd, .andn },
+                                    else => null,
+                                },
+                                16, 80, 128 => null,
+                                else => unreachable,
+                            },
+                            else => unreachable,
+                        },
+                        else => unreachable,
+                    })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{
+                        @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                    }),
+                    mask_reg,
+                    lhs_copy_reg.?,
+                );
+                try self.asmRegisterRegister(
+                    if (@as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag()) {
+                        .Float => switch (lhs_ty.floatBits(self.target.*)) {
+                            32 => .{ ._ps, .@"or" },
+                            64 => .{ ._pd, .@"or" },
+                            16, 80, 128 => null,
+                            else => unreachable,
+                        },
+                        .Vector => switch (lhs_ty.childType().zigTypeTag()) {
+                            .Float => switch (lhs_ty.childType().floatBits(self.target.*)) {
+                                32 => switch (lhs_ty.vectorLen()) {
+                                    1...4 => .{ ._ps, .@"or" },
+                                    else => null,
+                                },
+                                64 => switch (lhs_ty.vectorLen()) {
+                                    1...2 => .{ ._pd, .@"or" },
+                                    else => null,
+                                },
+                                16, 80, 128 => null,
+                                else => unreachable,
+                            },
+                            else => unreachable,
+                        },
+                        else => unreachable,
+                    })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{
+                        @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                    }),
+                    dst_reg,
+                    mask_reg,
+                );
+            }
+        },
         else => unreachable,
     }
+
     return dst_mcv;
 }
 
@@ -9282,7 +9564,7 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr
                             17...32 => if (self.hasFeature(.avx)) .{ .v_, .movdqa } else null,
                             else => null,
                         },
-                        .Float => switch (ty.floatBits(self.target.*)) {
+                        .Float => switch (ty.scalarType().floatBits(self.target.*)) {
                             16, 128 => switch (abi_size) {
                                 2...4 => if (self.hasFeature(.avx)) .{ .v_d, .mov } else .{ ._d, .mov },
                                 5...8 => if (self.hasFeature(.avx)) .{ .v_q, .mov } else .{ ._q, .mov },
src/arch/x86_64/encoder.zig
@@ -226,8 +226,8 @@ pub const Instruction = struct {
             else => {
                 const mem_op = switch (data.op_en) {
                     .m, .mi, .m1, .mc, .mr, .mri, .mrc, .mvr => inst.ops[0],
-                    .rm, .rmi, .vmi => inst.ops[1],
-                    .rvm, .rvmi => inst.ops[2],
+                    .rm, .rmi, .rm0, .vmi => inst.ops[1],
+                    .rvm, .rvmr, .rvmi => inst.ops[2],
                     else => unreachable,
                 };
                 switch (mem_op) {
@@ -235,7 +235,7 @@ pub const Instruction = struct {
                         const rm = switch (data.op_en) {
                             .m, .mi, .m1, .mc, .vmi => enc.modRmExt(),
                             .mr, .mri, .mrc => inst.ops[1].reg.lowEnc(),
-                            .rm, .rmi, .rvm, .rvmi => inst.ops[0].reg.lowEnc(),
+                            .rm, .rmi, .rm0, .rvm, .rvmr, .rvmi => inst.ops[0].reg.lowEnc(),
                             .mvr => inst.ops[2].reg.lowEnc(),
                             else => unreachable,
                         };
@@ -245,7 +245,7 @@ pub const Instruction = struct {
                         const op = switch (data.op_en) {
                             .m, .mi, .m1, .mc, .vmi => .none,
                             .mr, .mri, .mrc => inst.ops[1],
-                            .rm, .rmi, .rvm, .rvmi => inst.ops[0],
+                            .rm, .rmi, .rm0, .rvm, .rvmr, .rvmi => inst.ops[0],
                             .mvr => inst.ops[2],
                             else => unreachable,
                         };
@@ -257,6 +257,7 @@ pub const Instruction = struct {
                 switch (data.op_en) {
                     .mi => try encodeImm(inst.ops[1].imm, data.ops[1], encoder),
                     .rmi, .mri, .vmi => try encodeImm(inst.ops[2].imm, data.ops[2], encoder),
+                    .rvmr => try encoder.imm8(@as(u8, inst.ops[3].reg.enc()) << 4),
                     .rvmi => try encodeImm(inst.ops[3].imm, data.ops[3], encoder),
                     else => {},
                 }
@@ -298,7 +299,7 @@ pub const Instruction = struct {
             .i, .zi, .o, .oi, .d, .np => null,
             .fd => inst.ops[1].mem.base().reg,
             .td => inst.ops[0].mem.base().reg,
-            .rm, .rmi => if (inst.ops[1].isSegmentRegister())
+            .rm, .rmi, .rm0 => if (inst.ops[1].isSegmentRegister())
                 switch (inst.ops[1]) {
                     .reg => |reg| reg,
                     .mem => |mem| mem.base().reg,
@@ -314,7 +315,7 @@ pub const Instruction = struct {
                 }
             else
                 null,
-            .vmi, .rvm, .rvmi, .mvr => unreachable,
+            .vmi, .rvm, .rvmr, .rvmi, .mvr => unreachable,
         };
         if (segment_override) |seg| {
             legacy.setSegmentOverride(seg);
@@ -333,23 +334,23 @@ pub const Instruction = struct {
         switch (op_en) {
             .np, .i, .zi, .fd, .td, .d => {},
             .o, .oi => rex.b = inst.ops[0].reg.isExtended(),
-            .m, .mi, .m1, .mc, .mr, .rm, .rmi, .mri, .mrc => {
+            .m, .mi, .m1, .mc, .mr, .rm, .rmi, .mri, .mrc, .rm0 => {
                 const r_op = switch (op_en) {
-                    .rm, .rmi => inst.ops[0],
+                    .rm, .rmi, .rm0 => inst.ops[0],
                     .mr, .mri, .mrc => inst.ops[1],
                     else => .none,
                 };
                 rex.r = r_op.isBaseExtended();
 
                 const b_x_op = switch (op_en) {
-                    .rm, .rmi => inst.ops[1],
+                    .rm, .rmi, .rm0 => inst.ops[1],
                     .m, .mi, .m1, .mc, .mr, .mri, .mrc => inst.ops[0],
                     else => unreachable,
                 };
                 rex.b = b_x_op.isBaseExtended();
                 rex.x = b_x_op.isIndexExtended();
             },
-            .vmi, .rvm, .rvmi, .mvr => unreachable,
+            .vmi, .rvm, .rvmr, .rvmi, .mvr => unreachable,
         }
 
         try encoder.rex(rex);
@@ -367,9 +368,9 @@ pub const Instruction = struct {
         switch (op_en) {
             .np, .i, .zi, .fd, .td, .d => {},
             .o, .oi => vex.b = inst.ops[0].reg.isExtended(),
-            .m, .mi, .m1, .mc, .mr, .rm, .rmi, .mri, .mrc, .vmi, .rvm, .rvmi, .mvr => {
+            .m, .mi, .m1, .mc, .mr, .rm, .rmi, .mri, .mrc, .rm0, .vmi, .rvm, .rvmr, .rvmi, .mvr => {
                 const r_op = switch (op_en) {
-                    .rm, .rmi, .rvm, .rvmi => inst.ops[0],
+                    .rm, .rmi, .rm0, .rvm, .rvmr, .rvmi => inst.ops[0],
                     .mr, .mri, .mrc => inst.ops[1],
                     .mvr => inst.ops[2],
                     .m, .mi, .m1, .mc, .vmi => .none,
@@ -378,9 +379,9 @@ pub const Instruction = struct {
                 vex.r = r_op.isBaseExtended();
 
                 const b_x_op = switch (op_en) {
-                    .rm, .rmi, .vmi => inst.ops[1],
+                    .rm, .rmi, .rm0, .vmi => inst.ops[1],
                     .m, .mi, .m1, .mc, .mr, .mri, .mrc, .mvr => inst.ops[0],
-                    .rvm, .rvmi => inst.ops[2],
+                    .rvm, .rvmr, .rvmi => inst.ops[2],
                     else => unreachable,
                 };
                 vex.b = b_x_op.isBaseExtended();
@@ -408,7 +409,7 @@ pub const Instruction = struct {
         switch (op_en) {
             else => {},
             .vmi => vex.v = inst.ops[0].reg,
-            .rvm, .rvmi => vex.v = inst.ops[1].reg,
+            .rvm, .rvmr, .rvmi => vex.v = inst.ops[1].reg,
         }
 
         try encoder.vex(vex);
src/arch/x86_64/Encoding.zig
@@ -178,7 +178,7 @@ pub fn format(
             try writer.print("+{s} ", .{tag});
         },
         .m, .mi, .m1, .mc, .vmi => try writer.print("/{d} ", .{encoding.modRmExt()}),
-        .mr, .rm, .rmi, .mri, .mrc, .rvm, .rvmi, .mvr => try writer.writeAll("/r "),
+        .mr, .rm, .rmi, .mri, .mrc, .rm0, .rvm, .rvmr, .rvmi, .mvr => try writer.writeAll("/r "),
     }
 
     switch (encoding.data.op_en) {
@@ -202,7 +202,8 @@ pub fn format(
             };
             try writer.print("{s} ", .{tag});
         },
-        .np, .fd, .td, .o, .m, .m1, .mc, .mr, .rm, .mrc, .rvm, .mvr => {},
+        .rvmr => try writer.writeAll("/is4 "),
+        .np, .fd, .td, .o, .m, .m1, .mc, .mr, .rm, .mrc, .rm0, .rvm, .mvr => {},
     }
 
     try writer.print("{s} ", .{@tagName(encoding.mnemonic)});
@@ -270,7 +271,7 @@ pub const Mnemonic = enum {
     addps, addss,
     andps,
     andnps,
-    cmpss,
+    cmpps, cmpss,
     cvtpi2ps, cvtps2pi, cvtsi2ss, cvtss2si, cvttps2pi, cvttss2si,
     divps, divss,
     maxps, maxss,
@@ -290,7 +291,7 @@ pub const Mnemonic = enum {
     addpd, addsd,
     andpd,
     andnpd,
-    //cmpsd,
+    cmppd, //cmpsd,
     cvtdq2pd, cvtdq2ps, cvtpd2dq, cvtpd2pi, cvtpd2ps, cvtpi2pd,
     cvtps2dq, cvtps2pd, cvtsd2si, cvtsd2ss, cvtsi2sd, cvtss2sd,
     cvttpd2dq, cvttpd2pi, cvttps2dq, cvttsd2si,
@@ -315,6 +316,7 @@ pub const Mnemonic = enum {
     // SSE3
     movddup, movshdup, movsldup,
     // SSE4.1
+    blendpd, blendps, blendvpd, blendvps,
     extractps,
     insertps,
     pextrb, pextrd, pextrq,
@@ -325,7 +327,9 @@ pub const Mnemonic = enum {
     // AVX
     vaddpd, vaddps, vaddsd, vaddss,
     vandnpd, vandnps, vandpd, vandps,
+    vblendpd, vblendps, vblendvpd, vblendvps,
     vbroadcastf128, vbroadcastsd, vbroadcastss,
+    vcmppd, vcmpps, vcmpsd, vcmpss,
     vcvtdq2pd, vcvtdq2ps, vcvtpd2dq, vcvtpd2ps,
     vcvtps2dq, vcvtps2pd, vcvtsd2si, vcvtsd2ss,
     vcvtsi2sd, vcvtsi2ss, vcvtss2sd, vcvtss2si,
@@ -385,7 +389,7 @@ pub const OpEn = enum {
     fd, td,
     m1, mc, mi, mr, rm,
     rmi, mri, mrc,
-    vmi, rvm, rvmi, mvr,
+    rm0, vmi, rvm, rvmr, rvmi, mvr,
     // zig fmt: on
 };
 
@@ -407,7 +411,7 @@ pub const Op = enum {
     moffs,
     sreg,
     st, mm, mm_m64,
-    xmm, xmm_m32, xmm_m64, xmm_m128,
+    xmm0, xmm, xmm_m32, xmm_m64, xmm_m128,
     ymm, ymm_m256,
     // zig fmt: on
 
@@ -436,7 +440,9 @@ pub const Op = enum {
                 .segment => .sreg,
                 .x87 => .st,
                 .mmx => .mm,
-                .sse => switch (reg.bitSize()) {
+                .sse => if (reg == .xmm0)
+                    .xmm0
+                else switch (reg.bitSize()) {
                     128 => .xmm,
                     256 => .ymm,
                     else => unreachable,
@@ -494,7 +500,7 @@ pub const Op = enum {
             .eax, .r32, .rm32, .r32_m16 => unreachable,
             .rax, .r64, .rm64, .r64_m16 => unreachable,
             .st, .mm, .mm_m64 => unreachable,
-            .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => unreachable,
+            .xmm0, .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => unreachable,
             .ymm, .ymm_m256 => unreachable,
             .m8, .m16, .m32, .m64, .m80, .m128, .m256 => unreachable,
             .unity => 1,
@@ -516,7 +522,7 @@ pub const Op = enum {
             .eax, .r32, .rm32, .r32_m8, .r32_m16 => 32,
             .rax, .r64, .rm64, .r64_m16, .mm, .mm_m64 => 64,
             .st => 80,
-            .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => 128,
+            .xmm0, .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => 128,
             .ymm, .ymm_m256 => 256,
         };
     }
@@ -526,7 +532,8 @@ pub const Op = enum {
             .none, .o16, .o32, .o64, .moffs, .m, .sreg => unreachable,
             .unity, .imm8, .imm8s, .imm16, .imm16s, .imm32, .imm32s, .imm64 => unreachable,
             .rel8, .rel16, .rel32 => unreachable,
-            .al, .cl, .r8, .ax, .r16, .eax, .r32, .rax, .r64, .st, .mm, .xmm, .ymm => unreachable,
+            .al, .cl, .r8, .ax, .r16, .eax, .r32, .rax, .r64 => unreachable,
+            .st, .mm, .xmm0, .xmm, .ymm => unreachable,
             .m8, .rm8, .r32_m8 => 8,
             .m16, .rm16, .r32_m16, .r64_m16 => 16,
             .m32, .rm32, .xmm_m32 => 32,
@@ -558,7 +565,7 @@ pub const Op = enum {
             .rm8, .rm16, .rm32, .rm64,
             .r32_m8, .r32_m16, .r64_m16,
             .st, .mm, .mm_m64,
-            .xmm, .xmm_m32, .xmm_m64, .xmm_m128,
+            .xmm0, .xmm, .xmm_m32, .xmm_m64, .xmm_m128,
             .ymm, .ymm_m256,
             => true,
             else => false,
@@ -612,7 +619,7 @@ pub const Op = enum {
             .sreg => .segment,
             .st => .x87,
             .mm, .mm_m64 => .mmx,
-            .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => .sse,
+            .xmm0, .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => .sse,
             .ymm, .ymm_m256 => .sse,
         };
     }
@@ -629,7 +636,7 @@ pub const Op = enum {
             else => {
                 if (op.isRegister() and target.isRegister()) {
                     return switch (target) {
-                        .cl, .al, .ax, .eax, .rax => op == target,
+                        .cl, .al, .ax, .eax, .rax, .xmm0 => op == target,
                         else => op.class() == target.class() and op.regBitSize() == target.regBitSize(),
                     };
                 }
src/arch/x86_64/encodings.zig
@@ -846,6 +846,8 @@ pub const table = [_]Entry{
 
     .{ .andps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x54 }, 0, .none, .sse },
 
+    .{ .cmpps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x0f, 0xc2 }, 0, .none, .sse },
+
     .{ .cmpss, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0xf3, 0x0f, 0xc2 }, 0, .none, .sse },
 
     .{ .cvtpi2ps, .rm, &.{ .xmm, .mm_m64 }, &.{ 0x0f, 0x2a }, 0, .none, .sse },
@@ -917,6 +919,8 @@ pub const table = [_]Entry{
 
     .{ .andpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x54 }, 0, .none, .sse2 },
 
+    .{ .cmppd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0xc2 }, 0, .none, .sse2 },
+
     .{ .cmpsd, .rmi, &.{ .xmm, .xmm_m64, .imm8 }, &.{ 0xf2, 0x0f, 0xc2 }, 0, .none, .sse2 },
 
     .{ .cvtdq2pd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf3, 0x0f, 0xe6 }, 0, .none, .sse2 },
@@ -1085,6 +1089,14 @@ pub const table = [_]Entry{
     .{ .movsldup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x12 }, 0, .none, .sse3 },
 
     // SSE4.1
+    .{ .blendpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0d }, 0, .none, .sse4_1 },
+
+    .{ .blendps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0c }, 0, .none, .sse4_1 },
+
+    .{ .blendvpd, .rm0, &.{ .xmm, .xmm_m128, .xmm0 }, &.{ 0x66, 0x0f, 0x38, 0x15 }, 0, .none, .sse4_1 },
+
+    .{ .blendvps, .rm0, &.{ .xmm, .xmm_m128, .xmm0 }, &.{ 0x66, 0x0f, 0x38, 0x14 }, 0, .none, .sse4_1 },
+
     .{ .extractps, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x17 }, 0, .none, .sse4_1 },
 
     .{ .insertps, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x21 }, 0, .none, .sse4_1 },
@@ -1146,11 +1158,33 @@ pub const table = [_]Entry{
     .{ .vandps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x54 }, 0, .vex_128_wig, .avx },
     .{ .vandps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x54 }, 0, .vex_256_wig, .avx },
 
+    .{ .vblendpd, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0d }, 0, .vex_128_wig, .avx },
+    .{ .vblendpd, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0d }, 0, .vex_256_wig, .avx },
+
+    .{ .vblendps, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0c }, 0, .vex_128_wig, .avx },
+    .{ .vblendps, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0c }, 0, .vex_256_wig, .avx },
+
+    .{ .vblendvpd, .rvmr, &.{ .xmm, .xmm, .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x3a, 0x4b }, 0, .vex_128_w0, .avx },
+    .{ .vblendvpd, .rvmr, &.{ .ymm, .ymm, .ymm_m256, .ymm }, &.{ 0x66, 0x0f, 0x3a, 0x4b }, 0, .vex_256_w0, .avx },
+
+    .{ .vblendvps, .rvmr, &.{ .xmm, .xmm, .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x3a, 0x4a }, 0, .vex_128_w0, .avx },
+    .{ .vblendvps, .rvmr, &.{ .ymm, .ymm, .ymm_m256, .ymm }, &.{ 0x66, 0x0f, 0x3a, 0x4a }, 0, .vex_256_w0, .avx },
+
     .{ .vbroadcastss,   .rm, &.{ .xmm, .m32  }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_128_w0, .avx },
     .{ .vbroadcastss,   .rm, &.{ .ymm, .m32  }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx },
     .{ .vbroadcastsd,   .rm, &.{ .ymm, .m64  }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx },
     .{ .vbroadcastf128, .rm, &.{ .ymm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x1a }, 0, .vex_256_w0, .avx },
 
+    .{ .vcmppd, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0xc2 }, 0, .vex_128_wig, .avx },
+    .{ .vcmppd, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0xc2 }, 0, .vex_256_wig, .avx },
+
+    .{ .vcmpps, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x0f, 0xc2 }, 0, .vex_128_wig, .avx },
+    .{ .vcmpps, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x0f, 0xc2 }, 0, .vex_256_wig, .avx },
+
+    .{ .vcmpsd, .rvmi, &.{ .xmm, .xmm, .xmm_m64, .imm8 }, &.{ 0xf2, 0x0f, 0xc2 }, 0, .vex_lig_wig, .avx },
+
+    .{ .vcmpss, .rvmi, &.{ .xmm, .xmm, .xmm_m32, .imm8 }, &.{ 0xf3, 0x0f, 0xc2 }, 0, .vex_lig_wig, .avx },
+
     .{ .vcvtdq2pd, .rm, &.{ .xmm, .xmm_m64  }, &.{ 0xf3, 0x0f, 0xe6 }, 0, .vex_128_wig, .avx },
     .{ .vcvtdq2pd, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0xe6 }, 0, .vex_256_wig, .avx },
 
src/arch/x86_64/Lower.zig
@@ -377,6 +377,7 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
         .r => inst.data.r.fixes,
         .rr => inst.data.rr.fixes,
         .rrr => inst.data.rrr.fixes,
+        .rrrr => inst.data.rrrr.fixes,
         .rrri => inst.data.rrri.fixes,
         .rri_s, .rri_u => inst.data.rri.fixes,
         .ri_s, .ri_u => inst.data.ri.fixes,
@@ -430,6 +431,12 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
             .{ .reg = inst.data.rrr.r2 },
             .{ .reg = inst.data.rrr.r3 },
         },
+        .rrrr => &.{
+            .{ .reg = inst.data.rrrr.r1 },
+            .{ .reg = inst.data.rrrr.r2 },
+            .{ .reg = inst.data.rrrr.r3 },
+            .{ .reg = inst.data.rrrr.r4 },
+        },
         .rrri => &.{
             .{ .reg = inst.data.rrri.r1 },
             .{ .reg = inst.data.rrri.r2 },
src/arch/x86_64/Mir.zig
@@ -596,6 +596,16 @@ pub const Inst = struct {
         /// Replicate single floating-point values
         movsldup,
 
+        /// Blend packed single-precision floating-point values
+        /// Blend packed double-precision floating-point values
+        /// Note: x86 defines no scalar form of this instruction, so only the
+        /// packed single- and double-precision variants apply.
+        blend,
+        /// Variable blend packed single-precision floating-point values
+        /// Variable blend packed double-precision floating-point values
+        /// Note: x86 defines no scalar form of this instruction, so only the
+        /// packed single- and double-precision variants apply.
+        blendv,
         /// Extract packed floating-point values
         extract,
         /// Insert scalar single-precision floating-point value
@@ -651,6 +661,9 @@ pub const Inst = struct {
         /// Register, register, register operands.
         /// Uses `rrr` payload.
         rrr,
+        /// Register, register, register, register operands.
+        /// Uses `rrrr` payload.
+        rrrr,
         /// Register, register, register, immediate (byte) operands.
         /// Uses `rrri` payload.
         rrri,
@@ -870,6 +883,13 @@ pub const Inst = struct {
             r2: Register,
             r3: Register,
         },
+        rrrr: struct {
+            fixes: Fixes = ._,
+            r1: Register,
+            r2: Register,
+            r3: Register,
+            r4: Register,
+        },
         rrri: struct {
             fixes: Fixes = ._,
             r1: Register,
test/behavior/maximum_minimum.zig
@@ -24,7 +24,8 @@ test "@max" {
 
 test "@max on vectors" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_x86_64 and
+        !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .sse4_1)) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -72,7 +73,8 @@ test "@min" {
 
 test "@min for vectors" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_x86_64 and
+        !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .sse4_1)) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO