Commit 73a42953c9

Jacob Young <jacobly0@users.noreply.github.com>
2024-12-23 07:56:39
x86_64: 2 means better
1 parent b9c4400
Changed files (5)
src
test
behavior
x86_64
src/arch/x86_64/bits.zig
@@ -150,6 +150,31 @@ pub const Condition = enum(u5) {
             .nz_or_p => .z_and_np,
         };
     }
+
+    /// Returns the equivalent condition when the operands are swapped.
+    pub fn commute(cond: Condition) Condition {
+        return switch (cond) {
+            else => cond,
+            .a => .b,
+            .ae => .be,
+            .b => .a,
+            .be => .ae,
+            .c => .a,
+            .g => .l,
+            .ge => .le,
+            .l => .g,
+            .le => .ge,
+            .na => .nb,
+            .nae => .nbe,
+            .nb => .na,
+            .nbe => .nae,
+            .nc => .na,
+            .ng => .nl,
+            .nge => .nle,
+            .nl => .ng,
+            .nle => .nge,
+        };
+    }
 };
 
 pub const Register = enum(u7) {
src/arch/x86_64/CodeGen.zig
@@ -135,7 +135,8 @@ const Owner = union(enum) {
     }
 };
 
-const MaskKind = enum { sign, all };
+const MaskKind = enum(u1) { sign, all };
+const MaskInfo = packed struct { kind: MaskKind, inverted: bool, scalar: Memory.Size };
 
 pub const MCValue = union(enum) {
     /// No runtime bits. `void` types, empty structs, u0, enums with 1 tag, etc.
@@ -167,7 +168,7 @@ pub const MCValue = union(enum) {
     /// The value is a tuple { wrapped, overflow } where wrapped value is stored in the GP register.
     register_overflow: struct { reg: Register, eflags: Condition },
     /// The value is a bool vector stored in a vector register with a different scalar type.
-    register_mask: struct { reg: Register, kind: MaskKind, inverted: bool, scalar: Memory.Size },
+    register_mask: struct { reg: Register, info: MaskInfo },
     /// The value is in memory at a hard-coded address.
     /// If the type is a pointer, it means the pointer address is stored at this memory location.
     memory: u64,
@@ -509,11 +510,23 @@ pub const MCValue = union(enum) {
             .memory => |pl| try writer.print("[ds:0x{x}]", .{pl}),
             inline .eflags, .register => |pl| try writer.print("{s}", .{@tagName(pl)}),
             .register_pair => |pl| try writer.print("{s}:{s}", .{ @tagName(pl[1]), @tagName(pl[0]) }),
-            .register_triple => |pl| try writer.print("{s}:{s}:{s}", .{ @tagName(pl[2]), @tagName(pl[1]), @tagName(pl[0]) }),
-            .register_quadruple => |pl| try writer.print("{s}:{s}:{s}:{s}", .{ @tagName(pl[3]), @tagName(pl[2]), @tagName(pl[1]), @tagName(pl[0]) }),
+            .register_triple => |pl| try writer.print("{s}:{s}:{s}", .{
+                @tagName(pl[2]), @tagName(pl[1]), @tagName(pl[0]),
+            }),
+            .register_quadruple => |pl| try writer.print("{s}:{s}:{s}:{s}", .{
+                @tagName(pl[3]), @tagName(pl[2]), @tagName(pl[1]), @tagName(pl[0]),
+            }),
             .register_offset => |pl| try writer.print("{s} + 0x{x}", .{ @tagName(pl.reg), pl.off }),
-            .register_overflow => |pl| try writer.print("{s}:{s}", .{ @tagName(pl.eflags), @tagName(pl.reg) }),
-            .register_mask => |pl| try writer.print("mask({s},{}):{s}", .{ @tagName(pl.kind), pl.scalar, @tagName(pl.reg) }),
+            .register_overflow => |pl| try writer.print("{s}:{s}", .{
+                @tagName(pl.eflags),
+                @tagName(pl.reg),
+            }),
+            .register_mask => |pl| try writer.print("mask({s},{}):{c}{s}", .{
+                @tagName(pl.info.kind),
+                pl.info.scalar,
+                @as(u8, if (pl.info.inverted) '!' else ' '),
+                @tagName(pl.reg),
+            }),
             .load_symbol => |pl| try writer.print("[sym:{} + 0x{x}]", .{ pl.sym_index, pl.off }),
             .lea_symbol => |pl| try writer.print("sym:{} + 0x{x}", .{ pl.sym_index, pl.off }),
             .indirect => |pl| try writer.print("[{s} + 0x{x}]", .{ @tagName(pl.reg), pl.off }),
@@ -524,7 +537,9 @@ pub const MCValue = union(enum) {
             .load_tlv => |pl| try writer.print("[tlv:{d}]", .{pl}),
             .lea_tlv => |pl| try writer.print("tlv:{d}", .{pl}),
             .load_frame => |pl| try writer.print("[{} + 0x{x}]", .{ pl.index, pl.off }),
-            .elementwise_regs_then_frame => |pl| try writer.print("elementwise:{d}:[{} + 0x{x}]", .{ pl.regs, pl.frame_index, pl.frame_off }),
+            .elementwise_regs_then_frame => |pl| try writer.print("elementwise:{d}:[{} + 0x{x}]", .{
+                pl.regs, pl.frame_index, pl.frame_off,
+            }),
             .lea_frame => |pl| try writer.print("{} + 0x{x}", .{ pl.index, pl.off }),
             .reserved_frame => |pl| try writer.print("(dead:{})", .{pl}),
             .air_ref => |pl| try writer.print("(air:0x{x})", .{@intFromEnum(pl)}),
@@ -2390,13 +2405,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
 
             .div_float, .div_trunc, .div_floor, .div_exact => try cg.airMulDivBinOp(inst),
 
-            .cmp_lt  => try cg.airCmp(inst, .lt),
-            .cmp_lte => try cg.airCmp(inst, .lte),
-            .cmp_eq  => try cg.airCmp(inst, .eq),
-            .cmp_gte => try cg.airCmp(inst, .gte),
-            .cmp_gt  => try cg.airCmp(inst, .gt),
-            .cmp_neq => try cg.airCmp(inst, .neq),
-
             .cmp_lt_errors_len => try cg.airCmpLtErrorsLen(inst),
 
             .bitcast          => try cg.airBitCast(inst),
@@ -2474,12 +2482,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
             .rem_optimized,
             .mod_optimized,
             .neg_optimized,
-            .cmp_lt_optimized,
-            .cmp_lte_optimized,
-            .cmp_eq_optimized,
-            .cmp_gte_optimized,
-            .cmp_gt_optimized,
-            .cmp_neq_optimized,
             .reduce_optimized,
             .int_from_float_optimized,
             => return cg.fail("TODO implement optimized float mode", .{}),
@@ -2512,148 +2514,313 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                 const bin_op = air_datas[@intFromEnum(inst)].bin_op;
                 var ops = try cg.tempsFromOperands(inst, .{ bin_op.lhs, bin_op.rhs });
                 var res: [1]Temp = undefined;
-                try cg.select(&res, &.{cg.typeOfIndex(inst)}, &ops, pattern_sets: switch (air_tag) {
+                cg.select2(&res, &.{cg.typeOfIndex(inst)}, &ops, switch (@as(Mir.Inst.Tag, switch (air_tag) {
                     else => unreachable,
-                    inline .bit_and, .bit_or, .xor => |ct_air_tag| {
-                        const mir_tag: Mir.Inst.Tag = switch (ct_air_tag) {
-                            else => unreachable,
-                            .bit_and => .@"and",
-                            .bit_or => .@"or",
-                            .xor => .xor,
-                        };
-                        break :pattern_sets &.{
-                            .{
-                                .required_features = &.{.avx2},
-                                .mir_tag = .{ .vp_, mir_tag },
-                                .patterns = &.{
-                                    .{ .ops = &.{ .ymm, .ymm, .mem } },
-                                    .{ .ops = &.{ .ymm, .mem, .ymm }, .commute = .{ 1, 2 } },
-                                    .{ .ops = &.{ .ymm, .ymm, .ymm } },
-                                },
-                            },
-                            .{
-                                .required_features = &.{.avx},
-                                .mir_tag = .{ .vp_, mir_tag },
-                                .patterns = &.{
-                                    .{ .ops = &.{ .xmm, .xmm, .mem } },
-                                    .{ .ops = &.{ .xmm, .mem, .xmm }, .commute = .{ 1, 2 } },
-                                    .{ .ops = &.{ .xmm, .xmm, .xmm } },
-                                },
-                            },
-                            .{
-                                .required_features = &.{.sse2},
-                                .mir_tag = .{ .p_, mir_tag },
-                                .patterns = &.{
-                                    .{ .ops = &.{ .xmm, .{ .implicit = 0 }, .mem } },
-                                    .{ .ops = &.{ .xmm, .mem, .{ .implicit = 0 } } },
-                                    .{ .ops = &.{ .xmm, .{ .implicit = 0 }, .xmm } },
-                                    .{ .ops = &.{ .xmm, .xmm, .{ .implicit = 0 } } },
-                                },
-                            },
-                            .{
-                                .required_features = &.{.sse},
-                                .mir_tag = .{ ._ps, mir_tag },
-                                .patterns = &.{
-                                    .{ .ops = &.{ .xmm, .{ .implicit = 0 }, .mem } },
-                                    .{ .ops = &.{ .xmm, .mem, .{ .implicit = 0 } } },
-                                    .{ .ops = &.{ .xmm, .{ .implicit = 0 }, .xmm } },
-                                    .{ .ops = &.{ .xmm, .xmm, .{ .implicit = 0 } } },
-                                },
-                            },
-                            .{
-                                .required_features = &.{.mmx},
-                                .mir_tag = .{ .p_, mir_tag },
-                                .patterns = &.{
-                                    .{ .ops = &.{ .mm, .{ .implicit = 0 }, .mem } },
-                                    .{ .ops = &.{ .mm, .mem, .{ .implicit = 0 } } },
-                                    .{ .ops = &.{ .mm, .{ .implicit = 0 }, .mm } },
-                                    .{ .ops = &.{ .mm, .mm, .{ .implicit = 0 } } },
-                                },
+                    .bit_and => .@"and",
+                    .bit_or => .@"or",
+                    .xor => .xor,
+                })) {
+                    else => unreachable,
+                    inline .@"and", .@"or", .xor => |mir_tag| comptime &.{ .{
+                        .required_features = .{ .avx2, null },
+                        .patterns = &.{
+                            .{ .src = .{ .ymm, .mem } },
+                            .{ .src = .{ .mem, .ymm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .ymm, .ymm } },
+                        },
+                        .dst_temps = .{.{ .rc = .sse }},
+                        .each = .{ .once = &.{
+                            .{ .vp_, mir_tag, .ydst0, .ysrc0, .ysrc1, .none },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null },
+                        .patterns = &.{
+                            .{ .src = .{ .ymm, .mem } },
+                            .{ .src = .{ .mem, .ymm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .ymm, .ymm } },
+                        },
+                        .dst_temps = .{.{ .rc = .sse }},
+                        .each = .{ .once = &.{
+                            .{ .v_pd, mir_tag, .ydst0, .ysrc0, .ysrc1, .none },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null },
+                        .patterns = &.{
+                            .{ .src = .{ .xmm, .mem } },
+                            .{ .src = .{ .mem, .xmm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .xmm, .xmm } },
+                        },
+                        .dst_temps = .{.{ .rc = .sse }},
+                        .each = .{ .once = &.{
+                            .{ .vp_, mir_tag, .xdst0, .xsrc0, .xsrc1, .none },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse2, null },
+                        .patterns = &.{
+                            .{ .src = .{ .mut_xmm, .mem } },
+                            .{ .src = .{ .mem, .mut_xmm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_xmm, .xmm } },
+                        },
+                        .dst_temps = .{.{ .src = 0 }},
+                        .each = .{ .once = &.{
+                            .{ .p_, mir_tag, .xdst0, .xsrc1, .none, .none },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse, null },
+                        .patterns = &.{
+                            .{ .src = .{ .mut_xmm, .mem } },
+                            .{ .src = .{ .mem, .mut_xmm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_xmm, .xmm } },
+                        },
+                        .dst_temps = .{.{ .src = 0 }},
+                        .each = .{ .once = &.{
+                            .{ ._ps, mir_tag, .xdst0, .xsrc1, .none, .none },
+                        } },
+                    }, .{
+                        .required_features = .{ .mmx, null },
+                        .patterns = &.{
+                            .{ .src = .{ .mut_mm, .mem } },
+                            .{ .src = .{ .mem, .mut_mm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_mm, .mm } },
+                        },
+                        .dst_temps = .{.{ .src = 0 }},
+                        .each = .{ .once = &.{
+                            .{ .p_, mir_tag, .rdst0, .rsrc1, .none, .none },
+                        } },
+                    }, .{
+                        .constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
+                        .patterns = &.{
+                            .{ .src = .{ .mut_mem, .imm8 } },
+                            .{ .src = .{ .imm8, .mut_mem }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_gpr, .imm8 } },
+                            .{ .src = .{ .imm8, .mut_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_mem, .gpr } },
+                            .{ .src = .{ .gpr, .mut_mem }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_gpr, .mem } },
+                            .{ .src = .{ .mem, .mut_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_gpr, .gpr } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .dst_temps = .{.{ .src = 0 }},
+                        .each = .{ .once = &.{
+                            .{ ._, mir_tag, .dst0b, .src1b, .none, .none },
+                        } },
+                    }, .{
+                        .constraints = .{ .{ .int = .word }, .{ .int = .word } },
+                        .patterns = &.{
+                            .{ .src = .{ .mut_mem, .imm16 } },
+                            .{ .src = .{ .imm16, .mut_mem }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_gpr, .imm16 } },
+                            .{ .src = .{ .imm16, .mut_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_mem, .gpr } },
+                            .{ .src = .{ .gpr, .mut_mem }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_gpr, .mem } },
+                            .{ .src = .{ .mem, .mut_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_gpr, .gpr } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .dst_temps = .{.{ .src = 0 }},
+                        .each = .{ .once = &.{
+                            .{ ._, mir_tag, .dst0w, .src1w, .none, .none },
+                        } },
+                    }, .{
+                        .constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
+                        .patterns = &.{
+                            .{ .src = .{ .mut_mem, .imm32 } },
+                            .{ .src = .{ .imm32, .mut_mem }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_gpr, .imm32 } },
+                            .{ .src = .{ .imm32, .mut_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_mem, .gpr } },
+                            .{ .src = .{ .gpr, .mut_mem }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_gpr, .mem } },
+                            .{ .src = .{ .mem, .mut_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_gpr, .gpr } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .dst_temps = .{.{ .src = 0 }},
+                        .each = .{ .once = &.{
+                            .{ ._, mir_tag, .edst0, .esrc1, .none, .none },
+                        } },
+                    }, .{
+                        .required_features = .{ .@"64bit", null },
+                        .constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
+                        .patterns = &.{
+                            .{ .src = .{ .mut_mem, .simm32 } },
+                            .{ .src = .{ .simm32, .mut_mem }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_gpr, .simm32 } },
+                            .{ .src = .{ .simm32, .mut_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_mem, .gpr } },
+                            .{ .src = .{ .gpr, .mut_mem }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_gpr, .mem } },
+                            .{ .src = .{ .mem, .mut_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_gpr, .gpr } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .dst_temps = .{.{ .src = 0 }},
+                        .each = .{ .once = &.{
+                            .{ ._, mir_tag, .rdst0, .rsrc1, .none, .none },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx2, null },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .kind = .{ .rc = .sse } },
+                            .unused,
+                        },
+                        .dst_temps = .{.mem},
+                        .each = .{ .limb = .{
+                            .of = .ysrc0,
+                            .body = &.{
+                                .{ .v_, .movdqu, .ytmp1, .{ .src_limb = 0 }, .none, .none },
+                                .{ .vp_, mir_tag, .ytmp1, .ytmp1, .{ .src_limb = 1 }, .none },
+                                .{ .v_, .movdqu, .{ .dst_limb = 0 }, .ytmp1, .none, .none },
                             },
-                            .{
-                                .clobbers = .{ .eflags = true },
-                                .mir_tag = .{ ._, mir_tag },
-                                .patterns = &.{
-                                    .{ .ops = &.{ .mem, .{ .implicit = 0 }, .simm32 } },
-                                    .{ .ops = &.{ .mem, .simm32, .{ .implicit = 0 } } },
-                                    .{ .ops = &.{ .mem, .{ .implicit = 0 }, .gpr } },
-                                    .{ .ops = &.{ .mem, .gpr, .{ .implicit = 0 } } },
-                                    .{ .ops = &.{ .gpr, .{ .implicit = 0 }, .simm32 } },
-                                    .{ .ops = &.{ .gpr, .simm32, .{ .implicit = 0 } } },
-                                    .{ .ops = &.{ .gpr, .{ .implicit = 0 }, .mem } },
-                                    .{ .ops = &.{ .gpr, .mem, .{ .implicit = 0 } } },
-                                    .{ .ops = &.{ .gpr, .{ .implicit = 0 }, .gpr } },
-                                    .{ .ops = &.{ .gpr, .gpr, .{ .implicit = 0 } } },
-                                },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .kind = .{ .rc = .sse } },
+                            .unused,
+                        },
+                        .dst_temps = .{.mem},
+                        .each = .{ .limb = .{
+                            .of = .ysrc0,
+                            .body = &.{
+                                .{ .v_pd, .movu, .ytmp1, .{ .src_limb = 0 }, .none, .none },
+                                .{ .v_pd, mir_tag, .ytmp1, .ytmp1, .{ .src_limb = 1 }, .none },
+                                .{ .v_pd, .movu, .{ .dst_limb = 0 }, .ytmp1, .none, .none },
                             },
-
-                            .{
-                                .required_features = &.{.avx2},
-                                .loop = .bitwise,
-                                .mir_tag = .{ .vp_, mir_tag },
-                                .patterns = &.{
-                                    .{ .ops = &.{ .ymm_limb, .{ .explicit = 0 }, .mem_limb } },
-                                    .{ .ops = &.{ .ymm_limb, .mem_limb, .{ .explicit = 0 } }, .commute = .{ 1, 2 } },
-                                    .{ .ops = &.{ .ymm_limb, .ymm_limb, .mem_limb } },
-                                    .{ .ops = &.{ .ymm_limb, .ymm_limb, .ymm_limb } },
-                                },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .kind = .{ .rc = .sse } },
+                            .unused,
+                        },
+                        .dst_temps = .{.mem},
+                        .each = .{ .limb = .{
+                            .of = .xsrc0,
+                            .body = &.{
+                                .{ .v_, .movdqu, .xtmp1, .{ .src_limb = 0 }, .none, .none },
+                                .{ .vp_, mir_tag, .xtmp1, .xtmp1, .{ .src_limb = 1 }, .none },
+                                .{ .v_, .movdqu, .{ .dst_limb = 0 }, .xtmp1, .none, .none },
                             },
-                            .{
-                                .required_features = &.{.avx},
-                                .loop = .bitwise,
-                                .mir_tag = .{ .vp_, mir_tag },
-                                .patterns = &.{
-                                    .{ .ops = &.{ .xmm_limb, .{ .explicit = 0 }, .mem_limb } },
-                                    .{ .ops = &.{ .xmm_limb, .mem_limb, .{ .explicit = 0 } }, .commute = .{ 1, 2 } },
-                                    .{ .ops = &.{ .xmm_limb, .xmm_limb, .mem_limb } },
-                                    .{ .ops = &.{ .xmm_limb, .xmm_limb, .xmm_limb } },
-                                },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse2, null },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .kind = .{ .rc = .sse } },
+                            .unused,
+                        },
+                        .dst_temps = .{.mem},
+                        .each = .{ .limb = .{
+                            .of = .xsrc0,
+                            .body = &.{
+                                .{ ._, .movdqu, .xtmp1, .{ .src_limb = 0 }, .none, .none },
+                                .{ .p_, mir_tag, .xtmp1, .{ .src_limb = 1 }, .none, .none },
+                                .{ ._, .movdqu, .{ .dst_limb = 0 }, .xtmp1, .none, .none },
                             },
-                            .{
-                                .required_features = &.{.sse2},
-                                .loop = .bitwise,
-                                .mir_tag = .{ .p_, mir_tag },
-                                .patterns = &.{
-                                    .{ .ops = &.{ .xmm_limb, .{ .implicit = 0 }, .mem_limb } },
-                                    .{ .ops = &.{ .xmm_limb, .mem_limb, .{ .implicit = 0 } } },
-                                    .{ .ops = &.{ .xmm_limb, .{ .implicit = 0 }, .xmm_limb } },
-                                },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse, null },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .kind = .{ .rc = .sse } },
+                            .unused,
+                        },
+                        .dst_temps = .{.mem},
+                        .each = .{ .limb = .{
+                            .of = .xsrc0,
+                            .body = &.{
+                                .{ ._ps, .movu, .xtmp1, .{ .src_limb = 0 }, .none, .none },
+                                .{ ._ps, mir_tag, .xtmp1, .{ .src_limb = 1 }, .none, .none },
+                                .{ ._ps, .movu, .{ .dst_limb = 0 }, .xtmp1, .none, .none },
                             },
-                            .{
-                                .required_features = &.{.sse},
-                                .loop = .bitwise,
-                                .mir_tag = .{ ._ps, mir_tag },
-                                .patterns = &.{
-                                    .{ .ops = &.{ .xmm_limb, .{ .implicit = 0 }, .mem_limb } },
-                                    .{ .ops = &.{ .xmm_limb, .mem_limb, .{ .implicit = 0 } } },
-                                    .{ .ops = &.{ .xmm_limb, .{ .implicit = 0 }, .xmm_limb } },
-                                },
+                        } },
+                    }, .{
+                        .required_features = .{ .mmx, null },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .kind = .{ .rc = .mmx } },
+                            .unused,
+                        },
+                        .dst_temps = .{.mem},
+                        .each = .{ .limb = .{
+                            .of = .rsrc0,
+                            .body = &.{
+                                .{ ._q, .mov, .rtmp1, .{ .src_limb = 0 }, .none, .none },
+                                .{ .p_, mir_tag, .rtmp1, .{ .src_limb = 1 }, .none, .none },
+                                .{ ._q, .mov, .{ .dst_limb = 0 }, .rtmp1, .none, .none },
                             },
-                            .{
-                                .required_features = &.{.mmx},
-                                .loop = .bitwise,
-                                .mir_tag = .{ .p_, mir_tag },
-                                .patterns = &.{
-                                    .{ .ops = &.{ .mm_limb, .{ .implicit = 0 }, .mem_limb } },
-                                    .{ .ops = &.{ .mm_limb, .mem_limb, .{ .implicit = 0 } } },
-                                    .{ .ops = &.{ .mm_limb, .{ .implicit = 0 }, .mm_limb } },
-                                },
+                        } },
+                    }, .{
+                        .required_features = .{ .@"64bit", null },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                            .unused,
+                        },
+                        .dst_temps = .{.mem},
+                        .each = .{ .limb = .{
+                            .of = .rsrc0,
+                            .body = &.{
+                                .{ ._, .mov, .rtmp1, .{ .src_limb = 0 }, .none, .none },
+                                .{ ._, mir_tag, .rtmp1, .{ .src_limb = 1 }, .none, .none },
+                                .{ ._, .mov, .{ .dst_limb = 0 }, .rtmp1, .none, .none },
                             },
-                            .{
-                                .clobbers = .{ .eflags = true },
-                                .loop = .bitwise,
-                                .mir_tag = .{ ._, mir_tag },
-                                .patterns = &.{
-                                    .{ .ops = &.{ .mem_limb, .{ .implicit = 0 }, .gpr_limb } },
-                                    .{ .ops = &.{ .mem_limb, .gpr_limb, .{ .implicit = 0 } } },
-                                    .{ .ops = &.{ .gpr_limb, .{ .implicit = 0 }, .mem_limb } },
-                                    .{ .ops = &.{ .gpr_limb, .mem_limb, .{ .implicit = 0 } } },
-                                    .{ .ops = &.{ .gpr_limb, .{ .implicit = 0 }, .gpr_limb } },
-                                },
+                        } },
+                    }, .{
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .unused,
+                        },
+                        .dst_temps = .{.mem},
+                        .each = .{ .limb = .{
+                            .of = .esrc0,
+                            .body = &.{
+                                .{ ._, .mov, .etmp1, .{ .src_limb = 0 }, .none, .none },
+                                .{ ._, mir_tag, .etmp1, .{ .src_limb = 1 }, .none, .none },
+                                .{ ._, .mov, .{ .dst_limb = 0 }, .etmp1, .none, .none },
                             },
-                        };
-                    },
-                }, .{});
+                        } },
+                    } },
+                }) catch |err2| switch (err2) {
+                    error.Select2Failed => return cg.fail("failed to select2 {s} {} {} {}", .{
+                        @tagName(air_tag),
+                        cg.typeOf(bin_op.lhs).fmt(pt),
+                        ops[0].tracking(cg),
+                        ops[1].tracking(cg),
+                    }),
+                    else => |e| return e,
+                };
                 if (ops[0].index != res[0].index) try ops[0].die(cg);
                 if (ops[1].index != res[0].index) try ops[1].die(cg);
                 try res[0].moveTo(inst, cg);
@@ -2720,425 +2887,1300 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                 switch (extra.compareOperator()) {
                     .lt => unreachable,
                     .lte => unreachable,
-                    .eq, .neq => |cmp_op| try cg.select(&res, &.{cg.typeOfIndex(inst)}, &ops, &.{
-                        .{
-                            .required_features = &.{.avx2},
-                            .scalar = .{ .any_int = .byte },
-                            .mir_tag = .{ .vp_b, .cmpeq },
+                    .eq, .neq => |cmp_op| cg.select2(&res, &.{cg.typeOfIndex(inst)}, &ops, switch (@as(Condition, switch (cmp_op) {
+                        else => unreachable,
+                        .eq => .e,
+                        .neq => .ne,
+                    })) {
+                        else => unreachable,
+                        inline .e, .ne => |cc| comptime &.{ .{
+                            .required_features = .{ .avx2, null },
+                            .constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
                             .patterns = &.{
-                                .{ .ops = &.{ .ymm_mask, .ymm, .mem } },
-                                .{ .ops = &.{ .ymm_mask, .mem, .ymm }, .commute = .{ 1, 2 } },
-                                .{ .ops = &.{ .ymm_mask, .ymm, .ymm } },
+                                .{ .src = .{ .ymm, .mem } },
+                                .{ .src = .{ .mem, .ymm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .ymm, .ymm } },
                             },
-                        },
-                        .{
-                            .required_features = &.{.avx2},
-                            .scalar = .{ .any_int = .word },
-                            .mir_tag = .{ .vp_w, .cmpeq },
+                            .dst_temps = .{.{ .rc_mask = .{ .rc = .sse, .info = .{
+                                .kind = .all,
+                                .inverted = switch (cc) {
+                                    else => unreachable,
+                                    .e => false,
+                                    .ne => true,
+                                },
+                                .scalar = .byte,
+                            } } }},
+                            .each = .{ .once = &.{
+                                .{ .vp_b, .cmpeq, .ydst0, .ysrc0, .ysrc1, .none },
+                            } },
+                        }, .{
+                            .required_features = .{ .avx2, null },
+                            .constraints = .{ .{ .int = .word }, .{ .int = .word } },
                             .patterns = &.{
-                                .{ .ops = &.{ .ymm_mask, .ymm, .mem } },
-                                .{ .ops = &.{ .ymm_mask, .mem, .ymm }, .commute = .{ 1, 2 } },
-                                .{ .ops = &.{ .ymm_mask, .ymm, .ymm } },
+                                .{ .src = .{ .ymm, .mem } },
+                                .{ .src = .{ .mem, .ymm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .ymm, .ymm } },
                             },
-                        },
-                        .{
-                            .required_features = &.{.avx2},
-                            .scalar = .{ .any_int = .dword },
-                            .mir_tag = .{ .vp_d, .cmpeq },
+                            .dst_temps = .{.{ .rc_mask = .{ .rc = .sse, .info = .{
+                                .kind = .all,
+                                .inverted = switch (cc) {
+                                    else => unreachable,
+                                    .e => false,
+                                    .ne => true,
+                                },
+                                .scalar = .word,
+                            } } }},
+                            .each = .{ .once = &.{
+                                .{ .vp_w, .cmpeq, .ydst0, .ysrc0, .ysrc1, .none },
+                            } },
+                        }, .{
+                            .required_features = .{ .avx2, null },
+                            .constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
                             .patterns = &.{
-                                .{ .ops = &.{ .ymm_mask, .ymm, .mem } },
-                                .{ .ops = &.{ .ymm_mask, .mem, .ymm }, .commute = .{ 1, 2 } },
-                                .{ .ops = &.{ .ymm_mask, .ymm, .ymm } },
+                                .{ .src = .{ .ymm, .mem } },
+                                .{ .src = .{ .mem, .ymm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .ymm, .ymm } },
                             },
-                        },
-                        .{
-                            .required_features = &.{.avx2},
-                            .scalar = .{ .any_int = .qword },
-                            .mir_tag = .{ .vp_q, .cmpeq },
+                            .dst_temps = .{.{ .rc_mask = .{ .rc = .sse, .info = .{
+                                .kind = .all,
+                                .inverted = switch (cc) {
+                                    else => unreachable,
+                                    .e => false,
+                                    .ne => true,
+                                },
+                                .scalar = .dword,
+                            } } }},
+                            .each = .{ .once = &.{
+                                .{ .vp_d, .cmpeq, .ydst0, .ysrc0, .ysrc1, .none },
+                            } },
+                        }, .{
+                            .required_features = .{ .avx2, null },
+                            .constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
                             .patterns = &.{
-                                .{ .ops = &.{ .ymm_mask, .ymm, .mem } },
-                                .{ .ops = &.{ .ymm_mask, .mem, .ymm }, .commute = .{ 1, 2 } },
-                                .{ .ops = &.{ .ymm_mask, .ymm, .ymm } },
+                                .{ .src = .{ .ymm, .mem } },
+                                .{ .src = .{ .mem, .ymm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .ymm, .ymm } },
                             },
-                        },
-                        .{
-                            .required_features = &.{.avx},
-                            .scalar = .{ .any_int = .byte },
-                            .mir_tag = .{ .vp_b, .cmpeq },
+                            .dst_temps = .{.{ .rc_mask = .{ .rc = .sse, .info = .{
+                                .kind = .all,
+                                .inverted = switch (cc) {
+                                    else => unreachable,
+                                    .e => false,
+                                    .ne => true,
+                                },
+                                .scalar = .qword,
+                            } } }},
+                            .each = .{ .once = &.{
+                                .{ .vp_q, .cmpeq, .ydst0, .ysrc0, .ysrc1, .none },
+                            } },
+                        }, .{
+                            .required_features = .{ .avx, null },
+                            .constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
                             .patterns = &.{
-                                .{ .ops = &.{ .xmm_mask, .xmm, .mem } },
-                                .{ .ops = &.{ .xmm_mask, .mem, .xmm }, .commute = .{ 1, 2 } },
-                                .{ .ops = &.{ .xmm_mask, .xmm, .xmm } },
+                                .{ .src = .{ .xmm, .mem } },
+                                .{ .src = .{ .mem, .xmm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .xmm, .xmm } },
                             },
-                        },
-                        .{
-                            .required_features = &.{.avx},
-                            .scalar = .{ .any_int = .word },
-                            .mir_tag = .{ .vp_w, .cmpeq },
+                            .dst_temps = .{.{ .rc_mask = .{ .rc = .sse, .info = .{
+                                .kind = .all,
+                                .inverted = switch (cc) {
+                                    else => unreachable,
+                                    .e => false,
+                                    .ne => true,
+                                },
+                                .scalar = .byte,
+                            } } }},
+                            .each = .{ .once = &.{
+                                .{ .vp_b, .cmpeq, .xdst0, .xsrc0, .xsrc1, .none },
+                            } },
+                        }, .{
+                            .required_features = .{ .avx, null },
+                            .constraints = .{ .{ .int = .word }, .{ .int = .word } },
                             .patterns = &.{
-                                .{ .ops = &.{ .xmm_mask, .xmm, .mem } },
-                                .{ .ops = &.{ .xmm_mask, .mem, .xmm }, .commute = .{ 1, 2 } },
-                                .{ .ops = &.{ .xmm_mask, .xmm, .xmm } },
+                                .{ .src = .{ .xmm, .mem } },
+                                .{ .src = .{ .mem, .xmm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .xmm, .xmm } },
                             },
-                        },
-                        .{
-                            .required_features = &.{.avx},
-                            .scalar = .{ .any_int = .dword },
-                            .mir_tag = .{ .vp_d, .cmpeq },
+                            .dst_temps = .{.{ .rc_mask = .{ .rc = .sse, .info = .{
+                                .kind = .all,
+                                .inverted = switch (cc) {
+                                    else => unreachable,
+                                    .e => false,
+                                    .ne => true,
+                                },
+                                .scalar = .word,
+                            } } }},
+                            .each = .{ .once = &.{
+                                .{ .vp_w, .cmpeq, .xdst0, .xsrc0, .xsrc1, .none },
+                            } },
+                        }, .{
+                            .required_features = .{ .avx, null },
+                            .constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
                             .patterns = &.{
-                                .{ .ops = &.{ .xmm_mask, .xmm, .mem } },
-                                .{ .ops = &.{ .xmm_mask, .mem, .xmm }, .commute = .{ 1, 2 } },
-                                .{ .ops = &.{ .xmm_mask, .xmm, .xmm } },
+                                .{ .src = .{ .xmm, .mem } },
+                                .{ .src = .{ .mem, .xmm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .xmm, .xmm } },
+                            },
+                            .dst_temps = .{.{ .rc_mask = .{ .rc = .sse, .info = .{
+                                .kind = .all,
+                                .inverted = switch (cc) {
+                                    else => unreachable,
+                                    .e => false,
+                                    .ne => true,
+                                },
+                                .scalar = .dword,
+                            } } }},
+                            .each = .{ .once = &.{
+                                .{ .vp_d, .cmpeq, .xdst0, .xsrc0, .xsrc1, .none },
+                            } },
+                        }, .{
+                            .required_features = .{ .avx, null },
+                            .constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
+                            .patterns = &.{
+                                .{ .src = .{ .xmm, .mem } },
+                                .{ .src = .{ .mem, .xmm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .xmm, .xmm } },
+                            },
+                            .dst_temps = .{.{ .rc_mask = .{ .rc = .sse, .info = .{
+                                .kind = .all,
+                                .inverted = switch (cc) {
+                                    else => unreachable,
+                                    .e => false,
+                                    .ne => true,
+                                },
+                                .scalar = .qword,
+                            } } }},
+                            .each = .{ .once = &.{
+                                .{ .vp_q, .cmpeq, .xdst0, .xsrc0, .xsrc1, .none },
+                            } },
+                        }, .{
+                            .required_features = .{ .sse2, null },
+                            .constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
+                            .patterns = &.{
+                                .{ .src = .{ .mut_xmm, .mem } },
+                                .{ .src = .{ .mem, .mut_xmm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_xmm, .xmm } },
+                            },
+                            .dst_temps = .{.{ .src_mask = .{ .src = 0, .info = .{
+                                .kind = .all,
+                                .inverted = switch (cc) {
+                                    else => unreachable,
+                                    .e => false,
+                                    .ne => true,
+                                },
+                                .scalar = .byte,
+                            } } }},
+                            .each = .{ .once = &.{
+                                .{ .p_b, .cmpeq, .xdst0, .xsrc1, .none, .none },
+                            } },
+                        }, .{
+                            .required_features = .{ .sse2, null },
+                            .constraints = .{ .{ .int = .word }, .{ .int = .word } },
+                            .patterns = &.{
+                                .{ .src = .{ .mut_xmm, .mem } },
+                                .{ .src = .{ .mem, .mut_xmm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_xmm, .xmm } },
+                            },
+                            .dst_temps = .{.{ .src_mask = .{ .src = 0, .info = .{
+                                .kind = .all,
+                                .inverted = switch (cc) {
+                                    else => unreachable,
+                                    .e => false,
+                                    .ne => true,
+                                },
+                                .scalar = .word,
+                            } } }},
+                            .each = .{ .once = &.{
+                                .{ .p_w, .cmpeq, .xdst0, .xsrc1, .none, .none },
+                            } },
+                        }, .{
+                            .required_features = .{ .sse2, null },
+                            .constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
+                            .patterns = &.{
+                                .{ .src = .{ .mut_xmm, .mem } },
+                                .{ .src = .{ .mem, .mut_xmm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_xmm, .xmm } },
+                            },
+                            .dst_temps = .{.{ .src_mask = .{ .src = 0, .info = .{
+                                .kind = .all,
+                                .inverted = switch (cc) {
+                                    else => unreachable,
+                                    .e => false,
+                                    .ne => true,
+                                },
+                                .scalar = .dword,
+                            } } }},
+                            .each = .{ .once = &.{
+                                .{ .p_d, .cmpeq, .xdst0, .xsrc1, .none, .none },
+                            } },
+                        }, .{
+                            .required_features = .{ .sse4_1, null },
+                            .constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
+                            .patterns = &.{
+                                .{ .src = .{ .mut_xmm, .mem } },
+                                .{ .src = .{ .mem, .mut_xmm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_xmm, .xmm } },
+                            },
+                            .dst_temps = .{.{ .src_mask = .{ .src = 0, .info = .{
+                                .kind = .all,
+                                .inverted = switch (cc) {
+                                    else => unreachable,
+                                    .e => false,
+                                    .ne => true,
+                                },
+                                .scalar = .qword,
+                            } } }},
+                            .each = .{ .once = &.{
+                                .{ .p_q, .cmpeq, .xdst0, .xsrc1, .none, .none },
+                            } },
+                        }, .{
+                            .required_features = .{ .mmx, null },
+                            .constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
+                            .patterns = &.{
+                                .{ .src = .{ .mut_mm, .mem } },
+                                .{ .src = .{ .mem, .mut_mm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_mm, .mm } },
+                            },
+                            .dst_temps = .{.{ .src_mask = .{ .src = 0, .info = .{
+                                .kind = .all,
+                                .inverted = switch (cc) {
+                                    else => unreachable,
+                                    .e => false,
+                                    .ne => true,
+                                },
+                                .scalar = .byte,
+                            } } }},
+                            .each = .{ .once = &.{
+                                .{ .p_b, .cmpeq, .rdst0, .rsrc1, .none, .none },
+                            } },
+                        }, .{
+                            .required_features = .{ .mmx, null },
+                            .constraints = .{ .{ .int = .word }, .{ .int = .word } },
+                            .patterns = &.{
+                                .{ .src = .{ .mut_mm, .mem } },
+                                .{ .src = .{ .mem, .mut_mm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_mm, .mm } },
+                            },
+                            .dst_temps = .{.{ .src_mask = .{ .src = 0, .info = .{
+                                .kind = .all,
+                                .inverted = switch (cc) {
+                                    else => unreachable,
+                                    .e => false,
+                                    .ne => true,
+                                },
+                                .scalar = .word,
+                            } } }},
+                            .each = .{ .once = &.{
+                                .{ .p_w, .cmpeq, .rdst0, .rsrc1, .none, .none },
+                            } },
+                        }, .{
+                            .required_features = .{ .mmx, null },
+                            .constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
+                            .patterns = &.{
+                                .{ .src = .{ .mut_mm, .mem } },
+                                .{ .src = .{ .mem, .mut_mm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_mm, .mm } },
+                            },
+                            .dst_temps = .{.{ .src_mask = .{ .src = 0, .info = .{
+                                .kind = .all,
+                                .inverted = switch (cc) {
+                                    else => unreachable,
+                                    .e => false,
+                                    .ne => true,
+                                },
+                                .scalar = .dword,
+                            } } }},
+                            .each = .{ .once = &.{
+                                .{ .p_d, .cmpeq, .rdst0, .rsrc1, .none, .none },
+                            } },
+                        }, .{
+                            .constraints = .{ .{ .bool_vec = .byte }, .{ .bool_vec = .byte } },
+                            .patterns = &.{
+                                .{ .src = .{ .mut_mem, .imm8 } },
+                                .{ .src = .{ .imm8, .mut_mem }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_gpr, .imm8 } },
+                                .{ .src = .{ .imm8, .mut_gpr }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_mem, .gpr } },
+                                .{ .src = .{ .gpr, .mut_mem }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_gpr, .mem } },
+                                .{ .src = .{ .mem, .mut_gpr }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_gpr, .gpr } },
+                            },
+                            .clobbers = .{ .eflags = true },
+                            .dst_temps = .{.{ .src = 0 }},
+                            .each = .{ .once = switch (cc) {
+                                else => unreachable,
+                                .e => &.{
+                                    .{ ._, .xor, .dst0b, .src1b, .none, .none },
+                                    .{ ._, .not, .dst0b, .none, .none, .none },
+                                },
+                                .ne => &.{
+                                    .{ ._, .xor, .dst0b, .src1b, .none, .none },
+                                },
+                            } },
+                        }, .{
+                            .constraints = .{ .{ .bool_vec = .word }, .{ .bool_vec = .word } },
+                            .patterns = &.{
+                                .{ .src = .{ .mut_mem, .imm16 } },
+                                .{ .src = .{ .imm16, .mut_mem }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_gpr, .imm16 } },
+                                .{ .src = .{ .imm16, .mut_gpr }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_mem, .gpr } },
+                                .{ .src = .{ .gpr, .mut_mem }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_gpr, .mem } },
+                                .{ .src = .{ .mem, .mut_gpr }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_gpr, .gpr } },
+                            },
+                            .clobbers = .{ .eflags = true },
+                            .dst_temps = .{.{ .src = 0 }},
+                            .each = .{ .once = switch (cc) {
+                                else => unreachable,
+                                .e => &.{
+                                    .{ ._, .xor, .dst0w, .src1w, .none, .none },
+                                    .{ ._, .not, .dst0w, .none, .none, .none },
+                                },
+                                .ne => &.{
+                                    .{ ._, .xor, .dst0w, .src1w, .none, .none },
+                                },
+                            } },
+                        }, .{
+                            .constraints = .{ .{ .bool_vec = .dword }, .{ .bool_vec = .dword } },
+                            .patterns = &.{
+                                .{ .src = .{ .mut_mem, .imm32 } },
+                                .{ .src = .{ .imm32, .mut_mem }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_gpr, .imm32 } },
+                                .{ .src = .{ .imm32, .mut_gpr }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_mem, .gpr } },
+                                .{ .src = .{ .gpr, .mut_mem }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_gpr, .mem } },
+                                .{ .src = .{ .mem, .mut_gpr }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_gpr, .gpr } },
+                            },
+                            .clobbers = .{ .eflags = true },
+                            .dst_temps = .{.{ .src = 0 }},
+                            .each = .{ .once = switch (cc) {
+                                else => unreachable,
+                                .e => &.{
+                                    .{ ._, .xor, .edst0, .esrc1, .none, .none },
+                                    .{ ._, .not, .edst0, .none, .none, .none },
+                                },
+                                .ne => &.{
+                                    .{ ._, .xor, .edst0, .esrc1, .none, .none },
+                                },
+                            } },
+                        }, .{
+                            .required_features = .{ .@"64bit", null },
+                            .constraints = .{ .{ .bool_vec = .qword }, .{ .bool_vec = .qword } },
+                            .patterns = &.{
+                                .{ .src = .{ .mut_mem, .simm32 } },
+                                .{ .src = .{ .simm32, .mut_mem }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_gpr, .simm32 } },
+                                .{ .src = .{ .simm32, .mut_gpr }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_mem, .gpr } },
+                                .{ .src = .{ .gpr, .mut_mem }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_gpr, .mem } },
+                                .{ .src = .{ .mem, .mut_gpr }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_gpr, .gpr } },
+                            },
+                            .clobbers = .{ .eflags = true },
+                            .dst_temps = .{.{ .src = 0 }},
+                            .each = .{ .once = switch (cc) {
+                                else => unreachable,
+                                .e => &.{
+                                    .{ ._, .xor, .rdst0, .rsrc1, .none, .none },
+                                    .{ ._, .not, .rdst0, .none, .none, .none },
+                                },
+                                .ne => &.{
+                                    .{ ._, .xor, .rdst0, .rsrc1, .none, .none },
+                                },
+                            } },
+                        } },
+                    }) catch |err2| switch (err2) {
+                        error.Select2Failed => cg.select(&res, &.{cg.typeOfIndex(inst)}, &ops, &.{
+                            .{
+                                .required_features = &.{.avx2},
+                                .scalar = .{ .any_int = .byte },
+                                .loop = .elementwise,
+                                .mir_tag = .{ .vp_b, .cmpeq },
+                                .patterns = &.{
+                                    .{ .ops = &.{ .ymm_mask_limb, .{ .explicit = 0 }, .mem_limb } },
+                                    .{ .ops = &.{ .ymm_mask_limb, .mem_limb, .{ .explicit = 0 } }, .commute = .{ 1, 2 } },
+                                    .{ .ops = &.{ .ymm_mask_limb, .ymm_limb, .mem_limb } },
+                                    .{ .ops = &.{ .ymm_mask_limb, .ymm_limb, .ymm_limb } },
+                                },
+                            },
+                            .{
+                                .required_features = &.{.avx2},
+                                .scalar = .{ .any_int = .word },
+                                .loop = .elementwise,
+                                .mir_tag = .{ .vp_w, .cmpeq },
+                                .patterns = &.{
+                                    .{ .ops = &.{ .ymm_mask_limb, .{ .explicit = 0 }, .mem_limb } },
+                                    .{ .ops = &.{ .ymm_mask_limb, .mem_limb, .{ .explicit = 0 } }, .commute = .{ 1, 2 } },
+                                    .{ .ops = &.{ .ymm_mask_limb, .ymm_limb, .mem_limb } },
+                                    .{ .ops = &.{ .ymm_mask_limb, .ymm_limb, .ymm_limb } },
+                                },
+                            },
+                            .{
+                                .required_features = &.{.avx2},
+                                .scalar = .{ .any_int = .dword },
+                                .loop = .elementwise,
+                                .mir_tag = .{ .vp_d, .cmpeq },
+                                .patterns = &.{
+                                    .{ .ops = &.{ .ymm_mask_limb, .{ .explicit = 0 }, .mem_limb } },
+                                    .{ .ops = &.{ .ymm_mask_limb, .mem_limb, .{ .explicit = 0 } }, .commute = .{ 1, 2 } },
+                                    .{ .ops = &.{ .ymm_mask_limb, .ymm_limb, .mem_limb } },
+                                    .{ .ops = &.{ .ymm_mask_limb, .ymm_limb, .ymm_limb } },
+                                },
+                            },
+                            .{
+                                .required_features = &.{.avx2},
+                                .scalar = .{ .any_int = .qword },
+                                .loop = .elementwise,
+                                .mir_tag = .{ .vp_q, .cmpeq },
+                                .patterns = &.{
+                                    .{ .ops = &.{ .ymm_mask_limb, .{ .explicit = 0 }, .mem_limb } },
+                                    .{ .ops = &.{ .ymm_mask_limb, .mem_limb, .{ .explicit = 0 } }, .commute = .{ 1, 2 } },
+                                    .{ .ops = &.{ .ymm_mask_limb, .ymm_limb, .mem_limb } },
+                                    .{ .ops = &.{ .ymm_mask_limb, .ymm_limb, .ymm_limb } },
+                                },
+                            },
+                            .{
+                                .required_features = &.{.avx},
+                                .scalar = .{ .any_int = .byte },
+                                .loop = .elementwise,
+                                .mir_tag = .{ .vp_b, .cmpeq },
+                                .patterns = &.{
+                                    .{ .ops = &.{ .xmm_mask_limb, .{ .explicit = 0 }, .mem_limb } },
+                                    .{ .ops = &.{ .xmm_mask_limb, .mem_limb, .{ .explicit = 0 } }, .commute = .{ 1, 2 } },
+                                    .{ .ops = &.{ .xmm_mask_limb, .xmm_limb, .mem_limb } },
+                                    .{ .ops = &.{ .xmm_mask_limb, .xmm_limb, .xmm_limb } },
+                                },
+                            },
+                            .{
+                                .required_features = &.{.avx},
+                                .scalar = .{ .any_int = .word },
+                                .loop = .elementwise,
+                                .mir_tag = .{ .vp_w, .cmpeq },
+                                .patterns = &.{
+                                    .{ .ops = &.{ .xmm_mask_limb, .{ .explicit = 0 }, .mem_limb } },
+                                    .{ .ops = &.{ .xmm_mask_limb, .mem_limb, .{ .explicit = 0 } }, .commute = .{ 1, 2 } },
+                                    .{ .ops = &.{ .xmm_mask_limb, .xmm_limb, .mem_limb } },
+                                    .{ .ops = &.{ .xmm_mask_limb, .xmm_limb, .xmm_limb } },
+                                },
+                            },
+                            .{
+                                .required_features = &.{.avx},
+                                .scalar = .{ .any_int = .dword },
+                                .loop = .elementwise,
+                                .mir_tag = .{ .vp_d, .cmpeq },
+                                .patterns = &.{
+                                    .{ .ops = &.{ .xmm_mask_limb, .{ .explicit = 0 }, .mem_limb } },
+                                    .{ .ops = &.{ .xmm_mask_limb, .mem_limb, .{ .explicit = 0 } }, .commute = .{ 1, 2 } },
+                                    .{ .ops = &.{ .xmm_mask_limb, .xmm_limb, .mem_limb } },
+                                    .{ .ops = &.{ .xmm_mask_limb, .xmm_limb, .xmm_limb } },
+                                },
+                            },
+                            .{
+                                .required_features = &.{.avx},
+                                .scalar = .{ .any_int = .qword },
+                                .loop = .elementwise,
+                                .mir_tag = .{ .vp_q, .cmpeq },
+                                .patterns = &.{
+                                    .{ .ops = &.{ .xmm_mask_limb, .{ .explicit = 0 }, .mem_limb } },
+                                    .{ .ops = &.{ .xmm_mask_limb, .mem_limb, .{ .explicit = 0 } }, .commute = .{ 1, 2 } },
+                                    .{ .ops = &.{ .xmm_mask_limb, .xmm_limb, .mem_limb } },
+                                    .{ .ops = &.{ .xmm_mask_limb, .xmm_limb, .xmm_limb } },
+                                },
+                            },
+                            .{
+                                .required_features = &.{.sse2},
+                                .scalar = .{ .any_int = .byte },
+                                .loop = .elementwise,
+                                .mir_tag = .{ .p_b, .cmpeq },
+                                .patterns = &.{
+                                    .{ .ops = &.{ .xmm_mask_limb, .{ .implicit = 0 }, .mem_limb } },
+                                    .{ .ops = &.{ .xmm_mask_limb, .mem_limb, .{ .implicit = 0 } } },
+                                    .{ .ops = &.{ .xmm_mask_limb, .{ .implicit = 0 }, .xmm_limb } },
+                                },
+                            },
+                            .{
+                                .required_features = &.{.sse2},
+                                .scalar = .{ .any_int = .word },
+                                .loop = .elementwise,
+                                .mir_tag = .{ .p_w, .cmpeq },
+                                .patterns = &.{
+                                    .{ .ops = &.{ .xmm_mask_limb, .{ .implicit = 0 }, .mem_limb } },
+                                    .{ .ops = &.{ .xmm_mask_limb, .mem_limb, .{ .implicit = 0 } } },
+                                    .{ .ops = &.{ .xmm_mask_limb, .{ .implicit = 0 }, .xmm_limb } },
+                                },
+                            },
+                            .{
+                                .required_features = &.{.sse2},
+                                .scalar = .{ .any_int = .dword },
+                                .loop = .elementwise,
+                                .mir_tag = .{ .p_d, .cmpeq },
+                                .patterns = &.{
+                                    .{ .ops = &.{ .xmm_mask_limb, .{ .implicit = 0 }, .mem_limb } },
+                                    .{ .ops = &.{ .xmm_mask_limb, .mem_limb, .{ .implicit = 0 } } },
+                                    .{ .ops = &.{ .xmm_mask_limb, .{ .implicit = 0 }, .xmm_limb } },
+                                },
+                            },
+                            .{
+                                .required_features = &.{.sse4_1},
+                                .scalar = .{ .any_int = .qword },
+                                .loop = .elementwise,
+                                .mir_tag = .{ .p_q, .cmpeq },
+                                .patterns = &.{
+                                    .{ .ops = &.{ .xmm_mask_limb, .{ .implicit = 0 }, .mem_limb } },
+                                    .{ .ops = &.{ .xmm_mask_limb, .mem_limb, .{ .implicit = 0 } } },
+                                    .{ .ops = &.{ .xmm_mask_limb, .{ .implicit = 0 }, .xmm_limb } },
+                                },
+                            },
+                            .{
+                                .required_features = &.{.mmx},
+                                .scalar = .{ .any_int = .byte },
+                                .loop = .elementwise,
+                                .mir_tag = .{ .p_b, .cmpeq },
+                                .patterns = &.{
+                                    .{ .ops = &.{ .mm_mask_limb, .{ .implicit = 0 }, .mem_limb } },
+                                    .{ .ops = &.{ .mm_mask_limb, .mem_limb, .{ .implicit = 0 } } },
+                                    .{ .ops = &.{ .mm_mask_limb, .{ .implicit = 0 }, .mm_limb } },
+                                },
+                            },
+                            .{
+                                .required_features = &.{.mmx},
+                                .scalar = .{ .any_int = .word },
+                                .loop = .elementwise,
+                                .mir_tag = .{ .p_w, .cmpeq },
+                                .patterns = &.{
+                                    .{ .ops = &.{ .mm_mask_limb, .{ .implicit = 0 }, .mem_limb } },
+                                    .{ .ops = &.{ .mm_mask_limb, .mem_limb, .{ .implicit = 0 } } },
+                                    .{ .ops = &.{ .mm_mask_limb, .{ .implicit = 0 }, .mm_limb } },
+                                },
+                            },
+                            .{
+                                .required_features = &.{.mmx},
+                                .scalar = .{ .any_int = .dword },
+                                .loop = .elementwise,
+                                .mir_tag = .{ .p_d, .cmpeq },
+                                .patterns = &.{
+                                    .{ .ops = &.{ .mm_mask_limb, .{ .implicit = 0 }, .mem_limb } },
+                                    .{ .ops = &.{ .mm_mask_limb, .mem_limb, .{ .implicit = 0 } } },
+                                    .{ .ops = &.{ .mm_mask_limb, .{ .implicit = 0 }, .mm_limb } },
+                                },
+                            },
+                            .{
+                                .scalar = .bool,
+                                .clobbers = .{ .eflags = true },
+                                .invert_result = true,
+                                .loop = .elementwise,
+                                .mir_tag = .{ ._, .xor },
+                                .patterns = &.{
+                                    .{ .ops = &.{ .mem_limb, .{ .implicit = 0 }, .gpr_limb } },
+                                    .{ .ops = &.{ .mem_limb, .gpr_limb, .{ .implicit = 0 } } },
+                                    .{ .ops = &.{ .gpr_limb, .{ .implicit = 0 }, .mem_limb } },
+                                    .{ .ops = &.{ .gpr_limb, .mem_limb, .{ .implicit = 0 } } },
+                                    .{ .ops = &.{ .gpr_limb, .{ .implicit = 0 }, .gpr_limb } },
+                                },
+                            },
+                            .{
+                                .scalar = .{ .any_int = .byte },
+                                .clobbers = .{ .eflags = true },
+                                .loop = .elementwise,
+                                .mir_tag = .{ ._, .cmp },
+                                .patterns = &.{
+                                    .{ .ops = &.{ .cc_elem, .mem_elem, .gpr_elem } },
+                                    .{ .ops = &.{ .cc_elem, .gpr_elem, .mem_elem } },
+                                    .{ .ops = &.{ .cc_elem, .gpr_elem, .gpr_elem } },
+                                },
+                            },
+                            .{
+                                .scalar = .{ .any_int = .word },
+                                .clobbers = .{ .eflags = true },
+                                .loop = .elementwise,
+                                .mir_tag = .{ ._, .cmp },
+                                .patterns = &.{
+                                    .{ .ops = &.{ .cc_elem, .mem_elem, .gpr_elem } },
+                                    .{ .ops = &.{ .cc_elem, .gpr_elem, .mem_elem } },
+                                    .{ .ops = &.{ .cc_elem, .gpr_elem, .gpr_elem } },
+                                },
+                            },
+                            .{
+                                .scalar = .{ .any_int = .dword },
+                                .clobbers = .{ .eflags = true },
+                                .loop = .elementwise,
+                                .mir_tag = .{ ._, .cmp },
+                                .patterns = &.{
+                                    .{ .ops = &.{ .cc_elem, .mem_elem, .gpr_elem } },
+                                    .{ .ops = &.{ .cc_elem, .gpr_elem, .mem_elem } },
+                                    .{ .ops = &.{ .cc_elem, .gpr_elem, .gpr_elem } },
+                                },
+                            },
+                            .{
+                                .scalar = .{ .any_int = .qword },
+                                .clobbers = .{ .eflags = true },
+                                .loop = .elementwise,
+                                .mir_tag = .{ ._, .cmp },
+                                .patterns = &.{
+                                    .{ .ops = &.{ .cc_elem, .mem_elem, .gpr_elem } },
+                                    .{ .ops = &.{ .cc_elem, .gpr_elem, .mem_elem } },
+                                    .{ .ops = &.{ .cc_elem, .gpr_elem, .gpr_elem } },
+                                },
+                            },
+                        }, .{
+                            .cc = .e,
+                            .invert_result = switch (cmp_op) {
+                                .eq => false,
+                                .neq => true,
+                                else => unreachable,
+                            },
+                        }) catch |err| switch (err) {
+                            error.SelectFailed => return cg.fail("failed to select", .{}),
+                            else => |e| return e,
+                        },
+                        else => |e| return e,
+                    },
+                    .gte => unreachable,
+                    .gt => unreachable,
+                }
+                if (ops[0].index != res[0].index) try ops[0].die(cg);
+                if (ops[1].index != res[0].index) try ops[1].die(cg);
+                try res[0].moveTo(inst, cg);
+            },
+
+            .cmp_lt,
+            .cmp_lt_optimized,
+            .cmp_lte,
+            .cmp_lte_optimized,
+            .cmp_gte,
+            .cmp_gte_optimized,
+            .cmp_gt,
+            .cmp_gt_optimized,
+            => |air_tag| if (use_old) try cg.airCmp(inst, switch (air_tag) {
+                else => unreachable,
+                .cmp_lt, .cmp_lt_optimized => .lt,
+                .cmp_lte, .cmp_lte_optimized => .lte,
+                .cmp_gte, .cmp_gte_optimized => .gte,
+                .cmp_gt, .cmp_gt_optimized => .gt,
+            }) else {
+                const bin_op = air_datas[@intFromEnum(inst)].bin_op;
+                const scalar_ty = cg.typeOf(bin_op.lhs).scalarType(zcu);
+                const signedness = if (scalar_ty.isAbiInt(zcu))
+                    scalar_ty.intInfo(zcu).signedness
+                else
+                    .unsigned;
+                var ops = try cg.tempsFromOperands(inst, .{ bin_op.lhs, bin_op.rhs });
+                var res: [1]Temp = undefined;
+                cg.select2(&res, &.{cg.typeOfIndex(inst)}, &ops, switch (@as(Condition, switch (signedness) {
+                    .signed => switch (air_tag) {
+                        else => unreachable,
+                        .cmp_lt, .cmp_lt_optimized => .l,
+                        .cmp_lte, .cmp_lte_optimized => .le,
+                        .cmp_gte, .cmp_gte_optimized => .ge,
+                        .cmp_gt, .cmp_gt_optimized => .g,
+                    },
+                    .unsigned => switch (air_tag) {
+                        else => unreachable,
+                        .cmp_lt, .cmp_lt_optimized => .b,
+                        .cmp_lte, .cmp_lte_optimized => .be,
+                        .cmp_gte, .cmp_gte_optimized => .ae,
+                        .cmp_gt, .cmp_gt_optimized => .a,
+                    },
+                })) {
+                    else => unreachable,
+                    inline .l, .le, .ge, .g, .b, .be, .ae, .a => |cc| comptime &.{ .{
+                        .constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
+                        .patterns = &.{
+                            .{ .src = .{ .imm8, .mem }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .imm8, .gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mem, .gpr }, .commute = .{ 0, 1 } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .dst_temps = .{.{ .cc = cc.commute() }},
+                        .each = .{ .once = &.{
+                            .{ ._, .cmp, .src0b, .src1b, .none, .none },
+                        } },
+                    }, .{
+                        .constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
+                        .patterns = &.{
+                            .{ .src = .{ .mem, .imm8 } },
+                            .{ .src = .{ .gpr, .imm8 } },
+                            .{ .src = .{ .gpr, .mem } },
+                            .{ .src = .{ .gpr, .gpr } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .once = &.{
+                            .{ ._, .cmp, .src0b, .src1b, .none, .none },
+                        } },
+                    }, .{
+                        .constraints = .{ .{ .int = .word }, .{ .int = .word } },
+                        .patterns = &.{
+                            .{ .src = .{ .imm16, .mem }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .imm16, .gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mem, .gpr }, .commute = .{ 0, 1 } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .dst_temps = .{.{ .cc = cc.commute() }},
+                        .each = .{ .once = &.{
+                            .{ ._, .cmp, .src0w, .src1w, .none, .none },
+                        } },
+                    }, .{
+                        .constraints = .{ .{ .int = .word }, .{ .int = .word } },
+                        .patterns = &.{
+                            .{ .src = .{ .mem, .imm16 } },
+                            .{ .src = .{ .gpr, .imm16 } },
+                            .{ .src = .{ .gpr, .mem } },
+                            .{ .src = .{ .gpr, .gpr } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .once = &.{
+                            .{ ._, .cmp, .src0w, .src1w, .none, .none },
+                        } },
+                    }, .{
+                        .constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
+                        .patterns = &.{
+                            .{ .src = .{ .imm32, .mem }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .imm32, .gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mem, .gpr }, .commute = .{ 0, 1 } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .dst_temps = .{.{ .cc = cc.commute() }},
+                        .each = .{ .once = &.{
+                            .{ ._, .cmp, .esrc0, .esrc1, .none, .none },
+                        } },
+                    }, .{
+                        .constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
+                        .patterns = &.{
+                            .{ .src = .{ .mem, .imm32 } },
+                            .{ .src = .{ .gpr, .imm32 } },
+                            .{ .src = .{ .gpr, .mem } },
+                            .{ .src = .{ .gpr, .gpr } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .once = &.{
+                            .{ ._, .cmp, .esrc0, .esrc1, .none, .none },
+                        } },
+                    }, .{
+                        .required_features = .{ .@"64bit", null },
+                        .constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
+                        .patterns = &.{
+                            .{ .src = .{ .simm32, .mem }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .simm32, .gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mem, .gpr }, .commute = .{ 0, 1 } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .dst_temps = .{.{ .cc = cc.commute() }},
+                        .each = .{ .once = &.{
+                            .{ ._, .cmp, .rsrc0, .rsrc1, .none, .none },
+                        } },
+                    }, .{
+                        .required_features = .{ .@"64bit", null },
+                        .constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
+                        .patterns = &.{
+                            .{ .src = .{ .mem, .simm32 } },
+                            .{ .src = .{ .gpr, .simm32 } },
+                            .{ .src = .{ .gpr, .mem } },
+                            .{ .src = .{ .gpr, .gpr } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .once = &.{
+                            .{ ._, .cmp, .rsrc0, .rsrc1, .none, .none },
+                        } },
+                    }, .{
+                        .required_features = .{ .@"64bit", null },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .bool, .kind = .{ .rc = .general_purpose } },
+                            .unused,
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .dst_temps = .{.{ .rc = .general_purpose }},
+                        .each = .{ .limb = .{
+                            .of = .rsrc0,
+                            .header = &.{
+                                .{ ._, .xor, .tmp1b, .tmp1b, .none, .none },
+                            },
+                            .body = &.{
+                                .{ ._, .mov, .rtmp0, .{ .src_limb = 0 }, .none, .none },
+                                .{ ._r, .sh, .tmp1b, .{ .simm32 = 1 }, .none, .none },
+                                .{ ._, .sbb, .rtmp0, .{ .src_limb = 1 }, .none, .none },
+                                .{ ._c, .set, .tmp1b, .none, .none, .none },
+                                .{ .fromCondition(cc), .set, .dst0b, .none, .none, .none },
+                            },
+                        } },
+                    }, .{
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .bool, .kind = .{ .rc = .general_purpose } },
+                            .unused,
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .dst_temps = .{.{ .rc = .general_purpose }},
+                        .each = .{ .limb = .{
+                            .of = .esrc0,
+                            .header = &.{
+                                .{ ._, .xor, .tmp1b, .tmp1b, .none, .none },
+                            },
+                            .body = &.{
+                                .{ ._, .mov, .etmp0, .{ .src_limb = 0 }, .none, .none },
+                                .{ ._r, .sh, .tmp1b, .{ .simm32 = 1 }, .none, .none },
+                                .{ ._, .sbb, .etmp0, .{ .src_limb = 1 }, .none, .none },
+                                .{ ._c, .set, .tmp1b, .none, .none, .none },
+                                .{ .fromCondition(cc), .set, .dst0b, .none, .none, .none },
+                            },
+                        } },
+                    } },
+                }) catch |err| switch (err) {
+                    error.Select2Failed => return cg.fail("failed to select2 {s} {} {} {}", .{
+                        @tagName(air_tag),
+                        cg.typeOf(bin_op.lhs).fmt(pt),
+                        ops[0].tracking(cg),
+                        ops[1].tracking(cg),
+                    }),
+                    else => |e| return e,
+                };
+                if (ops[0].index != res[0].index) try ops[0].die(cg);
+                if (ops[1].index != res[0].index) try ops[1].die(cg);
+                try res[0].moveTo(inst, cg);
+            },
+            .cmp_eq, .cmp_eq_optimized, .cmp_neq, .cmp_neq_optimized => |air_tag| if (use_old) try cg.airCmp(inst, switch (air_tag) {
+                else => unreachable,
+                .cmp_eq, .cmp_eq_optimized => .eq,
+                .cmp_neq, .cmp_neq_optimized => .neq,
+            }) else fallback: {
+                const bin_op = air_datas[@intFromEnum(inst)].bin_op;
+                if (ip.isOptionalType(cg.typeOf(bin_op.lhs).toIntern())) break :fallback try cg.airCmp(inst, switch (air_tag) {
+                    else => unreachable,
+                    .cmp_eq, .cmp_eq_optimized => .eq,
+                    .cmp_neq, .cmp_neq_optimized => .neq,
+                });
+                var ops = try cg.tempsFromOperands(inst, .{ bin_op.lhs, bin_op.rhs });
+                var res: [1]Temp = undefined;
+                cg.select2(&res, &.{cg.typeOfIndex(inst)}, &ops, switch (@as(Condition, switch (air_tag) {
+                    else => unreachable,
+                    .cmp_eq, .cmp_eq_optimized => .e,
+                    .cmp_neq, .cmp_neq_optimized => .ne,
+                })) {
+                    else => unreachable,
+                    inline .e, .ne => |cc| comptime &.{ .{
+                        .required_features = .{ .avx2, null },
+                        .constraints = .{ .any_int, .any_int },
+                        .patterns = &.{
+                            .{ .src = .{ .ymm, .mem } },
+                            .{ .src = .{ .mem, .ymm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .ymm, .ymm } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .extra_temps = .{ .{ .kind = .{ .rc = .sse } }, .unused, .unused },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .once = &.{
+                            .{ .vp_, .xor, .ytmp0, .ysrc0, .ysrc1, .none },
+                            .{ .vp_, .@"test", .ytmp0, .ytmp0, .none, .none },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null },
+                        .constraints = .{ .any_int, .any_int },
+                        .patterns = &.{
+                            .{ .src = .{ .ymm, .mem } },
+                            .{ .src = .{ .mem, .ymm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .ymm, .ymm } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .extra_temps = .{ .{ .kind = .{ .rc = .sse } }, .unused, .unused },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .once = &.{
+                            .{ .v_pd, .xor, .ytmp0, .ysrc0, .ysrc1, .none },
+                            .{ .vp_, .@"test", .ytmp0, .ytmp0, .none, .none },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null },
+                        .constraints = .{ .any_int, .any_int },
+                        .patterns = &.{
+                            .{ .src = .{ .xmm, .mem } },
+                            .{ .src = .{ .mem, .xmm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .xmm, .xmm } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .extra_temps = .{ .{ .kind = .{ .rc = .sse } }, .unused, .unused },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .once = &.{
+                            .{ .vp_, .xor, .xtmp0, .xsrc0, .xsrc1, .none },
+                            .{ .vp_, .@"test", .xtmp0, .xtmp0, .none, .none },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse4_1, null },
+                        .constraints = .{ .any_int, .any_int },
+                        .patterns = &.{
+                            .{ .src = .{ .mut_xmm, .mem } },
+                            .{ .src = .{ .mem, .mut_xmm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_xmm, .xmm } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .once = &.{
+                            .{ .p_, .xor, .xsrc0, .xsrc1, .none, .none },
+                            .{ .p_, .@"test", .xsrc0, .xsrc0, .none, .none },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse2, null },
+                        .constraints = .{ .any_int, .any_int },
+                        .patterns = &.{
+                            .{ .src = .{ .mut_xmm, .mem } },
+                            .{ .src = .{ .mem, .mut_xmm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_xmm, .xmm } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .kind = .{ .rc = .sse } },
+                            .unused,
+                        },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .once = &.{
+                            .{ .p_, .xor, .xtmp1, .xtmp1, .none, .none },
+                            .{ .p_, .xor, .xsrc0, .xsrc1, .none, .none },
+                            .{ .p_b, .cmpeq, .xtmp1, .xsrc0, .none, .none },
+                            .{ .p_b, .movmsk, .etmp0, .xtmp1, .none, .none },
+                            .{ ._, .xor, .etmp0, .{ .simm32 = std.math.maxInt(u16) }, .none, .none },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse2, .mmx },
+                        .constraints = .{ .any_int, .any_int },
+                        .patterns = &.{
+                            .{ .src = .{ .mut_mm, .mem } },
+                            .{ .src = .{ .mem, .mut_mm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_mm, .mm } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .kind = .{ .rc = .mmx } },
+                            .unused,
+                        },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .once = &.{
+                            .{ .p_, .xor, .rtmp1, .rtmp1, .none, .none },
+                            .{ .p_, .xor, .rsrc0, .rsrc1, .none, .none },
+                            .{ .p_b, .cmpeq, .rtmp1, .rsrc0, .none, .none },
+                            .{ .p_b, .movmsk, .etmp0, .rtmp1, .none, .none },
+                            .{ ._, .xor, .etmp0, .{ .simm32 = std.math.maxInt(u8) }, .none, .none },
+                        } },
+                    }, .{
+                        .constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
+                        .patterns = &.{
+                            .{ .src = .{ .mem, .imm8 } },
+                            .{ .src = .{ .imm8, .mem }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .gpr, .imm8 } },
+                            .{ .src = .{ .imm8, .gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .gpr, .mem } },
+                            .{ .src = .{ .mem, .gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .gpr, .gpr } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .once = &.{
+                            .{ ._, .cmp, .src0b, .src1b, .none, .none },
+                        } },
+                    }, .{
+                        .constraints = .{ .{ .int = .word }, .{ .int = .word } },
+                        .patterns = &.{
+                            .{ .src = .{ .mem, .imm16 } },
+                            .{ .src = .{ .imm16, .mem }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .gpr, .imm16 } },
+                            .{ .src = .{ .imm16, .gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .gpr, .mem } },
+                            .{ .src = .{ .mem, .gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .gpr, .gpr } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .once = &.{
+                            .{ ._, .cmp, .src0w, .src1w, .none, .none },
+                        } },
+                    }, .{
+                        .constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
+                        .patterns = &.{
+                            .{ .src = .{ .mem, .imm32 } },
+                            .{ .src = .{ .imm32, .mem }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .gpr, .imm32 } },
+                            .{ .src = .{ .imm32, .gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .gpr, .mem } },
+                            .{ .src = .{ .mem, .gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .gpr, .gpr } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .once = &.{
+                            .{ ._, .cmp, .esrc0, .esrc1, .none, .none },
+                        } },
+                    }, .{
+                        .required_features = .{ .@"64bit", null },
+                        .constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
+                        .patterns = &.{
+                            .{ .src = .{ .mem, .simm32 } },
+                            .{ .src = .{ .simm32, .mem }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .gpr, .simm32 } },
+                            .{ .src = .{ .simm32, .gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .gpr, .mem } },
+                            .{ .src = .{ .mem, .gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .gpr, .gpr } },
+                        },
+                        .clobbers = .{ .eflags = true },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .once = &.{
+                            .{ ._, .cmp, .rsrc0, .rsrc1, .none, .none },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx2, null },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .kind = .{ .rc = .sse } },
+                            .{ .kind = .{ .rc = .sse } },
+                        },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .limb = .{
+                            .of = .ysrc0,
+                            .header = &.{
+                                .{ .vp_, .xor, .ytmp2, .ytmp2, .ytmp2, .none },
+                            },
+                            .body = &.{
+                                .{ .v_, .movdqu, .ytmp1, .{ .src_limb = 0 }, .none, .none },
+                                .{ .vp_, .xor, .ytmp1, .ytmp1, .{ .src_limb = 1 }, .none },
+                                .{ .vp_, .@"or", .ytmp2, .ytmp2, .ytmp1, .none },
                             },
-                        },
-                        .{
-                            .required_features = &.{.avx},
-                            .scalar = .{ .any_int = .qword },
-                            .mir_tag = .{ .vp_q, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .xmm_mask, .xmm, .mem } },
-                                .{ .ops = &.{ .xmm_mask, .mem, .xmm }, .commute = .{ 1, 2 } },
-                                .{ .ops = &.{ .xmm_mask, .xmm, .xmm } },
+                            .trailer = &.{
+                                .{ .vp_, .@"test", .ytmp2, .ytmp2, .none, .none },
                             },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
                         },
-                        .{
-                            .required_features = &.{.sse2},
-                            .scalar = .{ .any_int = .byte },
-                            .mir_tag = .{ .p_b, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .xmm_mask, .{ .implicit = 0 }, .mem } },
-                                .{ .ops = &.{ .xmm_mask, .mem, .{ .implicit = 0 } } },
-                                .{ .ops = &.{ .xmm_mask, .{ .implicit = 0 }, .xmm } },
-                                .{ .ops = &.{ .xmm_mask, .xmm, .{ .implicit = 0 } } },
-                            },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .kind = .{ .rc = .sse } },
+                            .{ .kind = .{ .rc = .sse } },
                         },
-                        .{
-                            .required_features = &.{.sse2},
-                            .scalar = .{ .any_int = .word },
-                            .mir_tag = .{ .p_w, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .xmm_mask, .{ .implicit = 0 }, .mem } },
-                                .{ .ops = &.{ .xmm_mask, .mem, .{ .implicit = 0 } } },
-                                .{ .ops = &.{ .xmm_mask, .{ .implicit = 0 }, .xmm } },
-                                .{ .ops = &.{ .xmm_mask, .xmm, .{ .implicit = 0 } } },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .limb = .{
+                            .of = .ysrc0,
+                            .header = &.{
+                                .{ .v_pd, .xor, .ytmp2, .ytmp2, .ytmp2, .none },
                             },
-                        },
-                        .{
-                            .required_features = &.{.sse2},
-                            .scalar = .{ .any_int = .dword },
-                            .mir_tag = .{ .p_d, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .xmm_mask, .{ .implicit = 0 }, .mem } },
-                                .{ .ops = &.{ .xmm_mask, .mem, .{ .implicit = 0 } } },
-                                .{ .ops = &.{ .xmm_mask, .{ .implicit = 0 }, .xmm } },
-                                .{ .ops = &.{ .xmm_mask, .xmm, .{ .implicit = 0 } } },
+                            .body = &.{
+                                .{ .v_pd, .movu, .ytmp1, .{ .src_limb = 0 }, .none, .none },
+                                .{ .v_pd, .xor, .ytmp1, .ytmp1, .{ .src_limb = 1 }, .none },
+                                .{ .v_pd, .@"or", .ytmp2, .ytmp2, .ytmp1, .none },
                             },
-                        },
-                        .{
-                            .required_features = &.{.sse4_1},
-                            .scalar = .{ .any_int = .qword },
-                            .mir_tag = .{ .p_q, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .xmm_mask, .{ .implicit = 0 }, .mem } },
-                                .{ .ops = &.{ .xmm_mask, .mem, .{ .implicit = 0 } } },
-                                .{ .ops = &.{ .xmm_mask, .{ .implicit = 0 }, .xmm } },
-                                .{ .ops = &.{ .xmm_mask, .xmm, .{ .implicit = 0 } } },
+                            .trailer = &.{
+                                .{ .vp_, .@"test", .ytmp2, .ytmp2, .none, .none },
                             },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
                         },
-                        .{
-                            .required_features = &.{.mmx},
-                            .scalar = .{ .any_int = .byte },
-                            .mir_tag = .{ .p_b, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .mm_mask, .{ .implicit = 0 }, .mem } },
-                                .{ .ops = &.{ .mm_mask, .mem, .{ .implicit = 0 } } },
-                                .{ .ops = &.{ .mm_mask, .{ .implicit = 0 }, .mm } },
-                                .{ .ops = &.{ .mm_mask, .mm, .{ .implicit = 0 } } },
-                            },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .kind = .{ .rc = .sse } },
+                            .{ .kind = .{ .rc = .sse } },
                         },
-                        .{
-                            .required_features = &.{.mmx},
-                            .scalar = .{ .any_int = .word },
-                            .mir_tag = .{ .p_w, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .mm_mask, .{ .implicit = 0 }, .mem } },
-                                .{ .ops = &.{ .mm_mask, .mem, .{ .implicit = 0 } } },
-                                .{ .ops = &.{ .mm_mask, .{ .implicit = 0 }, .mm } },
-                                .{ .ops = &.{ .mm_mask, .mm, .{ .implicit = 0 } } },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .limb = .{
+                            .of = .xsrc0,
+                            .header = &.{
+                                .{ .vp_, .xor, .xtmp2, .xtmp2, .xtmp2, .none },
                             },
-                        },
-                        .{
-                            .required_features = &.{.mmx},
-                            .scalar = .{ .any_int = .dword },
-                            .mir_tag = .{ .p_d, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .mm_mask, .{ .implicit = 0 }, .mem } },
-                                .{ .ops = &.{ .mm_mask, .mem, .{ .implicit = 0 } } },
-                                .{ .ops = &.{ .mm_mask, .{ .implicit = 0 }, .mm } },
-                                .{ .ops = &.{ .mm_mask, .mm, .{ .implicit = 0 } } },
+                            .body = &.{
+                                .{ .v_, .movdqu, .xtmp1, .{ .src_limb = 0 }, .none, .none },
+                                .{ .vp_, .xor, .xtmp1, .xtmp1, .{ .src_limb = 1 }, .none },
+                                .{ .vp_, .@"or", .xtmp2, .xtmp2, .xtmp1, .none },
                             },
-                        },
-                        .{
-                            .scalar = .bool,
-                            .clobbers = .{ .eflags = true },
-                            .invert_result = true,
-                            .mir_tag = .{ ._, .xor },
-                            .patterns = &.{
-                                .{ .ops = &.{ .mem, .{ .implicit = 0 }, .simm32 } },
-                                .{ .ops = &.{ .mem, .simm32, .{ .implicit = 0 } } },
-                                .{ .ops = &.{ .mem, .{ .implicit = 0 }, .gpr } },
-                                .{ .ops = &.{ .mem, .gpr, .{ .implicit = 0 } } },
-                                .{ .ops = &.{ .gpr, .{ .implicit = 0 }, .simm32 } },
-                                .{ .ops = &.{ .gpr, .simm32, .{ .implicit = 0 } } },
-                                .{ .ops = &.{ .gpr, .{ .implicit = 0 }, .mem } },
-                                .{ .ops = &.{ .gpr, .mem, .{ .implicit = 0 } } },
-                                .{ .ops = &.{ .gpr, .{ .implicit = 0 }, .gpr } },
-                                .{ .ops = &.{ .gpr, .gpr, .{ .implicit = 0 } } },
+                            .trailer = &.{
+                                .{ .vp_, .@"test", .xtmp2, .xtmp2, .none, .none },
                             },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
                         },
-
-                        .{
-                            .required_features = &.{.avx2},
-                            .scalar = .{ .any_int = .byte },
-                            .loop = .elementwise,
-                            .mir_tag = .{ .vp_b, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .ymm_mask_limb, .{ .explicit = 0 }, .mem_limb } },
-                                .{ .ops = &.{ .ymm_mask_limb, .mem_limb, .{ .explicit = 0 } }, .commute = .{ 1, 2 } },
-                                .{ .ops = &.{ .ymm_mask_limb, .ymm_limb, .mem_limb } },
-                                .{ .ops = &.{ .ymm_mask_limb, .ymm_limb, .ymm_limb } },
-                            },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .kind = .{ .rc = .sse } },
+                            .{ .kind = .{ .rc = .sse } },
                         },
-                        .{
-                            .required_features = &.{.avx2},
-                            .scalar = .{ .any_int = .word },
-                            .loop = .elementwise,
-                            .mir_tag = .{ .vp_w, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .ymm_mask_limb, .{ .explicit = 0 }, .mem_limb } },
-                                .{ .ops = &.{ .ymm_mask_limb, .mem_limb, .{ .explicit = 0 } }, .commute = .{ 1, 2 } },
-                                .{ .ops = &.{ .ymm_mask_limb, .ymm_limb, .mem_limb } },
-                                .{ .ops = &.{ .ymm_mask_limb, .ymm_limb, .ymm_limb } },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .limb = .{
+                            .of = .xsrc0,
+                            .header = &.{
+                                .{ .vp_, .xor, .xtmp2, .xtmp2, .xtmp2, .none },
                             },
-                        },
-                        .{
-                            .required_features = &.{.avx2},
-                            .scalar = .{ .any_int = .dword },
-                            .loop = .elementwise,
-                            .mir_tag = .{ .vp_d, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .ymm_mask_limb, .{ .explicit = 0 }, .mem_limb } },
-                                .{ .ops = &.{ .ymm_mask_limb, .mem_limb, .{ .explicit = 0 } }, .commute = .{ 1, 2 } },
-                                .{ .ops = &.{ .ymm_mask_limb, .ymm_limb, .mem_limb } },
-                                .{ .ops = &.{ .ymm_mask_limb, .ymm_limb, .ymm_limb } },
+                            .body = &.{
+                                .{ .v_, .movdqu, .xtmp1, .{ .src_limb = 0 }, .none, .none },
+                                .{ .vp_, .xor, .xtmp1, .xtmp1, .{ .src_limb = 1 }, .none },
+                                .{ .vp_, .@"or", .xtmp2, .xtmp2, .xtmp1, .none },
                             },
-                        },
-                        .{
-                            .required_features = &.{.avx2},
-                            .scalar = .{ .any_int = .qword },
-                            .loop = .elementwise,
-                            .mir_tag = .{ .vp_q, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .ymm_mask_limb, .{ .explicit = 0 }, .mem_limb } },
-                                .{ .ops = &.{ .ymm_mask_limb, .mem_limb, .{ .explicit = 0 } }, .commute = .{ 1, 2 } },
-                                .{ .ops = &.{ .ymm_mask_limb, .ymm_limb, .mem_limb } },
-                                .{ .ops = &.{ .ymm_mask_limb, .ymm_limb, .ymm_limb } },
+                            .trailer = &.{
+                                .{ .vp_, .@"test", .xtmp2, .xtmp2, .none, .none },
                             },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse4_1, null },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
                         },
-                        .{
-                            .required_features = &.{.avx},
-                            .scalar = .{ .any_int = .byte },
-                            .loop = .elementwise,
-                            .mir_tag = .{ .vp_b, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .xmm_mask_limb, .{ .explicit = 0 }, .mem_limb } },
-                                .{ .ops = &.{ .xmm_mask_limb, .mem_limb, .{ .explicit = 0 } }, .commute = .{ 1, 2 } },
-                                .{ .ops = &.{ .xmm_mask_limb, .xmm_limb, .mem_limb } },
-                                .{ .ops = &.{ .xmm_mask_limb, .xmm_limb, .xmm_limb } },
-                            },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .kind = .{ .rc = .sse } },
+                            .{ .kind = .{ .rc = .sse } },
                         },
-                        .{
-                            .required_features = &.{.avx},
-                            .scalar = .{ .any_int = .word },
-                            .loop = .elementwise,
-                            .mir_tag = .{ .vp_w, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .xmm_mask_limb, .{ .explicit = 0 }, .mem_limb } },
-                                .{ .ops = &.{ .xmm_mask_limb, .mem_limb, .{ .explicit = 0 } }, .commute = .{ 1, 2 } },
-                                .{ .ops = &.{ .xmm_mask_limb, .xmm_limb, .mem_limb } },
-                                .{ .ops = &.{ .xmm_mask_limb, .xmm_limb, .xmm_limb } },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .limb = .{
+                            .of = .xsrc0,
+                            .header = &.{
+                                .{ .p_, .xor, .xtmp2, .xtmp2, .none, .none },
                             },
-                        },
-                        .{
-                            .required_features = &.{.avx},
-                            .scalar = .{ .any_int = .dword },
-                            .loop = .elementwise,
-                            .mir_tag = .{ .vp_d, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .xmm_mask_limb, .{ .explicit = 0 }, .mem_limb } },
-                                .{ .ops = &.{ .xmm_mask_limb, .mem_limb, .{ .explicit = 0 } }, .commute = .{ 1, 2 } },
-                                .{ .ops = &.{ .xmm_mask_limb, .xmm_limb, .mem_limb } },
-                                .{ .ops = &.{ .xmm_mask_limb, .xmm_limb, .xmm_limb } },
+                            .body = &.{
+                                .{ ._, .movdqu, .xtmp1, .{ .src_limb = 0 }, .none, .none },
+                                .{ .p_, .xor, .xtmp1, .{ .src_limb = 1 }, .none, .none },
+                                .{ .p_, .@"or", .xtmp2, .xtmp1, .none, .none },
                             },
-                        },
-                        .{
-                            .required_features = &.{.avx},
-                            .scalar = .{ .any_int = .qword },
-                            .loop = .elementwise,
-                            .mir_tag = .{ .vp_q, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .xmm_mask_limb, .{ .explicit = 0 }, .mem_limb } },
-                                .{ .ops = &.{ .xmm_mask_limb, .mem_limb, .{ .explicit = 0 } }, .commute = .{ 1, 2 } },
-                                .{ .ops = &.{ .xmm_mask_limb, .xmm_limb, .mem_limb } },
-                                .{ .ops = &.{ .xmm_mask_limb, .xmm_limb, .xmm_limb } },
+                            .trailer = &.{
+                                .{ .p_, .@"test", .xtmp2, .xtmp2, .none, .none },
                             },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse2, null },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
                         },
-                        .{
-                            .required_features = &.{.sse2},
-                            .scalar = .{ .any_int = .byte },
-                            .loop = .elementwise,
-                            .mir_tag = .{ .p_b, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .xmm_mask_limb, .{ .implicit = 0 }, .mem_limb } },
-                                .{ .ops = &.{ .xmm_mask_limb, .mem_limb, .{ .implicit = 0 } } },
-                                .{ .ops = &.{ .xmm_mask_limb, .{ .implicit = 0 }, .xmm_limb } },
-                            },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .kind = .{ .rc = .sse } },
+                            .{ .kind = .{ .rc = .sse } },
                         },
-                        .{
-                            .required_features = &.{.sse2},
-                            .scalar = .{ .any_int = .word },
-                            .loop = .elementwise,
-                            .mir_tag = .{ .p_w, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .xmm_mask_limb, .{ .implicit = 0 }, .mem_limb } },
-                                .{ .ops = &.{ .xmm_mask_limb, .mem_limb, .{ .implicit = 0 } } },
-                                .{ .ops = &.{ .xmm_mask_limb, .{ .implicit = 0 }, .xmm_limb } },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .limb = .{
+                            .of = .xsrc0,
+                            .header = &.{
+                                .{ .p_, .xor, .xtmp2, .xtmp2, .none, .none },
                             },
-                        },
-                        .{
-                            .required_features = &.{.sse2},
-                            .scalar = .{ .any_int = .dword },
-                            .loop = .elementwise,
-                            .mir_tag = .{ .p_d, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .xmm_mask_limb, .{ .implicit = 0 }, .mem_limb } },
-                                .{ .ops = &.{ .xmm_mask_limb, .mem_limb, .{ .implicit = 0 } } },
-                                .{ .ops = &.{ .xmm_mask_limb, .{ .implicit = 0 }, .xmm_limb } },
+                            .body = &.{
+                                .{ ._, .movdqu, .xtmp1, .{ .src_limb = 0 }, .none, .none },
+                                .{ .p_, .xor, .xtmp1, .{ .src_limb = 1 }, .none, .none },
+                                .{ .p_, .@"or", .xtmp2, .xtmp1, .none, .none },
                             },
-                        },
-                        .{
-                            .required_features = &.{.sse4_1},
-                            .scalar = .{ .any_int = .qword },
-                            .loop = .elementwise,
-                            .mir_tag = .{ .p_q, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .xmm_mask_limb, .{ .implicit = 0 }, .mem_limb } },
-                                .{ .ops = &.{ .xmm_mask_limb, .mem_limb, .{ .implicit = 0 } } },
-                                .{ .ops = &.{ .xmm_mask_limb, .{ .implicit = 0 }, .xmm_limb } },
+                            .trailer = &.{
+                                .{ .p_, .xor, .xtmp1, .xtmp1, .none, .none },
+                                .{ .p_b, .cmpeq, .xtmp2, .xtmp1, .none, .none },
+                                .{ .p_b, .movmsk, .etmp0, .xtmp2, .none, .none },
+                                .{ ._, .xor, .etmp0, .{ .simm32 = std.math.maxInt(u16) }, .none, .none },
                             },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse, .mmx },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
                         },
-                        .{
-                            .required_features = &.{.mmx},
-                            .scalar = .{ .any_int = .byte },
-                            .loop = .elementwise,
-                            .mir_tag = .{ .p_b, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .mm_mask_limb, .{ .implicit = 0 }, .mem_limb } },
-                                .{ .ops = &.{ .mm_mask_limb, .mem_limb, .{ .implicit = 0 } } },
-                                .{ .ops = &.{ .mm_mask_limb, .{ .implicit = 0 }, .mm_limb } },
-                            },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .kind = .{ .rc = .mmx } },
+                            .{ .kind = .{ .rc = .mmx } },
                         },
-                        .{
-                            .required_features = &.{.mmx},
-                            .scalar = .{ .any_int = .word },
-                            .loop = .elementwise,
-                            .mir_tag = .{ .p_w, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .mm_mask_limb, .{ .implicit = 0 }, .mem_limb } },
-                                .{ .ops = &.{ .mm_mask_limb, .mem_limb, .{ .implicit = 0 } } },
-                                .{ .ops = &.{ .mm_mask_limb, .{ .implicit = 0 }, .mm_limb } },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .limb = .{
+                            .of = .rsrc0,
+                            .header = &.{
+                                .{ .p_, .xor, .rtmp2, .rtmp2, .none, .none },
                             },
-                        },
-                        .{
-                            .required_features = &.{.mmx},
-                            .scalar = .{ .any_int = .dword },
-                            .loop = .elementwise,
-                            .mir_tag = .{ .p_d, .cmpeq },
-                            .patterns = &.{
-                                .{ .ops = &.{ .mm_mask_limb, .{ .implicit = 0 }, .mem_limb } },
-                                .{ .ops = &.{ .mm_mask_limb, .mem_limb, .{ .implicit = 0 } } },
-                                .{ .ops = &.{ .mm_mask_limb, .{ .implicit = 0 }, .mm_limb } },
+                            .body = &.{
+                                .{ ._q, .mov, .rtmp1, .{ .src_limb = 0 }, .none, .none },
+                                .{ .p_, .xor, .rtmp1, .{ .src_limb = 1 }, .none, .none },
+                                .{ .p_, .@"or", .rtmp2, .rtmp1, .none, .none },
                             },
-                        },
-                        .{
-                            .scalar = .bool,
-                            .clobbers = .{ .eflags = true },
-                            .invert_result = true,
-                            .loop = .elementwise,
-                            .mir_tag = .{ ._, .xor },
-                            .patterns = &.{
-                                .{ .ops = &.{ .mem_limb, .{ .implicit = 0 }, .gpr_limb } },
-                                .{ .ops = &.{ .mem_limb, .gpr_limb, .{ .implicit = 0 } } },
-                                .{ .ops = &.{ .gpr_limb, .{ .implicit = 0 }, .mem_limb } },
-                                .{ .ops = &.{ .gpr_limb, .mem_limb, .{ .implicit = 0 } } },
-                                .{ .ops = &.{ .gpr_limb, .{ .implicit = 0 }, .gpr_limb } },
+                            .trailer = &.{
+                                .{ .p_, .xor, .rtmp1, .rtmp1, .none, .none },
+                                .{ .p_b, .cmpeq, .rtmp2, .rtmp1, .none, .none },
+                                .{ .p_b, .movmsk, .etmp0, .rtmp2, .none, .none },
+                                .{ ._, .xor, .etmp0, .{ .simm32 = std.math.maxInt(u8) }, .none, .none },
                             },
+                        } },
+                    }, .{
+                        .required_features = .{ .@"64bit", null },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
                         },
-                        .{
-                            .scalar = .{ .any_int = .byte },
-                            .clobbers = .{ .eflags = true },
-                            .loop = .elementwise,
-                            .mir_tag = .{ ._, .cmp },
-                            .patterns = &.{
-                                .{ .ops = &.{ .cc_mask_limb, .mem_limb, .gpr_limb } },
-                                .{ .ops = &.{ .cc_mask_limb, .gpr_limb, .mem_limb } },
-                                .{ .ops = &.{ .cc_mask_limb, .gpr_limb, .gpr_limb } },
-                            },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
                         },
-                        .{
-                            .scalar = .{ .any_int = .word },
-                            .clobbers = .{ .eflags = true },
-                            .loop = .elementwise,
-                            .mir_tag = .{ ._, .cmp },
-                            .patterns = &.{
-                                .{ .ops = &.{ .cc_mask_limb, .mem_limb, .gpr_limb } },
-                                .{ .ops = &.{ .cc_mask_limb, .gpr_limb, .mem_limb } },
-                                .{ .ops = &.{ .cc_mask_limb, .gpr_limb, .gpr_limb } },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .limb = .{
+                            .of = .rsrc0,
+                            .header = &.{
+                                .{ ._, .xor, .rtmp2, .rtmp2, .none, .none },
                             },
-                        },
-                        .{
-                            .scalar = .{ .any_int = .dword },
-                            .clobbers = .{ .eflags = true },
-                            .loop = .elementwise,
-                            .mir_tag = .{ ._, .cmp },
-                            .patterns = &.{
-                                .{ .ops = &.{ .cc_mask_limb, .mem_limb, .gpr_limb } },
-                                .{ .ops = &.{ .cc_mask_limb, .gpr_limb, .mem_limb } },
-                                .{ .ops = &.{ .cc_mask_limb, .gpr_limb, .gpr_limb } },
+                            .body = &.{
+                                .{ ._, .mov, .rtmp1, .{ .src_limb = 0 }, .none, .none },
+                                .{ ._, .xor, .rtmp1, .{ .src_limb = 1 }, .none, .none },
+                                .{ ._, .@"or", .rtmp2, .rtmp1, .none, .none },
                             },
-                        },
-                        .{
-                            .scalar = .{ .any_int = .qword },
-                            .clobbers = .{ .eflags = true },
-                            .loop = .elementwise,
-                            .mir_tag = .{ ._, .cmp },
-                            .patterns = &.{
-                                .{ .ops = &.{ .cc_mask_limb, .mem_limb, .gpr_limb } },
-                                .{ .ops = &.{ .cc_mask_limb, .gpr_limb, .mem_limb } },
-                                .{ .ops = &.{ .cc_mask_limb, .gpr_limb, .gpr_limb } },
+                            .trailer = &.{
+                                .{ ._, .@"test", .rtmp2, .rtmp2, .none, .none },
                             },
-                        },
+                        } },
                     }, .{
-                        .cc = .e,
-                        .invert_result = switch (cmp_op) {
-                            .eq => false,
-                            .neq => true,
-                            else => unreachable,
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
                         },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .each = .{ .limb = .{
+                            .of = .esrc0,
+                            .header = &.{
+                                .{ ._, .xor, .etmp2, .etmp2, .none, .none },
+                            },
+                            .body = &.{
+                                .{ ._, .mov, .etmp1, .{ .src_limb = 0 }, .none, .none },
+                                .{ ._, .xor, .etmp1, .{ .src_limb = 1 }, .none, .none },
+                                .{ ._, .@"or", .etmp2, .etmp1, .none, .none },
+                            },
+                            .trailer = &.{
+                                .{ ._, .@"test", .etmp2, .etmp2, .none, .none },
+                            },
+                        } },
+                    } },
+                }) catch |err| switch (err) {
+                    error.Select2Failed => return cg.fail("failed to select2 {s} {} {} {}", .{
+                        @tagName(air_tag),
+                        cg.typeOf(bin_op.lhs).fmt(pt),
+                        ops[0].tracking(cg),
+                        ops[1].tracking(cg),
                     }),
-                    .gte => unreachable,
-                    .gt => unreachable,
-                }
+                    else => |e| return e,
+                };
                 if (ops[0].index != res[0].index) try ops[0].die(cg);
                 if (ops[1].index != res[0].index) try ops[1].die(cg);
                 try res[0].moveTo(inst, cg);
@@ -13850,7 +14892,6 @@ fn airCmp(self: *CodeGen, inst: Air.Inst.Index, op: std.math.CompareOperator) !v
                                     .undef,
                                     .immediate,
                                     .eflags,
-                                    .register,
                                     .register_offset,
                                     .register_overflow,
                                     .register_mask,
@@ -13864,7 +14905,7 @@ fn airCmp(self: *CodeGen, inst: Air.Inst.Index, op: std.math.CompareOperator) !v
                                     .reserved_frame,
                                     .air_ref,
                                     => unreachable,
-                                    .register_pair, .register_triple, .register_quadruple, .load_frame => null,
+                                    .register, .register_pair, .register_triple, .register_quadruple, .load_frame => null,
                                     .memory, .load_symbol, .load_got, .load_direct, .load_tlv => dst: {
                                         switch (resolved_dst_mcv) {
                                             .memory => |addr| if (std.math.cast(
@@ -13893,8 +14934,7 @@ fn airCmp(self: *CodeGen, inst: Air.Inst.Index, op: std.math.CompareOperator) !v
                                         };
                                     },
                                 };
-                                defer if (dst_info) |info|
-                                    self.register_manager.unlockReg(info.addr_lock);
+                                defer if (dst_info) |info| self.register_manager.unlockReg(info.addr_lock);
 
                                 const resolved_src_mcv = switch (src_mcv) {
                                     else => src_mcv,
@@ -16299,15 +17339,15 @@ fn genSetReg(
             const bits_lock = self.register_manager.lockReg(bits_reg);
             defer if (bits_lock) |lock| self.register_manager.unlockReg(lock);
 
-            const pack_reg = switch (src_reg_mask.scalar) {
+            const pack_reg = switch (src_reg_mask.info.scalar) {
                 else => src_reg_mask.reg,
                 .word => try self.register_manager.allocReg(null, abi.RegisterClass.sse),
             };
             const pack_lock = self.register_manager.lockReg(pack_reg);
             defer if (pack_lock) |lock| self.register_manager.unlockReg(lock);
 
-            var mask_size: u32 = @intCast(ty.vectorLen(zcu) * @divExact(src_reg_mask.scalar.bitSize(), 8));
-            switch (src_reg_mask.scalar) {
+            var mask_size: u32 = @intCast(ty.vectorLen(zcu) * @divExact(src_reg_mask.info.scalar.bitSize(), 8));
+            switch (src_reg_mask.info.scalar) {
                 else => {},
                 .word => {
                     const src_alias = registerAlias(src_reg_mask.reg, mask_size);
@@ -16321,13 +17361,13 @@ fn genSetReg(
                     mask_size = std.math.divCeil(u32, mask_size, 2) catch unreachable;
                 },
             }
-            try self.asmRegisterRegister(.{ switch (src_reg_mask.scalar) {
+            try self.asmRegisterRegister(.{ switch (src_reg_mask.info.scalar) {
                 .byte, .word => if (has_avx) .vp_b else .p_b,
                 .dword => if (has_avx) .v_ps else ._ps,
                 .qword => if (has_avx) .v_pd else ._pd,
                 else => unreachable,
             }, .movmsk }, bits_reg.to32(), registerAlias(pack_reg, mask_size));
-            if (src_reg_mask.inverted) try self.asmRegister(.{ ._, .not }, registerAlias(bits_reg, abi_size));
+            if (src_reg_mask.info.inverted) try self.asmRegister(.{ ._, .not }, registerAlias(bits_reg, abi_size));
             try self.genSetReg(dst_reg, ty, .{ .register = bits_reg }, .{});
         },
         .memory, .load_symbol, .load_direct, .load_got, .load_tlv => {
@@ -21239,18 +22279,14 @@ const Temp = struct {
         return true;
     }
 
-    fn toRegClass(temp: *Temp, rc: Register.Class, cg: *CodeGen) !bool {
-        const val, const ty = switch (temp.unwrap(cg)) {
-            .ref => |ref| .{ temp.tracking(cg).short, cg.typeOf(ref) },
-            .temp => |temp_index| val: {
-                const temp_tracking = temp_index.tracking(cg);
-                switch (temp_tracking.short) {
-                    else => {},
-                    .register => |reg| if (reg.class() == rc) return false,
-                }
-                break :val .{ temp_tracking.short, temp_index.typeOf(cg) };
-            },
+    fn toRegClass(temp: *Temp, mut: bool, rc: Register.Class, cg: *CodeGen) !bool {
+        const val = temp.tracking(cg).short;
+        if (!mut or temp.isMut(cg)) switch (val) {
+            else => {},
+            .register => |reg| if (reg.class() == rc) return false,
+            .register_offset => |reg_off| if (reg_off.reg.class() == rc and reg_off.off == 0) return false,
         };
+        const ty = temp.typeOf(cg);
         const new_temp_index = cg.next_temp_index;
         cg.temp_type[@intFromEnum(new_temp_index)] = ty;
         const new_reg = try cg.register_manager.allocReg(new_temp_index.toIndex(), regSetForRegClass(rc));
@@ -21281,15 +22317,11 @@ const Temp = struct {
         first_temp.* = result_temp;
     }
 
-    fn asMask(temp: Temp, kind: MaskKind, inverted: bool, scalar: Memory.Size, cg: *CodeGen) void {
-        assert(scalar != .none);
+    fn asMask(temp: Temp, info: MaskInfo, cg: *CodeGen) void {
+        assert(info.scalar != .none);
         const mcv = &temp.unwrap(cg).temp.tracking(cg).short;
-        mcv.* = .{ .register_mask = .{
-            .reg = mcv.register,
-            .kind = kind,
-            .inverted = inverted,
-            .scalar = scalar,
-        } };
+        const reg = mcv.register;
+        mcv.* = .{ .register_mask = .{ .reg = reg, .info = info } };
     }
 
     fn toLea(temp: *Temp, cg: *CodeGen) !bool {
@@ -21335,9 +22367,25 @@ const Temp = struct {
         }
     }
 
+    fn toMemory(temp: *Temp, cg: *CodeGen) !bool {
+        const temp_tracking = temp.tracking(cg);
+        if (temp_tracking.short.isMemory()) return false;
+        const new_temp_index = cg.next_temp_index;
+        const ty = temp.typeOf(cg);
+        cg.temp_type[@intFromEnum(new_temp_index)] = ty;
+        const new_frame_index = try cg.allocFrameIndex(.initSpill(ty, cg.pt.zcu));
+        try cg.genSetMem(.{ .frame = new_frame_index }, 0, ty, temp_tracking.short, .{});
+        new_temp_index.tracking(cg).* = .init(.{ .load_frame = .{ .index = new_frame_index } });
+        try temp.die(cg);
+        cg.next_temp_index = @enumFromInt(@intFromEnum(new_temp_index) + 1);
+        temp.* = .{ .index = new_temp_index.toIndex() };
+        return true;
+    }
+
     fn toBase(temp: *Temp, cg: *CodeGen) !bool {
         const temp_tracking = temp.tracking(cg);
         if (temp_tracking.short.isBase()) return false;
+        if (try temp.toMemory(cg)) return true;
         const new_temp_index = cg.next_temp_index;
         cg.temp_type[@intFromEnum(new_temp_index)] = temp.typeOf(cg);
         const new_reg =
@@ -21561,20 +22609,20 @@ fn tempAlloc(cg: *CodeGen, ty: Type) !Temp {
     return .{ .index = temp_index.toIndex() };
 }
 
-fn tempAllocReg(cg: *CodeGen, ty: Type, rc: RegisterManager.RegisterBitSet) !Temp {
+fn tempAllocReg(cg: *CodeGen, ty: Type, rs: RegisterManager.RegisterBitSet) !Temp {
     const temp_index = cg.next_temp_index;
     temp_index.tracking(cg).* = .init(
-        .{ .register = try cg.register_manager.allocReg(temp_index.toIndex(), rc) },
+        .{ .register = try cg.register_manager.allocReg(temp_index.toIndex(), rs) },
     );
     cg.temp_type[@intFromEnum(temp_index)] = ty;
     cg.next_temp_index = @enumFromInt(@intFromEnum(temp_index) + 1);
     return .{ .index = temp_index.toIndex() };
 }
 
-fn tempAllocRegPair(cg: *CodeGen, ty: Type, rc: RegisterManager.RegisterBitSet) !Temp {
+fn tempAllocRegPair(cg: *CodeGen, ty: Type, rs: RegisterManager.RegisterBitSet) !Temp {
     const temp_index = cg.next_temp_index;
     temp_index.tracking(cg).* = .init(
-        .{ .register_pair = try cg.register_manager.allocRegs(2, temp_index.toIndex(), rc) },
+        .{ .register_pair = try cg.register_manager.allocRegs(2, temp_index.toIndex(), rs) },
     );
     cg.temp_type[@intFromEnum(temp_index)] = ty;
     cg.next_temp_index = @enumFromInt(@intFromEnum(temp_index) + 1);
@@ -21696,18 +22744,21 @@ const Pattern = struct {
             once,
             /// execute the instruction on all groups of non-overlapping bits in the entire value
             bitwise,
-            /// for each element, execute the instruction on each limb, propogating the carry flag
+            /// for each element, execute the instruction on each limb, propagating the carry flag
             limbwise_carry,
+            /// for each element, execute the instruction on each limb, propagating a register
+            limbwise_reduce,
             /// for each element, execute the instruction on pairs of limbs, starting from the
-            /// least significant, propogating a limb
+            /// least significant, propagating a limb
             limbwise_pairs_forward,
             /// for each element, execute the instruction on pairs of limbs, starting from the
-            /// most significant, propogating a limb
+            /// most significant, propagating a limb
             limbwise_pairs_reverse,
             /// for each element, execute the instruction
             elementwise,
         } = .once,
         mir_tag: Mir.Inst.FixedTag,
+        final_mir_tag: ?Mir.Inst.FixedTag = null,
         patterns: []const Pattern,
     };
 
@@ -21740,7 +22791,7 @@ const Pattern = struct {
         ymm_sign_mask,
         /// any memory
         mem,
-        /// a limb stored in a gpr
+        /// a limb stored in a general purpose register
         gpr_limb,
         /// a limb stored in a 64-bit mmx register
         mm_limb,
@@ -21750,8 +22801,16 @@ const Pattern = struct {
         ymm_limb,
         /// a limb stored in memory
         mem_limb,
-        /// a limb stored in a condition code
-        cc_mask_limb,
+        /// a mutable limb stored in a general purpose register
+        mut_gpr_limb,
+        /// a mutable limb stored in memory
+        mut_mem_limb,
+        /// an element stored in a condition code
+        cc_elem,
+        /// an element stored in a general purpose register
+        gpr_elem,
+        /// an element stored in memory
+        mem_elem,
         /// a limb stored in a 64-bit mmx register mask
         mm_mask_limb,
         /// a limb stored in a 128-bit sse register masuk
@@ -21773,7 +22832,7 @@ const Pattern = struct {
 
         fn matches(op: Op, is_mut: bool, temp: Temp, cg: *CodeGen) bool {
             switch (op) {
-                .implicit, .explicit, .cc, .cc_mask_limb => unreachable,
+                .implicit, .explicit, .cc, .cc_elem => unreachable,
                 else => {},
                 // temp is undefined
                 .umax_gpr, .umax_mm, .umax_xmm, .umax_ymm => return true,
@@ -21781,7 +22840,7 @@ const Pattern = struct {
             const temp_ty = temp.typeOf(cg);
             const abi_size = temp_ty.abiSize(cg.pt.zcu);
             return switch (op) {
-                .implicit, .explicit, .cc, .cc_mask_limb, .umax_gpr, .umax_mm, .umax_xmm, .umax_ymm => unreachable,
+                .implicit, .explicit, .cc, .cc_elem, .umax_gpr, .umax_mm, .umax_xmm, .umax_ymm => unreachable,
                 .gpr => abi_size <= 8 and switch (temp.tracking(cg).short) {
                     .register => |reg| reg.class() == .general_purpose,
                     .register_offset => |reg_off| reg_off.reg.class() == .general_purpose and
@@ -21803,8 +22862,8 @@ const Pattern = struct {
                     .register_offset => |reg_off| reg_off.reg.class() == .sse and reg_off.off == 0,
                     else => cg.regClassForType(temp_ty) == .sse,
                 },
-                .mem, .mem_limb => (!is_mut or temp.isMut(cg)) and temp.tracking(cg).short.isMemory(),
-                .gpr_limb => abi_size > 8 and switch (temp.tracking(cg).short) {
+                .mem, .mem_limb, .mut_mem_limb, .mem_elem => (!is_mut or temp.isMut(cg)) and temp.tracking(cg).short.isMemory(),
+                .gpr_limb, .mut_gpr_limb, .gpr_elem => abi_size > 8 and switch (temp.tracking(cg).short) {
                     .register, .register_pair, .register_triple, .register_quadruple => true,
                     else => |mcv| mcv.isMemory(),
                 },
@@ -21837,6 +22896,20 @@ const Pattern = struct {
             };
         }
     };
+
+    const Instruction = struct {
+        mir_tag: Mir.Inst.FixedTag,
+        operands: [4]Instruction.Operand,
+
+        const Operand = union(enum) {
+            regb: u8,
+            regw: u8,
+            ereg: u8,
+            rreg: u8,
+            xmm: u8,
+            ymm: u8,
+        };
+    };
 };
 const SelectOptions = struct {
     cc: ?Condition = null,
@@ -21932,6 +23005,7 @@ fn select(
             for (src_temps, pattern.ops[dst_temps.len..]) |src_temp, src_op| {
                 const ref_src_op, const is_mut = switch (src_op) {
                     .implicit, .explicit => |linked_index| .{ pattern.ops[linked_index], true },
+                    .mut_mem_limb, .mut_gpr_limb => .{ src_op, true },
                     else => .{ src_op, false },
                 };
                 if (!ref_src_op.matches(is_mut, src_temp, cg)) continue :patterns;
@@ -21939,7 +23013,7 @@ fn select(
 
             for (pattern.ops) |op| switch (op) {
                 else => {},
-                .cc_mask_limb,
+                .cc_elem,
                 .mm_mask_limb,
                 .xmm_mask_limb,
                 .ymm_mask_limb,
@@ -21953,16 +23027,16 @@ fn select(
                     .implicit, .explicit => |linked_index| pattern.ops[linked_index],
                     else => src_op,
                 }) {
-                    .implicit, .explicit, .cc, .cc_mask_limb => unreachable,
-                    .gpr => try src_temp.toRegClass(.general_purpose, cg),
-                    .mm, .mm_mask, .mm_sign_mask => try src_temp.toRegClass(.mmx, cg),
+                    .implicit, .explicit, .cc, .cc_elem => unreachable,
+                    .gpr => try src_temp.toRegClass(true, .general_purpose, cg),
+                    .mm, .mm_mask, .mm_sign_mask => try src_temp.toRegClass(true, .mmx, cg),
                     .xmm,
                     .ymm,
                     .xmm_mask,
                     .ymm_mask,
                     .xmm_sign_mask,
                     .ymm_sign_mask,
-                    => try src_temp.toRegClass(.sse, cg),
+                    => try src_temp.toRegClass(true, .sse, cg),
                     .mem => try src_temp.toBase(cg),
                     .imm, .simm32 => false,
                     .gpr_limb,
@@ -21970,6 +23044,10 @@ fn select(
                     .xmm_limb,
                     .ymm_limb,
                     .mem_limb,
+                    .mut_gpr_limb,
+                    .mut_mem_limb,
+                    .gpr_elem,
+                    .mem_elem,
                     => switch (src_temp.tracking(cg).short) {
                         .register, .register_pair, .register_triple, .register_quadruple => false,
                         else => try src_temp.toBase(cg),
@@ -21984,7 +23062,7 @@ fn select(
             var mir_ops_len: usize = 0;
             for (pattern.ops[0..dst_temps.len]) |dst_op| switch (dst_op) {
                 else => mir_ops_len += 1,
-                .cc, .cc_mask_limb => {},
+                .cc, .cc_elem => {},
             };
             const dst_mir_ops_len = mir_ops_len;
             for (src_temps, pattern.ops[dst_temps.len..]) |src_temp, src_op| {
@@ -22001,11 +23079,11 @@ fn select(
                 };
                 const limb_size: u8, const rc = switch (linked_src_op) {
                     else => continue,
-                    .gpr_limb => .{ @intCast(@divExact(@as(Memory.Size, switch (pattern_set.scalar) {
+                    .gpr_limb, .mut_gpr_limb, .gpr_elem => .{ @intCast(@divExact(Memory.Size.bitSize(switch (pattern_set.scalar) {
                         .any => .qword,
                         .bool => unreachable,
                         .float, .any_int, .signed_int, .unsigned_int, .any_float_or_int => |size| size,
-                    }).bitSize(), 8)), abi.RegisterClass.gp },
+                    }), 8)), abi.RegisterClass.gp },
                     .mm_limb, .mm_mask_limb => .{ 8, @panic("TODO") },
                     .xmm_limb, .xmm_mask_limb => .{ 16, abi.RegisterClass.sse },
                     .ymm_limb, .ymm_mask_limb => .{ 32, abi.RegisterClass.sse },
@@ -22030,7 +23108,10 @@ fn select(
                             .unused, .temp => loop.limb_offset = .{ .known = 0 },
                             .known => {},
                         }
-                        if (!rc.isSet(RegisterManager.indexOfRegIntoTracked(src_mcv.getRegs()[0]).?)) {
+                        if (switch (linked_src_op) {
+                            .mut_gpr_limb => true,
+                            else => !rc.isSet(RegisterManager.indexOfRegIntoTracked(src_mcv.getRegs()[0]).?),
+                        }) {
                             if (loop.shuffle_temp == null) loop.shuffle_temp = try cg.tempAllocReg(.noreturn, abi.RegisterClass.sse);
                             assert(extra_temp.* == null);
                             extra_temp.* = try cg.tempAllocReg(.usize, rc);
@@ -22066,25 +23147,26 @@ fn select(
                         .xmm, .xmm_mask, .xmm_sign_mask => try cg.tempAllocReg(dst_ty, abi.RegisterClass.sse),
                         .ymm, .ymm_mask, .ymm_sign_mask => try cg.tempAllocReg(dst_ty, abi.RegisterClass.sse),
                         .mem => @panic("TODO"),
-                        .gpr_limb, .mm_limb, .xmm_limb, .ymm_limb => {
+                        .gpr_limb, .mm_limb, .xmm_limb, .ymm_limb, .mut_gpr_limb, .gpr_elem => {
                             if (extra_temp.* == null) extra_temp.* = try cg.tempAllocReg(.noreturn, switch (dst_op) {
                                 else => unreachable,
-                                .gpr_limb => abi.RegisterClass.gp,
+                                .gpr_limb, .mut_gpr_limb, .gpr_elem => abi.RegisterClass.gp,
                                 .mm_limb => @panic("TODO"),
                                 .xmm_limb, .ymm_limb => abi.RegisterClass.sse,
                             });
                             break :dst_temp try cg.tempAlloc(dst_ty);
                         },
-                        .mem_limb => try cg.tempAlloc(dst_ty),
-                        .cc_mask_limb, .mm_mask_limb, .xmm_mask_limb, .ymm_mask_limb => unreachable, // already checked
+                        .mem_limb, .mut_mem_limb, .mem_elem => try cg.tempAlloc(dst_ty),
+                        .cc_elem, .mm_mask_limb, .xmm_mask_limb, .ymm_mask_limb => unreachable, // already checked
                         .imm, .simm32, .umax_gpr, .umax_mm, .umax_xmm, .umax_ymm => unreachable, // unmodifiable destination
                     };
                 },
-                .cc_mask_limb, .mm_mask_limb, .xmm_mask_limb, .ymm_mask_limb => {
-                    const scalar_size = @divExact(switch (pattern_set.scalar) {
-                        .any, .bool => unreachable,
+                .cc_elem, .mm_mask_limb, .xmm_mask_limb, .ymm_mask_limb => {
+                    const scalar_size = @divExact(Memory.Size.bitSize(switch (pattern_set.scalar) {
+                        .any => .qword,
+                        .bool => unreachable,
                         .float, .any_int, .signed_int, .unsigned_int, .any_float_or_int => |size| size,
-                    }.bitSize(), 8);
+                    }), 8);
                     const mask_bit_size = @divExact(loop.remaining_size.?, scalar_size);
                     const mask_limb_bit_size: u7 = @intCast(@divExact(loop.limb_size.?, scalar_size));
                     assert(loop.mask_limb_bit_size == null or loop.mask_limb_bit_size == mask_limb_bit_size);
@@ -22104,7 +23186,7 @@ fn select(
                     loop.mask_limb_offset = loop.limb_offset;
                     if (loop.mask_limb_temp == null) {
                         loop.mask_limb_temp = try cg.tempAllocReg(.usize, abi.RegisterClass.gp);
-                        if (dst_op == .cc_mask_limb and mask_store_bit_size > 8) {
+                        if (dst_op == .cc_elem and mask_store_bit_size > 8) {
                             // setcc only clears 8 bits
                             const mask_limb_alias = loop.mask_limb_temp.?.tracking(cg).short.register.to32();
                             try cg.spillEflagsIfOccupied();
@@ -22195,6 +23277,8 @@ fn select(
                             .mm_limb,
                             .xmm_limb,
                             .ymm_limb,
+                            .mut_gpr_limb,
+                            .gpr_elem,
                             .mm_mask_limb,
                             .xmm_mask_limb,
                             .ymm_mask_limb,
@@ -22304,7 +23388,7 @@ fn select(
                                 else => try cg.asmRegisterMemory(
                                     switch (linked_src_op) {
                                         else => unreachable,
-                                        .gpr_limb => .{ ._, .mov },
+                                        .gpr_limb, .mut_gpr_limb, .gpr_elem => .{ ._, .mov },
                                         .mm_limb, .mm_mask_limb => .{ ._q, .mov },
                                         .xmm_limb,
                                         .ymm_limb,
@@ -22328,7 +23412,7 @@ fn select(
                             },
                         }
                         mir_op.* = switch (linked_src_op) {
-                            .implicit, .explicit, .cc, .cc_mask_limb => unreachable,
+                            .implicit, .explicit, .cc, .cc_elem => unreachable,
                             .gpr => .{ .reg = registerAlias(
                                 src_mcv.register,
                                 @intCast(src_temp.typeOf(cg).abiSize(cg.pt.zcu)),
@@ -22342,6 +23426,8 @@ fn select(
                             .mm_limb,
                             .xmm_limb,
                             .ymm_limb,
+                            .mut_gpr_limb,
+                            .gpr_elem,
                             .mm_mask_limb,
                             .xmm_mask_limb,
                             .ymm_mask_limb,
@@ -22349,7 +23435,7 @@ fn select(
                                 limb_temp.tracking(cg).short.register
                             else
                                 src_mcv.getRegs()[@divExact(loop.limb_offset.known, loop.limb_size.?)], loop.limb_size.?) },
-                            .mem_limb => .{ .mem = switch (src_mcv) {
+                            .mem_limb, .mut_mem_limb, .mem_elem => .{ .mem = switch (src_mcv) {
                                 .register, .register_pair, .register_triple, .register_quadruple => unreachable,
                                 else => switch (loop.limb_offset) {
                                     .unused => unreachable,
@@ -22394,7 +23480,7 @@ fn select(
                     ) |*mir_op, dst_op, dst_temp, dst_ty, extra_temp| {
                         if (mir_op.* != .none) continue;
                         mir_op.* = switch (dst_op) {
-                            .implicit, .cc, .cc_mask_limb => unreachable,
+                            .implicit, .cc, .cc_elem => unreachable,
                             .explicit => |linked_index| mir_ops[linked_index],
                             .gpr => .{ .reg = registerAlias(
                                 dst_temp.tracking(cg).short.register,
@@ -22404,18 +23490,18 @@ fn select(
                             .xmm, .xmm_mask, .xmm_sign_mask => .{ .reg = dst_temp.tracking(cg).short.register.to128() },
                             .ymm, .ymm_mask, .ymm_sign_mask => .{ .reg = dst_temp.tracking(cg).short.register.to256() },
                             .mem => @panic("TODO"),
-                            .gpr_limb => .{ .reg = registerAlias(
+                            .gpr_limb, .mut_gpr_limb, .gpr_elem => .{ .reg = registerAlias(
                                 extra_temp.?.tracking(cg).short.register,
-                                @intCast(@divExact(@as(Memory.Size, switch (pattern_set.scalar) {
+                                @intCast(@divExact(Memory.Size.bitSize(switch (pattern_set.scalar) {
                                     .any => .qword,
                                     .bool => unreachable,
                                     .float, .any_int, .signed_int, .unsigned_int, .any_float_or_int => |size| size,
-                                }).bitSize(), 8)),
+                                }), 8)),
                             ) },
                             .mm_limb => .{ .reg = extra_temp.?.tracking(cg).short.register },
                             .xmm_limb => .{ .reg = extra_temp.?.tracking(cg).short.register.to128() },
                             .ymm_limb => .{ .reg = extra_temp.?.tracking(cg).short.register.to256() },
-                            .mem_limb => .{ .mem = try dst_temp.tracking(cg).short.mem(cg, switch (loop.limb_offset) {
+                            .mem_limb, .mut_mem_limb, .mem_elem => .{ .mem = try dst_temp.tracking(cg).short.mem(cg, switch (loop.limb_offset) {
                                 .unused => unreachable,
                                 .known => |limb_offset| .{
                                     .size = .fromSize(loop.limb_size.?),
@@ -22434,7 +23520,11 @@ fn select(
                     }
                     std.mem.swap(Operand, &mir_ops[pattern.commute[0]], &mir_ops[pattern.commute[1]]);
                     if (pattern_set.clobbers.eflags) try cg.spillEflagsIfOccupied();
-                    cg.asmOps(pattern_set.mir_tag, mir_ops) catch |err| switch (err) {
+                    cg.asmOps((if (loop.remaining_size != null and loop.limb_size != null and
+                        loop.remaining_size.? <= loop.limb_size.?)
+                        pattern_set.final_mir_tag
+                    else
+                        null) orelse pattern_set.mir_tag, mir_ops) catch |err| switch (err) {
                         error.InvalidInstruction => {
                             const fixes = @tagName(pattern_set.mir_tag[0]);
                             const fixes_blank = std.mem.indexOfScalar(u8, fixes, '_').?;
@@ -22478,15 +23568,19 @@ fn select(
                         .mm_sign_mask,
                         .xmm_sign_mask,
                         .ymm_sign_mask,
-                        => dst_temp.asMask(switch (dst_op) {
-                            else => unreachable,
-                            .mm_mask, .xmm_mask, .ymm_mask => .all,
-                            .mm_sign_mask, .xmm_sign_mask, .ymm_sign_mask => .sign,
-                        }, invert_result, switch (pattern_set.scalar) {
-                            .any, .bool => unreachable,
-                            .float, .any_int, .signed_int, .unsigned_int, .any_float_or_int => |size| size,
+                        => dst_temp.asMask(.{
+                            .kind = switch (dst_op) {
+                                else => unreachable,
+                                .mm_mask, .xmm_mask, .ymm_mask => .all,
+                                .mm_sign_mask, .xmm_sign_mask, .ymm_sign_mask => .sign,
+                            },
+                            .inverted = invert_result,
+                            .scalar = switch (pattern_set.scalar) {
+                                .any, .bool => unreachable,
+                                .float, .any_int, .signed_int, .unsigned_int, .any_float_or_int => |size| size,
+                            },
                         }, cg),
-                        .gpr_limb, .mm_limb, .xmm_limb, .ymm_limb => if (extra_temp) |limb_temp| {
+                        .gpr_limb, .mm_limb, .xmm_limb, .ymm_limb, .mut_gpr_limb, .gpr_elem => if (extra_temp) |limb_temp| {
                             const dst_mcv = dst_temp.tracking(cg).short;
                             switch (dst_mcv) {
                                 .register_pair, .register_triple, .register_quadruple => try cg.asmRegisterRegister(
@@ -22497,7 +23591,7 @@ fn select(
                                 else => try cg.asmMemoryRegister(
                                     switch (dst_op) {
                                         else => unreachable,
-                                        .gpr_limb => .{ ._, .mov },
+                                        .gpr_limb, .mut_gpr_limb, .gpr_elem => .{ ._, .mov },
                                         .mm_limb => .{ ._q, .mov },
                                         .xmm_limb, .ymm_limb => .{ if (cg.hasFeature(.avx)) .v_ else ._, .movdqu },
                                     },
@@ -22516,9 +23610,10 @@ fn select(
                                 ),
                             }
                         },
-                        .cc_mask_limb, .mm_mask_limb, .xmm_mask_limb, .ymm_mask_limb => {
+                        .cc_elem, .mm_mask_limb, .xmm_mask_limb, .ymm_mask_limb => {
                             const scalar_size = switch (pattern_set.scalar) {
-                                .any, .bool => unreachable,
+                                .any => .qword,
+                                .bool => unreachable,
                                 .float, .any_int, .signed_int, .unsigned_int, .any_float_or_int => |size| size,
                             };
                             const mask_store_size: u4 =
@@ -22534,7 +23629,7 @@ fn select(
                                 loop.mask_store_reg.?, mask_store_size);
                             switch (dst_op) {
                                 else => unreachable,
-                                .cc_mask_limb => try cg.asmSetccRegister(switch (invert_result) {
+                                .cc_elem => try cg.asmSetccRegister(switch (invert_result) {
                                     false => opts.cc.?,
                                     true => opts.cc.?.negate(),
                                 }, mask_limb_reg.to8()),
@@ -22678,6 +23773,7 @@ fn select(
                         .once => break :pattern_sets,
                         .bitwise => {},
                         .limbwise_carry => @panic("TODO"),
+                        .limbwise_reduce => @panic("TODO"),
                         .limbwise_pairs_forward => @panic("TODO"),
                         .limbwise_pairs_reverse => @panic("TODO"),
                         .elementwise => {},
@@ -22743,9 +23839,433 @@ fn select(
             }
         }
     } else {
-        log.err("failed to select:", .{});
+        log.err("failed to select {s}:", .{@tagName(pattern_sets[0].mir_tag[1])});
         for (src_temps) |src_temp| log.err("{}", .{src_temp.tracking(cg)});
-        return cg.fail("failed to select", .{});
+        return error.SelectFailed;
     }
     for (extra_temps) |extra_temp| if (extra_temp) |temp| try temp.die(cg);
 }
+
+const Select2 = struct {
+    cg: *CodeGen,
+    case: *const Case,
+    pattern: *const Select2.Pattern,
+    extra_temps: [3]Temp,
+    dst_temps: []const Temp,
+    src_temps: []const Temp,
+    commute: struct { u8, u8 },
+    limb: Memory.Mod.Rm,
+
+    fn emit(s: Select2, inst: Instruction) !void {
+        const mir_tag: Mir.Inst.FixedTag = .{ inst[0], inst[1] };
+        var mir_ops: [4]CodeGen.Operand = undefined;
+        inline for (&mir_ops, 2..) |*mir_op, inst_index| mir_op.* = try inst[inst_index].lower(s);
+        s.cg.asmOps(mir_tag, mir_ops) catch |err| switch (err) {
+            error.InvalidInstruction => {
+                const fixes = @tagName(mir_tag[0]);
+                const fixes_blank = std.mem.indexOfScalar(u8, fixes, '_').?;
+                return s.cg.fail(
+                    "invalid instruction: '{s}{s}{s} {s} {s} {s} {s}'",
+                    .{
+                        fixes[0..fixes_blank],
+                        @tagName(mir_tag[1]),
+                        fixes[fixes_blank + 1 ..],
+                        @tagName(mir_ops[0]),
+                        @tagName(mir_ops[1]),
+                        @tagName(mir_ops[2]),
+                        @tagName(mir_ops[3]),
+                    },
+                );
+            },
+            else => |e| return e,
+        };
+    }
+
+    fn lowerLimb(s: Select2, temp: Temp) !CodeGen.Operand {
+        return .{ .mem = try temp.tracking(s.cg).short.mem(s.cg, s.limb) };
+    }
+
+    fn srcTemp(s: Select2, index: u8) Temp {
+        return s.src_temps[
+            if (index == s.commute[0])
+                s.commute[1]
+            else if (index == s.commute[1])
+                s.commute[0]
+            else
+                index
+        ];
+    }
+
+    const Case = struct {
+        required_features: [2]?std.Target.x86.Feature = @splat(null),
+        constraints: [2]Constraint = @splat(.any),
+        patterns: []const Select2.Pattern,
+        clobbers: struct { eflags: bool = false } = .{},
+        extra_temps: [3]TempSpec = @splat(.unused),
+        dst_temps: [1]TempSpec.Kind = @splat(.unused),
+        each: union(enum) {
+            once: []const Instruction,
+            limb: struct {
+                of: Select2.Operand,
+                header: []const Instruction = &.{},
+                first: ?[]const Instruction = null,
+                body: []const Instruction,
+                last: ?[]const Instruction = null,
+                trailer: []const Instruction = &.{},
+            },
+        },
+    };
+
+    const Constraint = union(enum) {
+        any,
+        any_int,
+        any_float,
+        bool_vec: Memory.Size,
+        int: Memory.Size,
+        signed_int: Memory.Size,
+        unsigned_int: Memory.Size,
+
+        fn accepts(constraint: Constraint, temp: Temp, cg: *CodeGen) bool {
+            const zcu = cg.pt.zcu;
+            switch (constraint) {
+                .any => return true,
+                .any_int => {
+                    const scalar_ty = temp.typeOf(cg).scalarType(zcu);
+                    return scalar_ty.isAbiInt(zcu) or scalar_ty.isPtrAtRuntime(zcu);
+                },
+                .any_float => return temp.typeOf(cg).scalarType(zcu).isRuntimeFloat(),
+                .bool_vec => |size| {
+                    const ty = temp.typeOf(cg);
+                    return ty.isVector(zcu) and ty.scalarType(zcu).toIntern() == .bool_type and
+                        ty.vectorLen(zcu) <= size.bitSize();
+                },
+                .int => |size| {
+                    const scalar_ty = temp.typeOf(cg).scalarType(zcu);
+                    if (scalar_ty.isPtrAtRuntime(zcu)) return cg.target.ptrBitWidth() <= size.bitSize();
+                    return scalar_ty.isAbiInt(zcu) and scalar_ty.intInfo(zcu).bits <= size.bitSize();
+                },
+                .signed_int => |size| {
+                    const scalar_ty = temp.typeOf(cg).scalarType(zcu);
+                    if (!scalar_ty.isAbiInt(zcu)) return false;
+                    const info = scalar_ty.intInfo(zcu);
+                    return info.signedness == .signed and info.bits <= size.bitSize();
+                },
+                .unsigned_int => |size| {
+                    const scalar_ty = temp.typeOf(cg).scalarType(zcu);
+                    if (scalar_ty.isPtrAtRuntime(zcu)) return cg.target.ptrBitWidth() <= size.bitSize();
+                    if (!scalar_ty.isAbiInt(zcu)) return false;
+                    const info = scalar_ty.intInfo(zcu);
+                    return info.signedness == .unsigned and info.bits <= size.bitSize();
+                },
+            }
+        }
+    };
+
+    const Pattern = struct {
+        src: [2]Src,
+        commute: struct { u8, u8 } = .{ 0, 0 },
+
+        const Src = enum {
+            none,
+            any,
+            imm8,
+            imm16,
+            imm32,
+            simm32,
+            mem,
+            mut_mem,
+            to_mem,
+            gpr,
+            mut_gpr,
+            mm,
+            mut_mm,
+            xmm,
+            mut_xmm,
+            ymm,
+            mut_ymm,
+
+            fn matches(src: Src, temp: Temp, cg: *CodeGen) bool {
+                switch (src) {
+                    .none => unreachable,
+                    .any => return true,
+                    .imm8 => return switch (temp.tracking(cg).short) {
+                        .immediate => |imm| std.math.cast(u8, imm) != null,
+                        else => false,
+                    },
+                    .imm16 => return switch (temp.tracking(cg).short) {
+                        .immediate => |imm| std.math.cast(u16, imm) != null,
+                        else => false,
+                    },
+                    .imm32 => return switch (temp.tracking(cg).short) {
+                        .immediate => |imm| std.math.cast(u32, imm) != null,
+                        else => false,
+                    },
+                    .simm32 => return switch (temp.tracking(cg).short) {
+                        .immediate => |imm| std.math.cast(i32, @as(i64, @bitCast(imm))) != null,
+                        else => false,
+                    },
+                    .mem => return temp.tracking(cg).short.isMemory(),
+                    .mut_mem => return temp.isMut(cg) and temp.tracking(cg).short.isMemory(),
+                    .to_mem => return true,
+                    .gpr, .mut_gpr => {
+                        const mcv = temp.tracking(cg).short;
+                        const abi_size = temp.typeOf(cg).abiSize(cg.pt.zcu);
+                        return abi_size <= 8 and switch (mcv) {
+                            .register => |reg| reg.class() == .general_purpose,
+                            .register_offset => |reg_off| reg_off.reg.class() == .general_purpose and
+                                reg_off.off == 0,
+                            .register_pair, .register_triple, .register_quadruple => false,
+                            else => true,
+                        };
+                    },
+                    .mm, .mut_mm => {
+                        const mcv = temp.tracking(cg).short;
+                        const abi_size = temp.typeOf(cg).abiSize(cg.pt.zcu);
+                        return abi_size <= 8 and switch (mcv) {
+                            .register => |reg| reg.class() == .mmx,
+                            .register_offset => |reg_off| reg_off.reg.class() == .mmx and
+                                reg_off.off == 0,
+                            else => false,
+                        };
+                    },
+                    .xmm, .mut_xmm => {
+                        const mcv = temp.tracking(cg).short;
+                        const abi_size = temp.typeOf(cg).abiSize(cg.pt.zcu);
+                        return abi_size > 8 and abi_size <= 16 and switch (mcv) {
+                            .register => |reg| reg.class() == .sse,
+                            .register_offset => |reg_off| reg_off.reg.class() == .sse and
+                                reg_off.off == 0,
+                            .register_pair, .register_triple, .register_quadruple => false,
+                            else => true,
+                        };
+                    },
+                    .ymm, .mut_ymm => {
+                        const mcv = temp.tracking(cg).short;
+                        const abi_size = temp.typeOf(cg).abiSize(cg.pt.zcu);
+                        return abi_size > 16 and abi_size <= 32 and switch (mcv) {
+                            .register => |reg| reg.class() == .sse,
+                            .register_offset => |reg_off| reg_off.reg.class() == .sse and
+                                reg_off.off == 0,
+                            .register_pair, .register_triple, .register_quadruple => false,
+                            else => true,
+                        };
+                    },
+                }
+            }
+
+            fn convert(src: Src, temp: *Temp, cg: *CodeGen) !bool {
+                return switch (src) {
+                    .none => unreachable,
+                    .any, .imm8, .imm16, .imm32, .simm32 => false,
+                    .mem, .mut_mem, .to_mem => try temp.toBase(cg),
+                    .gpr => try temp.toRegClass(false, .general_purpose, cg),
+                    .mut_gpr => try temp.toRegClass(true, .general_purpose, cg),
+                    .mm => try temp.toRegClass(false, .mmx, cg),
+                    .mut_mm => try temp.toRegClass(true, .mmx, cg),
+                    .xmm, .ymm => try temp.toRegClass(false, .sse, cg),
+                    .mut_xmm, .mut_ymm => try temp.toRegClass(true, .sse, cg),
+                };
+            }
+        };
+    };
+
+    const TempSpec = struct {
+        type: Type = .noreturn,
+        kind: Kind,
+
+        const unused: TempSpec = .{ .kind = .unused };
+
+        const Kind = union(enum) {
+            unused,
+            any,
+            cc: Condition,
+            reg: Register,
+            rc: Register.Class,
+            rc_mask: struct { rc: Register.Class, info: MaskInfo },
+            mem,
+            src: u8,
+            src_mask: struct { src: u8, info: MaskInfo },
+
+            fn finish(kind: Kind, temp: Temp, s: Select2) void {
+                switch (kind) {
+                    else => {},
+                    inline .rc_mask, .src_mask => |mask| temp.asMask(mask.info, s.cg),
+                }
+            }
+        };
+
+        fn create(spec: TempSpec, s: Select2) !?Temp {
+            return switch (spec.kind) {
+                .unused => null,
+                .any => try s.cg.tempAlloc(spec.type),
+                .cc => |cc| try s.cg.tempFromValue(spec.type, .{ .eflags = cc }),
+                .reg => |reg| try s.cg.tempFromValue(spec.type, .{ .register = reg }),
+                .rc => |rc| try s.cg.tempAllocReg(spec.type, regSetForRegClass(rc)),
+                .rc_mask => |mask| try s.cg.tempAllocReg(spec.type, regSetForRegClass(mask.rc)),
+                .mem => try s.cg.tempAllocMem(spec.type),
+                .src => |src| s.srcTemp(src),
+                .src_mask => |mask| s.srcTemp(mask.src),
+            };
+        }
+    };
+
+    const Instruction = struct {
+        Mir.Inst.Fixes,
+        Mir.Inst.Tag,
+        Select2.Operand,
+        Select2.Operand,
+        Select2.Operand,
+        Select2.Operand,
+    };
+    const Operand = union(enum) {
+        none,
+        extra: struct { Memory.Size, u8 },
+        dst: struct { Memory.Size, u8 },
+        src: struct { Memory.Size, u8 },
+        dst_limb: u8,
+        src_limb: u8,
+        simm32: i32,
+
+        const tmp0b: Select2.Operand = .{ .extra = .{ .byte, 0 } };
+        const tmp0w: Select2.Operand = .{ .extra = .{ .word, 0 } };
+        const etmp0: Select2.Operand = .{ .extra = .{ .dword, 0 } };
+        const rtmp0: Select2.Operand = .{ .extra = .{ .qword, 0 } };
+        const xtmp0: Select2.Operand = .{ .extra = .{ .xword, 0 } };
+        const ytmp0: Select2.Operand = .{ .extra = .{ .yword, 0 } };
+
+        const tmp1b: Select2.Operand = .{ .extra = .{ .byte, 1 } };
+        const tmp1w: Select2.Operand = .{ .extra = .{ .word, 1 } };
+        const etmp1: Select2.Operand = .{ .extra = .{ .dword, 1 } };
+        const rtmp1: Select2.Operand = .{ .extra = .{ .qword, 1 } };
+        const xtmp1: Select2.Operand = .{ .extra = .{ .xword, 1 } };
+        const ytmp1: Select2.Operand = .{ .extra = .{ .yword, 1 } };
+
+        const tmp2b: Select2.Operand = .{ .extra = .{ .byte, 2 } };
+        const tmp2w: Select2.Operand = .{ .extra = .{ .word, 2 } };
+        const etmp2: Select2.Operand = .{ .extra = .{ .dword, 2 } };
+        const rtmp2: Select2.Operand = .{ .extra = .{ .qword, 2 } };
+        const xtmp2: Select2.Operand = .{ .extra = .{ .xword, 2 } };
+        const ytmp2: Select2.Operand = .{ .extra = .{ .yword, 2 } };
+
+        const dst0b: Select2.Operand = .{ .dst = .{ .byte, 0 } };
+        const dst0w: Select2.Operand = .{ .dst = .{ .word, 0 } };
+        const edst0: Select2.Operand = .{ .dst = .{ .dword, 0 } };
+        const rdst0: Select2.Operand = .{ .dst = .{ .qword, 0 } };
+        const xdst0: Select2.Operand = .{ .dst = .{ .xword, 0 } };
+        const ydst0: Select2.Operand = .{ .dst = .{ .yword, 0 } };
+
+        const src0b: Select2.Operand = .{ .src = .{ .byte, 0 } };
+        const src0w: Select2.Operand = .{ .src = .{ .word, 0 } };
+        const esrc0: Select2.Operand = .{ .src = .{ .dword, 0 } };
+        const rsrc0: Select2.Operand = .{ .src = .{ .qword, 0 } };
+        const xsrc0: Select2.Operand = .{ .src = .{ .xword, 0 } };
+        const ysrc0: Select2.Operand = .{ .src = .{ .yword, 0 } };
+
+        const src1b: Select2.Operand = .{ .src = .{ .byte, 1 } };
+        const src1w: Select2.Operand = .{ .src = .{ .word, 1 } };
+        const esrc1: Select2.Operand = .{ .src = .{ .dword, 1 } };
+        const rsrc1: Select2.Operand = .{ .src = .{ .qword, 1 } };
+        const xsrc1: Select2.Operand = .{ .src = .{ .xword, 1 } };
+        const ysrc1: Select2.Operand = .{ .src = .{ .yword, 1 } };
+
+        fn unwrap(op: Select2.Operand, s: Select2) struct { Memory.Size, Temp } {
+            return switch (op) {
+                else => unreachable,
+                .extra => |extra| .{ extra[0], s.extra_temps[extra[1]] },
+                .dst => |dst| .{ dst[0], s.dst_temps[dst[1]] },
+                .src => |src| .{ src[0], s.srcTemp(src[1]) },
+            };
+        }
+
+        fn lower(op: Select2.Operand, s: Select2) !CodeGen.Operand {
+            switch (op) {
+                .none => return .none,
+                else => {},
+                .dst_limb => |dst| return s.lowerLimb(s.dst_temps[dst]),
+                .src_limb => |src| return s.lowerLimb(s.srcTemp(src)),
+                .simm32 => |imm| return .{ .imm = .s(imm) },
+            }
+            const size, const temp = op.unwrap(s);
+            return switch (temp.tracking(s.cg).short) {
+                .immediate => |imm| .{ .imm = switch (size) {
+                    .byte => if (std.math.cast(i8, @as(i64, @bitCast(imm)))) |simm| .s(simm) else .u(@as(u8, @intCast(imm))),
+                    .word => if (std.math.cast(i16, @as(i64, @bitCast(imm)))) |simm| .s(simm) else .u(@as(u16, @intCast(imm))),
+                    .dword => if (std.math.cast(i32, @as(i64, @bitCast(imm)))) |simm| .s(simm) else .u(@as(u32, @intCast(imm))),
+                    .qword => if (std.math.cast(i32, @as(i64, @bitCast(imm)))) |simm| .s(simm) else .u(imm),
+                    else => unreachable,
+                } },
+                else => |mcv| .{ .mem = try mcv.mem(s.cg, .{ .size = size }) },
+                .register => |reg| .{ .reg = registerAlias(reg, @intCast(@divExact(size.bitSize(), 8))) },
+            };
+        }
+    };
+};
+fn select2(
+    cg: *CodeGen,
+    dst_temps: []Temp,
+    dst_tys: []const Type,
+    src_temps: []Temp,
+    cases: []const Select2.Case,
+) !void {
+    cases: for (cases) |*case| {
+        for (case.required_features) |required_feature| if (required_feature) |feature| if (!switch (feature) {
+            .@"64bit" => cg.target.cpu.arch == .x86_64,
+            .mmx => false,
+            else => cg.hasFeature(feature),
+        }) continue :cases;
+        for (case.constraints[0..src_temps.len], src_temps) |src_constraint, src_temp| if (!src_constraint.accepts(src_temp, cg)) continue :cases;
+        patterns: for (case.patterns) |*pattern| {
+            for (pattern.src, src_temps) |src_pattern, src_temp| if (!src_pattern.matches(src_temp, cg)) continue :patterns;
+
+            var s: Select2 = .{
+                .cg = cg,
+                .case = case,
+                .pattern = pattern,
+                .extra_temps = undefined,
+                .dst_temps = dst_temps,
+                .src_temps = src_temps,
+                .commute = pattern.commute,
+                .limb = undefined,
+            };
+            for (&s.extra_temps, case.extra_temps) |*temp, spec| temp.* = try spec.create(s) orelse continue;
+
+            while (true) for (pattern.src, src_temps) |src_pattern, *src_temp| {
+                if (try src_pattern.convert(src_temp, cg)) break;
+            } else break;
+
+            if (case.clobbers.eflags or case.each != .once) try cg.spillEflagsIfOccupied();
+
+            for (dst_temps, dst_tys, case.dst_temps[0..dst_temps.len]) |*dst_temp, dst_ty, dst_kind|
+                dst_temp.* = (try Select2.TempSpec.create(.{ .type = dst_ty, .kind = dst_kind }, s)).?;
+
+            switch (case.each) {
+                .once => |body| for (body) |inst| try s.emit(inst),
+                .limb => |limb| {
+                    const limb_size, const limb_of_temp = limb.of.unwrap(s);
+                    const limb_of_size: u31 = @intCast(limb_of_temp.typeOf(cg).abiSize(cg.pt.zcu));
+                    s.limb = .{
+                        .size = limb_size,
+                        .index = s.extra_temps[0].tracking(cg).short.register.to64(),
+                        .disp = limb_of_size,
+                    };
+                    for (limb.header) |inst| try s.emit(inst);
+                    try cg.asmRegisterImmediate(.{ ._, .mov }, s.limb.index, .s(-@as(i32, limb_of_size)));
+                    const limb_loop_reloc: u32 = @intCast(cg.mir_instructions.len);
+                    for (limb.body) |inst| try s.emit(inst);
+                    try cg.asmRegisterImmediate(
+                        .{ ._, .add },
+                        s.limb.index,
+                        .s(@intCast(@divExact(limb_size.bitSize(), 8))),
+                    );
+                    _ = try cg.asmJccReloc(.nc, limb_loop_reloc);
+                    for (limb.trailer) |inst| try s.emit(inst);
+                },
+            }
+
+            for (dst_temps, case.dst_temps[0..dst_temps.len]) |dst_temp, dst_kind| dst_kind.finish(dst_temp, s);
+            for (case.extra_temps, s.extra_temps) |spec, temp| if (spec.kind != .unused) try temp.die(cg);
+            return;
+        }
+    }
+    return error.Select2Failed;
+}
src/arch/x86_64/Encoding.zig
@@ -353,6 +353,7 @@ pub const Mnemonic = enum {
     pmovsxbd, pmovsxbq, pmovsxbw, pmovsxdq, pmovsxwd, pmovsxwq,
     pmovzxbd, pmovzxbq, pmovzxbw, pmovzxdq, pmovzxwd, pmovzxwq,
     pmulld,
+    ptest,
     roundpd, roundps, roundsd, roundss,
     // SSE4.2
     pcmpgtq,
@@ -413,6 +414,7 @@ pub const Mnemonic = enum {
     vpsrad, vpsraq, vpsraw,
     vpsrld, vpsrldq, vpsrlq, vpsrlw,
     vpsubb, vpsubd, vpsubq, vpsubsb, vpsubsw, vpsubusb, vpsubusw, vpsubw,
+    vptest,
     vpunpckhbw, vpunpckhdq, vpunpckhqdq, vpunpckhwd,
     vpunpcklbw, vpunpckldq, vpunpcklqdq, vpunpcklwd,
     vpxor,
@@ -421,6 +423,7 @@ pub const Mnemonic = enum {
     vsqrtpd, vsqrtps, vsqrtsd, vsqrtss,
     vstmxcsr,
     vsubpd, vsubps, vsubsd, vsubss,
+    vtestpd, vtestps,
     vxorpd, vxorps,
     // F16C
     vcvtph2ps, vcvtps2ph,
src/arch/x86_64/encodings.zig
@@ -1251,6 +1251,8 @@ pub const table = [_]Entry{
 
     .{ .pmulld, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x40 }, 0, .none, .sse4_1 },
 
+    .{ .ptest, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x17 }, 0, .none, .sse4_1 },
+
     .{ .roundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .none, .sse4_1 },
 
     .{ .roundps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x08 }, 0, .none, .sse4_1 },
@@ -1676,6 +1678,9 @@ pub const table = [_]Entry{
     .{ .vpsubusb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd8 }, 0, .vex_128_wig, .avx },
     .{ .vpsubusw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd9 }, 0, .vex_128_wig, .avx },
 
+    .{ .vptest, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x17 }, 0, .vex_128_wig, .avx },
+    .{ .vptest, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x17 }, 0, .vex_256_wig, .avx },
+
     .{ .vpunpckhbw,  .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x68 }, 0, .vex_128_wig, .avx },
     .{ .vpunpckhwd,  .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x69 }, 0, .vex_128_wig, .avx },
     .{ .vpunpckhdq,  .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6a }, 0, .vex_128_wig, .avx },
@@ -1726,6 +1731,11 @@ pub const table = [_]Entry{
 
     .{ .vsubss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .vex_lig_wig, .avx },
 
+    .{ .vtestps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x0e }, 0, .vex_128_w0, .avx },
+    .{ .vtestps, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x0e }, 0, .vex_256_w0, .avx },
+    .{ .vtestpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x0f }, 0, .vex_128_w0, .avx },
+    .{ .vtestpd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x0f }, 0, .vex_256_w0, .avx },
+
     .{ .vxorpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x57 }, 0, .vex_128_wig, .avx },
     .{ .vxorpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x57 }, 0, .vex_256_wig, .avx },
 
test/behavior/x86_64/math.zig
@@ -263,6 +263,12 @@ fn testBinary(comptime op: anytype) !void {
         0xbfd88aee1d82ed32, 0x20e91c15b701059a,
         0xed533d18f8657f3f, 0x1ddd7cd7f6bab957,
     });
+
+    if (false) try testType(@Vector(1, u128), .{
+        0x5f11e16b0ca3392f907a857881455d2e,
+    }, .{
+        0xf9142d73b408fd6955922f9fc147f7d7,
+    });
 }
 
 inline fn bitAnd(comptime Type: type, lhs: Type, rhs: Type) @TypeOf(lhs & rhs) {