Commit ac1a975f9b

Jacob Young <jacobly0@users.noreply.github.com>
2024-12-29 04:51:41
x86_64: implement clz and not
1 parent a7efc56
lib/std/math/big/int.zig
@@ -2520,12 +2520,13 @@ pub const Const = struct {
         return order(a, b) == .eq;
     }
 
+    /// Returns the number of leading zeros in twos-complement form.
     pub fn clz(a: Const, bits: Limb) Limb {
-        // Limbs are stored in little-endian order but we need
-        // to iterate big-endian.
+        // Limbs are stored in little-endian order but we need to iterate big-endian.
+        if (!a.positive) return 0;
         var total_limb_lz: Limb = 0;
         var i: usize = a.limbs.len;
-        const bits_per_limb = @sizeOf(Limb) * 8;
+        const bits_per_limb = @bitSizeOf(Limb);
         while (i != 0) {
             i -= 1;
             const limb = a.limbs[i];
@@ -2537,13 +2538,15 @@ pub const Const = struct {
         return total_limb_lz + bits - total_limb_bits;
     }
 
+    /// Returns the number of trailing zeros in twos-complement form.
     pub fn ctz(a: Const, bits: Limb) Limb {
-        // Limbs are stored in little-endian order.
+        // Limbs are stored in little-endian order. Converting a negative number to twos-complement
+        // flips all bits above the lowest set bit, which does not affect the trailing zero count.
         var result: Limb = 0;
         for (a.limbs) |limb| {
             const limb_tz = @ctz(limb);
             result += limb_tz;
-            if (limb_tz != @sizeOf(Limb) * 8) break;
+            if (limb_tz != @bitSizeOf(Limb)) break;
         }
         return @min(result, bits);
     }
lib/std/Target/Query.zig
@@ -6,13 +6,13 @@
 /// `null` means native.
 cpu_arch: ?Target.Cpu.Arch = null,
 
-cpu_model: CpuModel = CpuModel.determined_by_arch_os,
+cpu_model: CpuModel = .determined_by_arch_os,
 
 /// Sparse set of CPU features to add to the set from `cpu_model`.
-cpu_features_add: Target.Cpu.Feature.Set = Target.Cpu.Feature.Set.empty,
+cpu_features_add: Target.Cpu.Feature.Set = .empty,
 
 /// Sparse set of CPU features to remove from the set from `cpu_model`.
-cpu_features_sub: Target.Cpu.Feature.Set = Target.Cpu.Feature.Set.empty,
+cpu_features_sub: Target.Cpu.Feature.Set = .empty,
 
 /// `null` means native.
 os_tag: ?Target.Os.Tag = null,
@@ -38,7 +38,7 @@ abi: ?Target.Abi = null,
 
 /// When `os_tag` is `null`, then `null` means native. Otherwise it means the standard path
 /// based on the `os_tag`.
-dynamic_linker: Target.DynamicLinker = Target.DynamicLinker.none,
+dynamic_linker: Target.DynamicLinker = .none,
 
 /// `null` means default for the cpu/arch/os combo.
 ofmt: ?Target.ObjectFormat = null,
lib/std/Target/x86.zig
@@ -47,6 +47,7 @@ pub const Feature = enum {
     bmi2,
     branch_hint,
     branchfusion,
+    bsf_bsr_0_clobbers_result,
     ccmp,
     cf,
     cldemote,
@@ -167,6 +168,8 @@ pub const Feature = enum {
     slow_unaligned_mem_32,
     sm3,
     sm4,
+    smap,
+    smep,
     soft_float,
     sse,
     sse2,
@@ -497,6 +500,11 @@ pub const all_features = blk: {
         .description = "CMP/TEST can be fused with conditional branches",
         .dependencies = featureSet(&[_]Feature{}),
     };
+    result[@intFromEnum(Feature.bsf_bsr_0_clobbers_result)] = .{
+        .llvm_name = null,
+        .description = "BSF/BSR may clobber the lower 32-bits of the result register when the source is zero",
+        .dependencies = featureSet(&[_]Feature{}),
+    };
     result[@intFromEnum(Feature.ccmp)] = .{
         .llvm_name = "ccmp",
         .description = "Support conditional cmp & test instructions",
@@ -1127,6 +1135,16 @@ pub const all_features = blk: {
             .avx2,
         }),
     };
+    result[@intFromEnum(Feature.smap)] = .{
+        .llvm_name = null,
+        .description = "Enable Supervisor Mode Access Prevention",
+        .dependencies = featureSet(&[_]Feature{}),
+    };
+    result[@intFromEnum(Feature.smep)] = .{
+        .llvm_name = null,
+        .description = "Enable Supervisor Mode Execution Prevention",
+        .dependencies = featureSet(&[_]Feature{}),
+    };
     result[@intFromEnum(Feature.soft_float)] = .{
         .llvm_name = "soft-float",
         .description = "Use software floating point features",
@@ -1371,6 +1389,8 @@ pub const cpu = struct {
             .sha,
             .shstk,
             .slow_3ops_lea,
+            .smap,
+            .smep,
             .tuning_fast_imm_vector_shift,
             .vaes,
             .vpclmulqdq,
@@ -1467,6 +1487,8 @@ pub const cpu = struct {
             .sha,
             .shstk,
             .slow_3ops_lea,
+            .smap,
+            .smep,
             .tuning_fast_imm_vector_shift,
             .uintr,
             .vaes,
@@ -1545,6 +1567,8 @@ pub const cpu = struct {
             .slow_3ops_lea,
             .sm3,
             .sm4,
+            .smap,
+            .smep,
             .tuning_fast_imm_vector_shift,
             .uintr,
             .vaes,
@@ -1783,6 +1807,8 @@ pub const cpu = struct {
             .sahf,
             .sbb_dep_breaking,
             .slow_shld,
+            .smap,
+            .smep,
             .sse4a,
             .vzeroupper,
             .x87,
@@ -1995,6 +2021,8 @@ pub const cpu = struct {
             .rdseed,
             .sahf,
             .slow_3ops_lea,
+            .smap,
+            .smep,
             .vzeroupper,
             .x87,
             .xsaveopt,
@@ -2136,6 +2164,8 @@ pub const cpu = struct {
             .sahf,
             .sha,
             .slow_3ops_lea,
+            .smap,
+            .smep,
             .tuning_fast_imm_vector_shift,
             .vzeroupper,
             .x87,
@@ -2195,6 +2225,8 @@ pub const cpu = struct {
             .rdseed,
             .sahf,
             .slow_3ops_lea,
+            .smap,
+            .smep,
             .tuning_fast_imm_vector_shift,
             .vzeroupper,
             .x87,
@@ -2450,6 +2482,8 @@ pub const cpu = struct {
             .serialize,
             .sha,
             .shstk,
+            .smap,
+            .smep,
             .tsxldtrk,
             .tuning_fast_imm_vector_shift,
             .uintr,
@@ -2519,6 +2553,8 @@ pub const cpu = struct {
             .slow_incdec,
             .slow_lea,
             .slow_two_mem_ops,
+            .smap,
+            .smep,
             .sse4_2,
             .use_glm_div_sqrt_costs,
             .vzeroupper,
@@ -2898,6 +2934,7 @@ pub const cpu = struct {
             .rdrnd,
             .sahf,
             .slow_3ops_lea,
+            .smep,
             .vzeroupper,
             .x87,
             .xsaveopt,
@@ -2907,6 +2944,7 @@ pub const cpu = struct {
         .name = "i386",
         .llvm_name = "i386",
         .features = featureSet(&[_]Feature{
+            .bsf_bsr_0_clobbers_result,
             .slow_unaligned_mem_16,
             .vzeroupper,
             .x87,
@@ -2916,6 +2954,7 @@ pub const cpu = struct {
         .name = "i486",
         .llvm_name = "i486",
         .features = featureSet(&[_]Feature{
+            .bsf_bsr_0_clobbers_result,
             .slow_unaligned_mem_16,
             .vzeroupper,
             .x87,
@@ -3096,6 +3135,7 @@ pub const cpu = struct {
             .sahf,
             .slow_3ops_lea,
             .slow_unaligned_mem_32,
+            .smep,
             .vzeroupper,
             .x87,
             .xsaveopt,
@@ -3403,6 +3443,8 @@ pub const cpu = struct {
             .sha,
             .shstk,
             .slow_3ops_lea,
+            .smap,
+            .smep,
             .tuning_fast_imm_vector_shift,
             .vaes,
             .vpclmulqdq,
@@ -3766,6 +3808,8 @@ pub const cpu = struct {
             .sha,
             .shstk,
             .slow_3ops_lea,
+            .smap,
+            .smep,
             .tuning_fast_imm_vector_shift,
             .vaes,
             .vpclmulqdq,
@@ -3831,6 +3875,8 @@ pub const cpu = struct {
             .rdseed,
             .sahf,
             .sha,
+            .smap,
+            .smep,
             .tuning_fast_imm_vector_shift,
             .vaes,
             .vpclmulqdq,
@@ -3939,6 +3985,8 @@ pub const cpu = struct {
             .serialize,
             .sha,
             .shstk,
+            .smap,
+            .smep,
             .tsxldtrk,
             .tuning_fast_imm_vector_shift,
             .uintr,
@@ -4042,6 +4090,7 @@ pub const cpu = struct {
             .slow_lea,
             .slow_pmulld,
             .slow_two_mem_ops,
+            .smep,
             .sse4_2,
             .use_slm_arith_costs,
             .vzeroupper,
@@ -4098,6 +4147,8 @@ pub const cpu = struct {
             .rdseed,
             .sahf,
             .slow_3ops_lea,
+            .smap,
+            .smep,
             .tuning_fast_imm_vector_shift,
             .vzeroupper,
             .x87,
@@ -4150,6 +4201,8 @@ pub const cpu = struct {
             .rdseed,
             .sahf,
             .slow_3ops_lea,
+            .smap,
+            .smep,
             .vzeroupper,
             .x87,
             .xsavec,
@@ -4305,6 +4358,8 @@ pub const cpu = struct {
             .sahf,
             .sha,
             .shstk,
+            .smap,
+            .smep,
             .tuning_fast_imm_vector_shift,
             .vaes,
             .vpclmulqdq,
@@ -4574,6 +4629,8 @@ pub const cpu = struct {
             .sbb_dep_breaking,
             .sha,
             .slow_shld,
+            .smap,
+            .smep,
             .sse4a,
             .vzeroupper,
             .x87,
@@ -4629,6 +4686,8 @@ pub const cpu = struct {
             .sbb_dep_breaking,
             .sha,
             .slow_shld,
+            .smap,
+            .smep,
             .sse4a,
             .vzeroupper,
             .wbnoinvd,
@@ -4686,6 +4745,8 @@ pub const cpu = struct {
             .sbb_dep_breaking,
             .sha,
             .slow_shld,
+            .smap,
+            .smep,
             .sse4a,
             .vaes,
             .vpclmulqdq,
@@ -4757,6 +4818,8 @@ pub const cpu = struct {
             .sha,
             .shstk,
             .slow_shld,
+            .smap,
+            .smep,
             .sse4a,
             .vaes,
             .vpclmulqdq,
@@ -4833,6 +4896,8 @@ pub const cpu = struct {
             .sha,
             .shstk,
             .slow_shld,
+            .smap,
+            .smep,
             .sse4a,
             .vaes,
             .vpclmulqdq,
lib/std/zig/system/x86.zig
@@ -369,6 +369,7 @@ fn detectNativeFeatures(cpu: *Target.Cpu, os_tag: Target.Os.Tag) void {
         setFeature(cpu, .bmi, bit(leaf.ebx, 3));
         // AVX2 is only supported if we have the OS save support from AVX.
         setFeature(cpu, .avx2, bit(leaf.ebx, 5) and has_avx_save);
+        setFeature(cpu, .smep, bit(leaf.ebx, 7));
         setFeature(cpu, .bmi2, bit(leaf.ebx, 8));
         setFeature(cpu, .invpcid, bit(leaf.ebx, 10));
         setFeature(cpu, .rtm, bit(leaf.ebx, 11));
@@ -377,6 +378,7 @@ fn detectNativeFeatures(cpu: *Target.Cpu, os_tag: Target.Os.Tag) void {
         setFeature(cpu, .avx512dq, bit(leaf.ebx, 17) and has_avx512_save);
         setFeature(cpu, .rdseed, bit(leaf.ebx, 18));
         setFeature(cpu, .adx, bit(leaf.ebx, 19));
+        setFeature(cpu, .smap, bit(leaf.ebx, 20));
         setFeature(cpu, .avx512ifma, bit(leaf.ebx, 21) and has_avx512_save);
         setFeature(cpu, .clflushopt, bit(leaf.ebx, 23));
         setFeature(cpu, .clwb, bit(leaf.ebx, 24));
src/arch/x86_64/bits.zig
@@ -571,11 +571,15 @@ pub const Memory = struct {
             writer: anytype,
         ) @TypeOf(writer).Error!void {
             if (s == .none) return;
-            if (s != .ptr) {
-                try writer.writeAll(@tagName(s));
-                try writer.writeByte(' ');
+            try writer.writeAll(@tagName(s));
+            switch (s) {
+                .none => unreachable,
+                .ptr => {},
+                else => {
+                    try writer.writeByte(' ');
+                    try writer.writeAll("ptr");
+                },
             }
-            try writer.writeAll("ptr");
         }
     };
 
src/arch/x86_64/CodeGen.zig
@@ -1390,7 +1390,7 @@ fn asmOps(self: *CodeGen, tag: Mir.Inst.FixedTag, ops: [4]Operand) !void {
 
 /// A `cc` of `.z_and_np` clobbers `reg2`!
 fn asmCmovccRegisterRegister(self: *CodeGen, cc: Condition, reg1: Register, reg2: Register) !void {
-    _ = try self.addInst(.{
+    if (self.hasFeature(.cmov)) _ = try self.addInst(.{
         .tag = switch (cc) {
             else => .cmov,
             .z_and_np, .nz_or_p => .pseudo,
@@ -1408,12 +1408,16 @@ fn asmCmovccRegisterRegister(self: *CodeGen, cc: Condition, reg1: Register, reg2
             .r1 = reg1,
             .r2 = reg2,
         } },
-    });
+    }) else {
+        const reloc = try self.asmJccReloc(cc.negate(), undefined);
+        try self.asmRegisterRegister(.{ ._, .mov }, reg1, reg2);
+        self.performReloc(reloc);
+    }
 }
 
 /// A `cc` of `.z_and_np` is not supported by this encoding!
 fn asmCmovccRegisterMemory(self: *CodeGen, cc: Condition, reg: Register, m: Memory) !void {
-    _ = try self.addInst(.{
+    if (self.hasFeature(.cmov)) _ = try self.addInst(.{
         .tag = switch (cc) {
             else => .cmov,
             .z_and_np => unreachable,
@@ -1433,7 +1437,11 @@ fn asmCmovccRegisterMemory(self: *CodeGen, cc: Condition, reg: Register, m: Memo
             .r1 = reg,
             .payload = try self.addExtra(Mir.Memory.encode(m)),
         } },
-    });
+    }) else {
+        const reloc = try self.asmJccReloc(cc.negate(), undefined);
+        try self.asmRegisterMemory(.{ ._, .mov }, reg, m);
+        self.performReloc(reloc);
+    }
 }
 
 fn asmSetccRegister(self: *CodeGen, cc: Condition, reg: Register) !void {
@@ -2319,6 +2327,7 @@ fn genBodyBlock(self: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
 }
 
 fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
+    @setEvalBranchQuota(1_600);
     const pt = cg.pt;
     const zcu = pt.zcu;
     const ip = &zcu.intern_pool;
@@ -2354,9 +2363,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
         try cg.inst_tracking.ensureUnusedCapacity(cg.gpa, 1);
         switch (air_tags[@intFromEnum(inst)]) {
             // zig fmt: off
-            .not,
-            => |air_tag| try cg.airUnOp(inst, air_tag),
-
             .add,
             .add_wrap,
             .sub,
@@ -2434,7 +2440,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
             .memset_safe      => try cg.airMemset(inst, true),
             .set_union_tag    => try cg.airSetUnionTag(inst),
             .get_union_tag    => try cg.airGetUnionTag(inst),
-            .clz              => try cg.airClz(inst),
             .ctz              => try cg.airCtz(inst),
             .popcount         => try cg.airPopCount(inst),
             .byte_swap        => try cg.airByteSwap(inst),
@@ -2525,146 +2530,156 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                 })) {
                     else => unreachable,
                     inline .@"and", .@"or", .xor => |mir_tag| comptime &.{ .{
-                        .required_features = .{ .avx2, null },
+                        .required_features = .{ .avx2, null, null, null },
+                        .src_constraints = .{ .{ .int_or_vec = .yword }, .{ .int_or_vec = .yword } },
                         .patterns = &.{
-                            .{ .src = .{ .ymm, .mem } },
-                            .{ .src = .{ .mem, .ymm }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .ymm, .ymm } },
+                            .{ .src = .{ .to_ymm, .mem } },
+                            .{ .src = .{ .mem, .to_ymm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_ymm, .to_ymm } },
                         },
                         .dst_temps = .{.{ .rc = .sse }},
                         .each = .{ .once = &.{
                             .{ ._, .vp_, mir_tag, .dst0y, .src0y, .src1y, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .avx, null },
+                        .required_features = .{ .avx, null, null, null },
+                        .src_constraints = .{ .{ .int_or_vec = .yword }, .{ .int_or_vec = .yword } },
                         .patterns = &.{
-                            .{ .src = .{ .ymm, .mem } },
-                            .{ .src = .{ .mem, .ymm }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .ymm, .ymm } },
+                            .{ .src = .{ .to_ymm, .mem } },
+                            .{ .src = .{ .mem, .to_ymm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_ymm, .to_ymm } },
                         },
                         .dst_temps = .{.{ .rc = .sse }},
                         .each = .{ .once = &.{
                             .{ ._, .v_pd, mir_tag, .dst0y, .src0y, .src1y, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .avx, null },
+                        .required_features = .{ .avx, null, null, null },
+                        .src_constraints = .{ .{ .int_or_vec = .xword }, .{ .int_or_vec = .xword } },
                         .patterns = &.{
-                            .{ .src = .{ .xmm, .mem } },
-                            .{ .src = .{ .mem, .xmm }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .xmm, .xmm } },
+                            .{ .src = .{ .to_xmm, .mem } },
+                            .{ .src = .{ .mem, .to_xmm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_xmm, .to_xmm } },
                         },
                         .dst_temps = .{.{ .rc = .sse }},
                         .each = .{ .once = &.{
                             .{ ._, .vp_, mir_tag, .dst0x, .src0x, .src1x, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .sse2, null },
+                        .required_features = .{ .sse2, null, null, null },
+                        .src_constraints = .{ .{ .int_or_vec = .xword }, .{ .int_or_vec = .xword } },
                         .patterns = &.{
-                            .{ .src = .{ .mut_xmm, .mem } },
-                            .{ .src = .{ .mem, .mut_xmm }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_xmm, .xmm } },
+                            .{ .src = .{ .to_mut_xmm, .mem } },
+                            .{ .src = .{ .mem, .to_mut_xmm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_mut_xmm, .to_xmm } },
                         },
                         .dst_temps = .{.{ .ref = .src0 }},
                         .each = .{ .once = &.{
                             .{ ._, .p_, mir_tag, .dst0x, .src1x, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .sse, null },
+                        .required_features = .{ .sse, null, null, null },
+                        .src_constraints = .{ .{ .int_or_vec = .xword }, .{ .int_or_vec = .xword } },
                         .patterns = &.{
-                            .{ .src = .{ .mut_xmm, .mem } },
-                            .{ .src = .{ .mem, .mut_xmm }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_xmm, .xmm } },
+                            .{ .src = .{ .to_mut_xmm, .mem } },
+                            .{ .src = .{ .mem, .to_mut_xmm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_mut_xmm, .to_xmm } },
                         },
                         .dst_temps = .{.{ .ref = .src0 }},
                         .each = .{ .once = &.{
                             .{ ._, ._ps, mir_tag, .dst0x, .src1x, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .mmx, null },
+                        .required_features = .{ .mmx, null, null, null },
+                        .src_constraints = .{ .{ .int_or_vec = .qword }, .{ .int_or_vec = .qword } },
                         .patterns = &.{
-                            .{ .src = .{ .mut_mm, .mem } },
-                            .{ .src = .{ .mem, .mut_mm }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_mm, .mm } },
+                            .{ .src = .{ .to_mut_mm, .mem } },
+                            .{ .src = .{ .mem, .to_mut_mm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_mut_mm, .to_mm } },
                         },
                         .dst_temps = .{.{ .ref = .src0 }},
                         .each = .{ .once = &.{
                             .{ ._, .p_, mir_tag, .dst0q, .src1q, ._, ._ },
                         } },
                     }, .{
-                        .src_constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
+                        .src_constraints = .{ .{ .int_or_vec = .byte }, .{ .int_or_vec = .byte } },
                         .patterns = &.{
                             .{ .src = .{ .mut_mem, .imm8 } },
                             .{ .src = .{ .imm8, .mut_mem }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_gpr, .imm8 } },
-                            .{ .src = .{ .imm8, .mut_gpr }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_mem, .gpr } },
-                            .{ .src = .{ .gpr, .mut_mem }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_gpr, .mem } },
-                            .{ .src = .{ .mem, .mut_gpr }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_gpr, .gpr } },
+                            .{ .src = .{ .to_mut_gpr, .imm8 } },
+                            .{ .src = .{ .imm8, .to_mut_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_mem, .to_gpr } },
+                            .{ .src = .{ .to_gpr, .mut_mem }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_mut_gpr, .mem } },
+                            .{ .src = .{ .mem, .to_mut_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_mut_gpr, .to_gpr } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .dst_temps = .{.{ .ref = .src0 }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, mir_tag, .dst0b, .src1b, ._, ._ },
                         } },
                     }, .{
-                        .src_constraints = .{ .{ .int = .word }, .{ .int = .word } },
+                        .src_constraints = .{ .{ .int_or_vec = .word }, .{ .int_or_vec = .word } },
                         .patterns = &.{
                             .{ .src = .{ .mut_mem, .imm16 } },
                             .{ .src = .{ .imm16, .mut_mem }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_gpr, .imm16 } },
-                            .{ .src = .{ .imm16, .mut_gpr }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_mem, .gpr } },
-                            .{ .src = .{ .gpr, .mut_mem }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_gpr, .mem } },
-                            .{ .src = .{ .mem, .mut_gpr }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_gpr, .gpr } },
+                            .{ .src = .{ .to_mut_gpr, .imm16 } },
+                            .{ .src = .{ .imm16, .to_mut_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_mem, .to_gpr } },
+                            .{ .src = .{ .to_gpr, .mut_mem }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_mut_gpr, .mem } },
+                            .{ .src = .{ .mem, .to_mut_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_mut_gpr, .to_gpr } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .dst_temps = .{.{ .ref = .src0 }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, mir_tag, .dst0w, .src1w, ._, ._ },
                         } },
                     }, .{
-                        .src_constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
+                        .src_constraints = .{ .{ .int_or_vec = .dword }, .{ .int_or_vec = .dword } },
                         .patterns = &.{
                             .{ .src = .{ .mut_mem, .imm32 } },
                             .{ .src = .{ .imm32, .mut_mem }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_gpr, .imm32 } },
-                            .{ .src = .{ .imm32, .mut_gpr }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_mem, .gpr } },
-                            .{ .src = .{ .gpr, .mut_mem }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_gpr, .mem } },
-                            .{ .src = .{ .mem, .mut_gpr }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_gpr, .gpr } },
+                            .{ .src = .{ .to_mut_gpr, .imm32 } },
+                            .{ .src = .{ .imm32, .to_mut_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_mem, .to_gpr } },
+                            .{ .src = .{ .to_gpr, .mut_mem }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_mut_gpr, .mem } },
+                            .{ .src = .{ .mem, .to_mut_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_mut_gpr, .to_gpr } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .dst_temps = .{.{ .ref = .src0 }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, mir_tag, .dst0d, .src1d, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .@"64bit", null },
-                        .src_constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
+                        .required_features = .{ .@"64bit", null, null, null },
+                        .src_constraints = .{ .{ .int_or_vec = .qword }, .{ .int_or_vec = .qword } },
                         .patterns = &.{
                             .{ .src = .{ .mut_mem, .simm32 } },
                             .{ .src = .{ .simm32, .mut_mem }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_gpr, .simm32 } },
-                            .{ .src = .{ .simm32, .mut_gpr }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_mem, .gpr } },
-                            .{ .src = .{ .gpr, .mut_mem }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_gpr, .mem } },
-                            .{ .src = .{ .mem, .mut_gpr }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_gpr, .gpr } },
+                            .{ .src = .{ .to_mut_gpr, .simm32 } },
+                            .{ .src = .{ .simm32, .to_mut_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mut_mem, .to_gpr } },
+                            .{ .src = .{ .to_gpr, .mut_mem }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_mut_gpr, .mem } },
+                            .{ .src = .{ .mem, .to_mut_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_mut_gpr, .to_gpr } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .dst_temps = .{.{ .ref = .src0 }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, mir_tag, .dst0q, .src1q, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .avx2, null },
+                        .required_features = .{ .avx2, null, null, null },
+                        .src_constraints = .{
+                            .{ .exact_remainder_int_or_vec = .{ .of = .yword, .is = .yword } },
+                            .{ .exact_remainder_int_or_vec = .{ .of = .yword, .is = .yword } },
+                        },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .to_mem } },
                         },
@@ -2677,16 +2692,21 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{.mem},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
-                            .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                             .{ .@"0:", .v_dqu, .mov, .tmp1y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
                             .{ ._, .vp_, mir_tag, .tmp1y, .tmp1y, .memia(.src1y, .tmp0, .add_size), ._ },
                             .{ ._, .v_dqu, .mov, .memia(.dst0y, .tmp0, .add_size), .tmp1y, ._, ._ },
-                            .{ ._, ._, .add, .tmp0p, .i(32), ._, ._ },
+                            .{ ._, ._, .add, .tmp0p, .si(32), ._, ._ },
                             .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .avx, null },
+                        .required_features = .{ .avx, null, null, null },
+                        .src_constraints = .{
+                            .{ .exact_remainder_int_or_vec = .{ .of = .yword, .is = .yword } },
+                            .{ .exact_remainder_int_or_vec = .{ .of = .yword, .is = .yword } },
+                        },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .to_mem } },
                         },
@@ -2699,16 +2719,21 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{.mem},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
-                            .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                             .{ .@"0:", .v_pd, .movu, .tmp1y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
                             .{ ._, .v_pd, mir_tag, .tmp1y, .tmp1y, .memia(.src1y, .tmp0, .add_size), ._ },
                             .{ ._, .v_pd, .movu, .memia(.dst0y, .tmp0, .add_size), .tmp1y, ._, ._ },
-                            .{ ._, ._, .add, .tmp0p, .i(32), ._, ._ },
+                            .{ ._, ._, .add, .tmp0p, .si(32), ._, ._ },
                             .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .avx, null },
+                        .required_features = .{ .avx, null, null, null },
+                        .src_constraints = .{
+                            .{ .exact_remainder_int_or_vec = .{ .of = .xword, .is = .xword } },
+                            .{ .exact_remainder_int_or_vec = .{ .of = .xword, .is = .xword } },
+                        },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .to_mem } },
                         },
@@ -2721,16 +2746,21 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{.mem},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
-                            .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                             .{ .@"0:", .v_dqu, .mov, .tmp1x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
                             .{ ._, .vp_, mir_tag, .tmp1x, .tmp1x, .memia(.src1x, .tmp0, .add_size), ._ },
                             .{ ._, .v_dqu, .mov, .memia(.dst0x, .tmp0, .add_size), .tmp1x, ._, ._ },
-                            .{ ._, ._, .add, .tmp0p, .i(16), ._, ._ },
+                            .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
                             .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .sse2, null },
+                        .required_features = .{ .sse2, null, null, null },
+                        .src_constraints = .{
+                            .{ .exact_remainder_int_or_vec = .{ .of = .xword, .is = .xword } },
+                            .{ .exact_remainder_int_or_vec = .{ .of = .xword, .is = .xword } },
+                        },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .to_mem } },
                         },
@@ -2743,16 +2773,21 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{.mem},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
-                            .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                             .{ .@"0:", ._dqu, .mov, .tmp1x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
                             .{ ._, .p_, mir_tag, .tmp1x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
                             .{ ._, ._dqu, .mov, .memia(.dst0x, .tmp0, .add_size), .tmp1x, ._, ._ },
-                            .{ ._, ._, .add, .tmp0p, .i(16), ._, ._ },
+                            .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
                             .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .sse, null },
+                        .required_features = .{ .sse, null, null, null },
+                        .src_constraints = .{
+                            .{ .exact_remainder_int_or_vec = .{ .of = .xword, .is = .xword } },
+                            .{ .exact_remainder_int_or_vec = .{ .of = .xword, .is = .xword } },
+                        },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .to_mem } },
                         },
@@ -2765,16 +2800,21 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{.mem},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
-                            .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                             .{ .@"0:", ._ps, .movu, .tmp1x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
                             .{ ._, ._ps, mir_tag, .tmp1x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
                             .{ ._, ._ps, .movu, .memia(.dst0x, .tmp0, .add_size), .tmp1x, ._, ._ },
-                            .{ ._, ._, .add, .tmp0p, .i(16), ._, ._ },
+                            .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
                             .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .mmx, null },
+                        .required_features = .{ .mmx, null, null, null },
+                        .src_constraints = .{
+                            .{ .exact_remainder_int_or_vec = .{ .of = .qword, .is = .qword } },
+                            .{ .exact_remainder_int_or_vec = .{ .of = .qword, .is = .qword } },
+                        },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .to_mem } },
                         },
@@ -2787,15 +2827,20 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{.mem},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
-                            .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                             .{ .@"0:", ._q, .mov, .tmp1q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
                             .{ ._, .p_, mir_tag, .tmp1q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
                             .{ ._, ._q, .mov, .memia(.dst0q, .tmp0, .add_size), .tmp1q, ._, ._ },
-                            .{ ._, ._, .add, .tmp0p, .i(8), ._, ._ },
+                            .{ ._, ._, .add, .tmp0p, .si(8), ._, ._ },
                             .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                         } },
                     }, .{
+                        .src_constraints = .{
+                            .{ .exact_remainder_int_or_vec = .{ .of = .qword, .is = .qword } },
+                            .{ .exact_remainder_int_or_vec = .{ .of = .qword, .is = .qword } },
+                        },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .to_mem } },
                         },
@@ -2808,16 +2853,17 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{.mem},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
-                            .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                             .{ .@"0:", ._, .mov, .tmp1p, .memia(.src0p, .tmp0, .add_size), ._, ._ },
                             .{ ._, ._, mir_tag, .tmp1p, .memia(.src1p, .tmp0, .add_size), ._, ._ },
                             .{ ._, ._, .mov, .memia(.dst0p, .tmp0, .add_size), .tmp1p, ._, ._ },
-                            .{ ._, ._, .add, .tmp0p, .a(.tmp1, .add_size), ._, ._ },
+                            .{ ._, ._, .add, .tmp0p, .sa(.tmp1, .add_size), ._, ._ },
                             .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                         } },
                     } },
-                }) catch |err2| switch (err2) {
+                }) catch |err| switch (err) {
                     error.SelectFailed => return cg.fail("failed to select {s} {} {} {}", .{
                         @tagName(air_tag),
                         cg.typeOf(bin_op.lhs).fmt(pt),
@@ -2830,6 +2876,1058 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                 if (ops[1].index != res[0].index) try ops[1].die(cg);
                 try res[0].moveTo(inst, cg);
             },
+            .not => |air_tag| if (use_old) try cg.airUnOp(inst, air_tag) else {
+                const ty_op = air_datas[@intFromEnum(inst)].ty_op;
+                var ops = try cg.tempsFromOperands(inst, .{ty_op.operand});
+                var res: [1]Temp = undefined;
+                cg.select(&res, &.{cg.typeOfIndex(inst)}, &ops, comptime &.{ .{
+                    .src_constraints = .{ .{ .signed_or_exact_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .none } },
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .not, .dst0b, ._, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .unsigned_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .none } },
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .xor, .dst0b, .sa(.src0, .add_umax), ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .signed_or_exact_int = .word }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .none } },
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .not, .dst0w, ._, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .unsigned_int = .word }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .none } },
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .xor, .dst0w, .sa(.src0, .add_umax), ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .signed_or_exact_int = .dword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .none } },
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .not, .dst0d, ._, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .unsigned_int = .dword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .none } },
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .xor, .dst0d, .sa(.src0, .add_umax), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .signed_or_exact_int = .qword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .none } },
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .not, .dst0q, ._, ._, ._ },
+                    } },
+                }, .{
+                    // `not` of an unsigned integer selected by the `.qword` constraint: the
+                    // result is built in a fresh general purpose register by loading a mask
+                    // (`add_umax` of src0 - presumably the all-ones value of the operand type,
+                    // TODO confirm) and XOR-ing the source into it.
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .unsigned_int = .qword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    // `xor` modifies EFLAGS, so it must be declared here just like in the
+                    // unsigned byte/word/dword cases of this prong.
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .dst0q, .ua(.src0, .add_umax), ._, ._ },
+                        .{ ._, ._, .xor, .dst0q, .src0q, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .mmx, null, null, null },
+                    .src_constraints = .{ .{ .signed_or_exact_int = .qword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_mm, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .mmx }},
+                    .each = .{ .once = &.{
+                        .{ ._, .p_d, .cmpeq, .dst0q, .dst0q, ._, ._ },
+                        .{ ._, .p_, .xor, .dst0q, .src0q, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .mmx, null, null, null },
+                    .src_constraints = .{ .{ .unsigned_int = .qword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_mm, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, .p_, .xor, .dst0q, .lea(.qword, .tmp0), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .{ .signed_or_exact_int = .xword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_xmm, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .sse }},
+                    .each = .{ .once = &.{
+                        .{ ._, .vp_q, .cmpeq, .dst0x, .dst0x, .dst0x, ._ },
+                        .{ ._, .vp_, .xor, .dst0x, .dst0x, .src0x, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .{ .unsigned_int = .xword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_xmm, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .sse }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, .vp_, .xor, .dst0x, .src0x, .lea(.xword, .tmp0), ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse2, null, null, null },
+                    .src_constraints = .{ .{ .signed_or_exact_int = .xword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_xmm, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .sse }},
+                    .each = .{ .once = &.{
+                        .{ ._, .p_d, .cmpeq, .dst0x, .dst0x, ._, ._ },
+                        .{ ._, .p_, .xor, .dst0x, .src0x, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse2, null, null, null },
+                    .src_constraints = .{ .{ .unsigned_int = .xword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_xmm, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, .p_, .xor, .dst0x, .lea(.xword, .tmp0), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{ .{ .int = .xword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_xmm, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, ._ps, .xor, .dst0x, .lea(.xword, .tmp0), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx2, null, null, null },
+                    .src_constraints = .{ .{ .signed_or_exact_int = .yword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_ymm, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .sse }},
+                    .each = .{ .once = &.{
+                        .{ ._, .vp_q, .cmpeq, .dst0y, .dst0y, .dst0y, ._ },
+                        .{ ._, .vp_, .xor, .dst0y, .dst0y, .src0y, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx2, null, null, null },
+                    .src_constraints = .{ .{ .unsigned_int = .yword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_ymm, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .sse }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, .vp_, .xor, .dst0y, .src0y, .lea(.yword, .tmp0), ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .{ .signed_or_exact_int = .yword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_ymm, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .sse }},
+                    .each = .{ .once = &.{
+                        .{ ._, .v_pd, .cmp, .dst0y, .dst0y, .dst0y, .si(0b01111) },
+                        .{ ._, .v_pd, .xor, .dst0y, .dst0y, .src0y, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .{ .unsigned_int = .yword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_ymm, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .sse }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, .v_pd, .xor, .dst0y, .src0y, .lea(.yword, .tmp0), ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx2, null, null, null },
+                    .src_constraints = .{ .{ .signed_or_exact_remainder_int = .{ .of = .yword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .rc = .sse } },
+                        .{ .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sia(16, .src0, .sub_size), ._, ._ },
+                        .{ ._, .vp_q, .cmpeq, .tmp1y, .tmp1y, .tmp1y, ._ },
+                        .{ .@"0:", .vp_, .xor, .tmp2y, .tmp1y, .memiad(.src0y, .tmp0, .add_size, -16), ._ },
+                        .{ ._, .v_dqu, .mov, .memiad(.dst0y, .tmp0, .add_size, -16), .tmp2y, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(32), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ .@"0:", .vp_, .xor, .tmp2x, .tmp1x, .memad(.src0x, .add_size, -16), ._ },
+                        .{ ._, .v_dqa, .mov, .memad(.dst0x, .add_size, -16), .tmp2x, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx2, null, null, null },
+                    .src_constraints = .{ .{ .signed_or_exact_remainder_int = .{ .of = .yword, .is = .yword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .rc = .sse } },
+                        .{ .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                        .{ ._, .vp_q, .cmpeq, .tmp1y, .tmp1y, .tmp1y, ._ },
+                        .{ .@"0:", .vp_, .xor, .tmp2y, .tmp1y, .memia(.src0y, .tmp0, .add_size), ._ },
+                        .{ ._, .v_dqu, .mov, .memia(.dst0y, .tmp0, .add_size), .tmp2y, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(32), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .{ .signed_or_exact_remainder_int = .{ .of = .yword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .rc = .sse } },
+                        .{ .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sia(16, .src0, .sub_size), ._, ._ },
+                        .{ ._, .v_pd, .cmp, .tmp1y, .tmp1y, .tmp1y, .si(0b01111) },
+                        .{ .@"0:", .v_pd, .xor, .tmp2y, .tmp1y, .memiad(.src0y, .tmp0, .add_size, -16), ._ },
+                        .{ ._, .v_pd, .movu, .memiad(.dst0y, .tmp0, .add_size, -16), .tmp2y, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(32), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ .@"0:", .v_pd, .xor, .tmp2x, .tmp1x, .memad(.src0x, .add_size, -16), ._ },
+                        .{ ._, .v_pd, .mova, .memad(.dst0x, .add_size, -16), .tmp2x, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .{ .signed_or_exact_remainder_int = .{ .of = .yword, .is = .yword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .rc = .sse } },
+                        .{ .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                        .{ ._, .v_pd, .cmp, .tmp1y, .tmp1y, .tmp1y, .si(0b01111) },
+                        .{ .@"0:", .v_pd, .xor, .tmp2y, .tmp1y, .memia(.src0y, .tmp0, .add_size), ._ },
+                        .{ ._, .v_pd, .movu, .memia(.dst0y, .tmp0, .add_size), .tmp2y, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(32), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    // AVX `not` for in-memory integers matched by the xword/xword remainder
+                    // constraint: the value is inverted 16 bytes at a time by XOR-ing each
+                    // chunk with an all-ones vector.
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .{ .signed_or_exact_remainder_int = .{ .of = .xword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        // tmp0: loop offset, tmp1: all-ones mask, tmp2: per-chunk scratch.
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .rc = .sse } },
+                        .{ .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    // The `add` that advances the loop offset writes EFLAGS.
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        // NOTE(review): `.sa(.src0, .sub_size)` presumably yields the negated
+                        // size of src0, so that the offset counts up towards zero - confirm.
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                        // Comparing a register with itself for equality sets every bit.
+                        .{ ._, .vp_q, .cmpeq, .tmp1x, .tmp1x, .tmp1x, ._ },
+                        // Integer vector XOR is `vpxor` (`.vp_` + `.xor`), matching the other
+                        // AVX integer cases of this prong; there is no `vxor` instruction.
+                        .{ .@"0:", .vp_, .xor, .tmp2x, .tmp1x, .memia(.src0x, .tmp0, .add_size), ._ },
+                        .{ ._, .v_dqa, .mov, .memia(.dst0x, .tmp0, .add_size), .tmp2x, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
+                        // Keep looping until the offset addition carries.
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse2, null, null, null },
+                    .src_constraints = .{ .{ .signed_or_exact_remainder_int = .{ .of = .xword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .rc = .sse } },
+                        .{ .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                        .{ ._, .p_d, .cmpeq, .tmp1x, .tmp1x, ._, ._ },
+                        .{ .@"0:", ._dqa, .mov, .tmp2x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, .p_, .xor, .tmp2x, .tmp1x, ._, ._ },
+                        .{ ._, ._dqa, .mov, .memia(.dst0x, .tmp0, .add_size), .tmp2x, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .signed_or_exact_remainder_int = .{ .of = .xword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._, .not, .memia(.dst0q, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, ._, .not, .memiad(.dst0q, .tmp0, .add_size, 8), ._, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .signed_or_exact_remainder_int = .{ .of = .xword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp1q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .not, .tmp1q, ._, ._, ._ },
+                        .{ ._, ._, .mov, .memia(.dst0q, .tmp0, .add_size), .tmp1q, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .exact_remainder_int = .{ .of = .xword, .is = .dword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sia(16, .src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._, .not, .memiad(.dst0q, .tmp0, .add_size, -16), ._, ._, ._ },
+                        .{ ._, ._, .not, .memiad(.dst0q, .tmp0, .add_size, -16 + 8), ._, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._, .not, .memad(.dst0d, .add_size, -16), ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .exact_remainder_int = .{ .of = .xword, .is = .dword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sia(16, .src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp1q, .memiad(.src0q, .tmp0, .add_size, -16), ._, ._ },
+                        .{ ._, ._, .not, .tmp1q, ._, ._, ._ },
+                        .{ ._, ._, .mov, .memiad(.dst0q, .tmp0, .add_size, -16), .tmp1q, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._, .mov, .tmp0d, .memad(.src0d, .add_size, -16), ._, ._ },
+                        .{ ._, ._, .not, .tmp0d, ._, ._, ._ },
+                        .{ ._, ._, .mov, .memad(.dst0d, .add_size, -16), .tmp0d, ._, ._ },
+                        .{ ._, ._, .mov, .memad(.dst0d, .add_size, -16 + 4), .si(0), ._, ._ },
+                        .{ ._, ._, .mov, .memad(.dst0q, .add_size, -16 + 8), .si(0), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .exact_remainder_int = .{ .of = .qword, .is = .qword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sia(16, .src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._, .not, .memiad(.dst0q, .tmp0, .add_size, -16), ._, ._, ._ },
+                        .{ ._, ._, .not, .memiad(.dst0q, .tmp0, .add_size, -16 + 8), ._, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._, .not, .memad(.dst0q, .add_size, -16), ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .exact_remainder_int = .{ .of = .qword, .is = .qword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sia(8, .src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp1q, .memiad(.src0q, .tmp0, .add_size, -8), ._, ._ },
+                        .{ ._, ._, .not, .tmp1q, ._, ._, ._ },
+                        .{ ._, ._, .mov, .memiad(.dst0q, .tmp0, .add_size, -8), .tmp1q, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._, .mov, .memad(.dst0q, .add_size, -8), .si(0), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .exact_remainder_int = .{ .of = .dword, .is = .dword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sia(8, .src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._, .not, .memiad(.dst0q, .tmp0, .add_size, -8), ._, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._, .not, .memad(.dst0d, .add_size, -8), ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .exact_remainder_int = .{ .of = .dword, .is = .dword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sia(8, .src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp1q, .memiad(.src0q, .tmp0, .add_size, -8), ._, ._ },
+                        .{ ._, ._, .not, .tmp1q, ._, ._, ._ },
+                        .{ ._, ._, .mov, .memiad(.dst0q, .tmp0, .add_size, -8), .tmp1q, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._, .mov, .tmp0d, .memad(.src0d, .add_size, -8), ._, ._ },
+                        .{ ._, ._, .not, .tmp0d, ._, ._, ._ },
+                        .{ ._, ._, .mov, .memad(.dst0d, .add_size, -8), .tmp0d, ._, ._ },
+                        .{ ._, ._, .mov, .memad(.dst0d, .add_size, -8 + 4), .si(0), ._, ._ },
+                    } },
+                }, .{
+                    // In-place bitwise NOT of an integer held in mutable memory: the result
+                    // reuses the source storage (`.dst_temps` refers to `.src0`).
+                    // NOTE(review): `.remainder_int = .{ .of = .xword, .is = .dword }` presumably
+                    // selects integers whose used bits in the last 16-byte chunk fit in a dword;
+                    // confirm against the constraint definition.
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .remainder_int = .{ .of = .xword, .is = .dword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        // tmp0: signed byte index that counts up towards zero.
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    // `add`/`xor` write EFLAGS and the loop branches on the carry flag.
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        // NOTE(review): `.sia(16, .src0, .sub_size)` presumably evaluates to
+                        // `16 - size`, so the index is negative for operands larger than 16 bytes.
+                        // The loop body always runs at least once, so this entry appears to rely on
+                        // smaller operands being matched by earlier entries; confirm.
+                        .{ ._, ._, .mov, .tmp0p, .sia(16, .src0, .sub_size), ._, ._ },
+                        // Invert two qwords per iteration.
+                        .{ .@"0:", ._, .not, .memiad(.dst0q, .tmp0, .add_size, -16), ._, ._, ._ },
+                        .{ ._, ._, .not, .memiad(.dst0q, .tmp0, .add_size, -16 + 8), ._, ._, ._ },
+                        // Advance by 16 bytes; repeat until the addition carries out.
+                        .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        // Tail: XOR a single dword with an immediate instead of a full NOT, so only
+                        // the masked bits flip. NOTE(review): `.add_umax` presumably yields the
+                        // all-ones mask of the used bits in that dword; confirm.
+                        .{ ._, ._, .xor, .memad(.dst0d, .add_size, -16), .sa(.src0, .add_umax), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .remainder_int = .{ .of = .xword, .is = .dword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sia(16, .src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp1q, .memiad(.src0q, .tmp0, .add_size, -16), ._, ._ },
+                        .{ ._, ._, .not, .tmp1q, ._, ._, ._ },
+                        .{ ._, ._, .mov, .memiad(.dst0q, .tmp0, .add_size, -16), .tmp1q, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._, .mov, .tmp0d, .memad(.src0d, .add_size, -16), ._, ._ },
+                        .{ ._, ._, .xor, .tmp0d, .sa(.src0, .add_umax), ._, ._ },
+                        .{ ._, ._, .mov, .memad(.dst0d, .add_size, -16), .tmp0d, ._, ._ },
+                        .{ ._, ._, .mov, .memad(.dst0d, .add_size, -16 + 4), .si(0), ._, ._ },
+                        .{ ._, ._, .mov, .memad(.dst0q, .add_size, -16 + 8), .si(0), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .remainder_int = .{ .of = .qword, .is = .dword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sia(8, .src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._, .not, .memiad(.dst0q, .tmp0, .add_size, -8), ._, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._, .xor, .memad(.dst0d, .add_size, -8), .sa(.src0, .add_umax), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .remainder_int = .{ .of = .qword, .is = .dword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sia(8, .src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp1q, .memiad(.src0q, .tmp0, .add_size, -8), ._, ._ },
+                        .{ ._, ._, .not, .tmp1q, ._, ._, ._ },
+                        .{ ._, ._, .mov, .memiad(.dst0q, .tmp0, .add_size, -8), .tmp1q, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._, .mov, .tmp0d, .memad(.src0d, .add_size, -8), ._, ._ },
+                        .{ ._, ._, .xor, .tmp0d, .sa(.src0, .add_umax), ._, ._ },
+                        .{ ._, ._, .mov, .memad(.dst0d, .add_size, -8), .tmp0d, ._, ._ },
+                        .{ ._, ._, .mov, .memad(.dst0d, .add_size, -8 + 4), .si(0), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .remainder_int = .{ .of = .xword, .is = .qword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sia(16, .src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._, .not, .memiad(.dst0q, .tmp0, .add_size, -16), ._, ._, ._ },
+                        .{ ._, ._, .not, .memiad(.dst0q, .tmp0, .add_size, -16 + 8), ._, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._, .mov, .tmp0q, .ua(.src0, .add_umax), ._, ._ },
+                        .{ ._, ._, .xor, .memad(.dst0q, .add_size, -16), .tmp0q, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .remainder_int = .{ .of = .xword, .is = .qword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sia(16, .src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp1q, .memiad(.src0q, .tmp0, .add_size, -16), ._, ._ },
+                        .{ ._, ._, .not, .tmp1q, ._, ._, ._ },
+                        .{ ._, ._, .mov, .memiad(.dst0q, .tmp0, .add_size, -16), .tmp1q, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._, .mov, .tmp0q, .ua(.src0, .add_umax), ._, ._ },
+                        .{ ._, ._, .xor, .tmp0q, .memad(.src0q, .add_size, -16), ._, ._ },
+                        .{ ._, ._, .mov, .memad(.dst0q, .add_size, -16), .tmp0q, ._, ._ },
+                        .{ ._, ._, .mov, .memad(.dst0q, .add_size, -8), .si(0), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .remainder_int = .{ .of = .xword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sia(8, .src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._, .not, .memiad(.dst0q, .tmp0, .add_size, -8), ._, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._, .mov, .tmp0q, .ua(.src0, .add_umax), ._, ._ },
+                        .{ ._, ._, .xor, .memad(.dst0q, .add_size, -8), .tmp0q, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .remainder_int = .{ .of = .xword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sia(8, .src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp1q, .memiad(.src0q, .tmp0, .add_size, -8), ._, ._ },
+                        .{ ._, ._, .not, .tmp1q, ._, ._, ._ },
+                        .{ ._, ._, .mov, .memiad(.dst0q, .tmp0, .add_size, -8), .tmp1q, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._, .mov, .tmp0q, .ua(.src0, .add_umax), ._, ._ },
+                        .{ ._, ._, .xor, .tmp0q, .memad(.src0q, .add_size, -8), ._, ._ },
+                        .{ ._, ._, .mov, .memad(.dst0q, .add_size, -8), .tmp0q, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .mmx, null, null, null },
+                    .src_constraints = .{ .{ .signed_int_or_full_vec = .qword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_mm, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .mmx }},
+                    .each = .{ .once = &.{
+                        .{ ._, .p_d, .cmpeq, .dst0q, .dst0q, ._, ._ },
+                        .{ ._, .p_, .xor, .dst0q, .src0q, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .mmx, null, null, null },
+                    .src_constraints = .{ .{ .unsigned_int_vec = .qword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_mm, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, .p_, .xor, .dst0q, .lea(.qword, .tmp0), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .{ .signed_int_or_full_vec = .xword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_xmm, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .sse }},
+                    .each = .{ .once = &.{
+                        .{ ._, .vp_q, .cmpeq, .dst0x, .dst0x, .dst0x, ._ },
+                        .{ ._, .vp_, .xor, .dst0x, .dst0x, .src0x, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .{ .unsigned_int_vec = .xword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_xmm, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .sse }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, .vp_, .xor, .dst0x, .src0x, .lea(.xword, .tmp0), ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse2, null, null, null },
+                    .src_constraints = .{ .{ .signed_int_or_full_vec = .xword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_xmm, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .sse }},
+                    .each = .{ .once = &.{
+                        .{ ._, .p_d, .cmpeq, .dst0x, .dst0x, ._, ._ },
+                        .{ ._, .p_, .xor, .dst0x, .src0x, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse2, null, null, null },
+                    .src_constraints = .{ .{ .unsigned_int_vec = .xword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_xmm, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, .p_, .xor, .dst0x, .lea(.xword, .tmp0), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{ .{ .vec = .xword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_xmm, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, ._ps, .xor, .dst0x, .lea(.xword, .tmp0), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx2, null, null, null },
+                    .src_constraints = .{ .{ .signed_int_or_full_vec = .yword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_ymm, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .sse }},
+                    .each = .{ .once = &.{
+                        .{ ._, .vp_q, .cmpeq, .dst0y, .dst0y, .dst0y, ._ },
+                        .{ ._, .vp_, .xor, .dst0y, .dst0y, .src0y, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx2, null, null, null },
+                    .src_constraints = .{ .{ .unsigned_int_vec = .yword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_ymm, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .sse }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, .vp_, .xor, .dst0y, .src0y, .lea(.yword, .tmp0), ._ },
+                    } },
+                }, .{
+                    // 256-bit NOT using only AVX: the all-ones mask is built with a packed-double
+                    // compare and applied with a packed-double XOR, since 256-bit integer
+                    // compare/XOR instructions require AVX2 (see the preceding `.avx2` entry).
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .{ .signed_int_or_full_vec = .yword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_ymm, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .sse }},
+                    .each = .{ .once = &.{
+                        // Comparison predicate 0b01111 (0x0F, TRUE_UQ) is true for every lane
+                        // regardless of the inputs, so dst becomes all ones.
+                        .{ ._, .v_pd, .cmp, .dst0y, .dst0y, .dst0y, .si(0b01111) },
+                        // dst = all-ones ^ src, i.e. the bitwise complement of src.
+                        .{ ._, .v_pd, .xor, .dst0y, .dst0y, .src0y, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .{ .unsigned_int_vec = .yword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_ymm, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .sse }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, .v_pd, .xor, .dst0y, .src0y, .lea(.yword, .tmp0), ._ },
+                    } },
+                }, .{
+                    // Generic 64-bit fallback (no `.src_constraints`): the result is written to a
+                    // fresh memory destination as `src ^ mask`, one qword per iteration, where the
+                    // mask comes from the `.umax_mem` temporary derived from `.src0`.
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        // tmp0: signed byte index that counts up towards zero.
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        // tmp1: address of the mask constant (tmp3).
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        // tmp2: scratch qword for the current limb.
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        // tmp3: in-memory mask constant.
+                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    // `xor` and `add` overwrite EFLAGS and the loop branches on the carry flag,
+                    // so the flags must be declared clobbered like in the other integer loops.
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_src0_size), ._, ._ },
+                        .{ ._, ._, .lea, .tmp1p, .mem(.tmp3), ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp2q, .memia(.src0q, .tmp0, .add_src0_size), ._, ._ },
+                        .{ ._, ._, .xor, .tmp2q, .leaia(.qword, .tmp1, .tmp0, .add_src0_size), ._, ._ },
+                        .{ ._, ._, .mov, .memia(.dst0q, .tmp0, .add_src0_size), .tmp2q, ._, ._ },
+                        // Advance by one qword; repeat until the addition carries out.
+                        .{ ._, ._, .add, .tmp0p, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    // Generic fallback with no feature requirement: same scheme as the 64-bit
+                    // fallback, but processes the operand one dword per iteration.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        // tmp0: signed byte index that counts up towards zero.
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        // tmp1: address of the mask constant (tmp3).
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        // tmp2: scratch dword for the current limb.
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        // tmp3: in-memory mask constant.
+                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    // `xor` and `add` overwrite EFLAGS and the loop branches on the carry flag,
+                    // so the flags must be declared clobbered like in the other integer loops.
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_src0_size), ._, ._ },
+                        .{ ._, ._, .lea, .tmp1p, .mem(.tmp3), ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp2d, .memia(.src0d, .tmp0, .add_src0_size), ._, ._ },
+                        .{ ._, ._, .xor, .tmp2d, .leaia(.dword, .tmp1, .tmp0, .add_src0_size), ._, ._ },
+                        .{ ._, ._, .mov, .memia(.dst0d, .tmp0, .add_src0_size), .tmp2d, ._, ._ },
+                        // Advance by one dword; repeat until the addition carries out.
+                        .{ ._, ._, .add, .tmp0p, .si(4), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                } }) catch |err| switch (err) {
+                    error.SelectFailed => return cg.fail("failed to select {s} {} {}", .{
+                        @tagName(air_tag),
+                        cg.typeOf(ty_op.operand).fmt(pt),
+                        ops[0].tracking(cg),
+                    }),
+                    else => |e| return e,
+                };
+                if (ops[0].index != res[0].index) try ops[0].die(cg);
+                try res[0].moveTo(inst, cg);
+            },
+
             .block => if (use_old) try cg.airBlock(inst) else {
                 const ty_pl = air_datas[@intFromEnum(inst)].ty_pl;
                 const extra = cg.air.extraData(Air.Block, ty_pl.payload);
@@ -2880,91 +3978,2760 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
             .call_never_tail => try cg.airCall(inst, .never_tail),
             .call_never_inline => try cg.airCall(inst, .never_inline),
 
-            .cmp_vector, .cmp_vector_optimized => |air_tag| if (use_old) try cg.airCmpVector(inst) else fallback: {
-                const ty_pl = air_datas[@intFromEnum(inst)].ty_pl;
-                const extra = cg.air.extraData(Air.VectorCmp, ty_pl.payload).data;
-                switch (extra.compareOperator()) {
-                    .eq, .neq => {},
-                    else => break :fallback try cg.airCmpVector(inst),
-                }
-                var ops = try cg.tempsFromOperands(inst, .{ extra.lhs, extra.rhs });
+            .clz => |air_tag| if (use_old) try cg.airClz(inst) else {
+                const ty_op = air_datas[@intFromEnum(inst)].ty_op;
+                var ops = try cg.tempsFromOperands(inst, .{ty_op.operand});
                 var res: [1]Temp = undefined;
-                switch (extra.compareOperator()) {
-                    .lt => unreachable,
-                    .lte => unreachable,
-                    .eq, .neq => |cmp_op| cg.select(&res, &.{cg.typeOfIndex(inst)}, &ops, switch (@as(Condition, switch (cmp_op) {
-                        else => unreachable,
-                        .eq => .e,
-                        .neq => .ne,
-                    })) {
-                        else => unreachable,
-                        inline .e, .ne => |cc| comptime &.{ .{
-                            .required_features = .{ .avx2, null },
-                            .src_constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
-                            .patterns = &.{
-                                .{ .src = .{ .ymm, .mem } },
-                                .{ .src = .{ .mem, .ymm }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .ymm, .ymm } },
-                            },
-                            .dst_temps = .{.{ .rc_mask = .{ .rc = .sse, .info = .{
-                                .kind = .all,
-                                .inverted = switch (cc) {
-                                    else => unreachable,
-                                    .e => false,
-                                    .ne => true,
-                                },
-                                .scalar = .byte,
-                            } } }},
-                            .each = .{ .once = &.{
-                                .{ ._, .vp_b, .cmpeq, .dst0y, .src0y, .src1y, ._ },
-                            } },
-                        }, .{
-                            .required_features = .{ .avx2, null },
-                            .src_constraints = .{ .{ .int = .word }, .{ .int = .word } },
-                            .patterns = &.{
-                                .{ .src = .{ .ymm, .mem } },
-                                .{ .src = .{ .mem, .ymm }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .ymm, .ymm } },
-                            },
-                            .dst_temps = .{.{ .rc_mask = .{ .rc = .sse, .info = .{
-                                .kind = .all,
-                                .inverted = switch (cc) {
-                                    else => unreachable,
-                                    .e => false,
-                                    .ne => true,
-                                },
-                                .scalar = .word,
-                            } } }},
-                            .each = .{ .once = &.{
-                                .{ ._, .vp_w, .cmpeq, .dst0y, .src0y, .src1y, ._ },
-                            } },
-                        }, .{
-                            .required_features = .{ .avx2, null },
-                            .src_constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
-                            .patterns = &.{
-                                .{ .src = .{ .ymm, .mem } },
-                                .{ .src = .{ .mem, .ymm }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .ymm, .ymm } },
-                            },
-                            .dst_temps = .{.{ .rc_mask = .{ .rc = .sse, .info = .{
-                                .kind = .all,
-                                .inverted = switch (cc) {
-                                    else => unreachable,
-                                    .e => false,
-                                    .ne => true,
-                                },
-                                .scalar = .dword,
-                            } } }},
-                            .each = .{ .once = &.{
-                                .{ ._, .vp_d, .cmpeq, .dst0y, .src0y, .src1y, ._ },
-                            } },
-                        }, .{
-                            .required_features = .{ .avx2, null },
-                            .src_constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
-                            .patterns = &.{
-                                .{ .src = .{ .ymm, .mem } },
-                                .{ .src = .{ .mem, .ymm }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .ymm, .ymm } },
+                cg.select(&res, &.{cg.typeOfIndex(inst)}, &ops, comptime &.{ .{
+                    .required_features = .{ .slow_incdec, null, null, null },
+                    .src_constraints = .{ .{ .exact_signed_int = 1 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .none } },
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .add, .dst0b, .si(1), ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .exact_signed_int = 1 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .none } },
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .inc, .dst0b, ._, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .exact_unsigned_int = 1 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .none } },
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .xor, .dst0b, .si(1), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .lzcnt, null, null, null },
+                    .src_constraints = .{ .{ .unsigned_or_exact_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .dst0d, .src0b, ._, ._ },
+                        .{ ._, ._, .lzcnt, .dst0d, .dst0d, ._, ._ },
+                        .{ ._, ._, .sub, .dst0b, .sia(32, .src0, .sub_bit_size), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .lzcnt, null, null, null },
+                    .src_constraints = .{ .{ .signed_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .dst0d, .src0b, ._, ._ },
+                        .{ ._, ._, .@"and", .dst0d, .sa(.src0, .add_umax), ._, ._ },
+                        .{ ._, ._, .lzcnt, .dst0d, .dst0d, ._, ._ },
+                        .{ ._, ._, .sub, .dst0b, .sia(32, .src0, .sub_bit_size), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .false_deps_lzcnt_tzcnt, .lzcnt, null, null },
+                    .src_constraints = .{ .{ .exact_int = 16 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lzcnt, .dst0w, .src0w, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .lzcnt, null, null, null },
+                    .src_constraints = .{ .{ .exact_int = 16 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lzcnt, .dst0w, .src0w, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .lzcnt, null, null, null },
+                    .src_constraints = .{ .{ .signed_int = .word }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .@"and", .src0w, .sa(.src0, .add_umax), ._, ._ },
+                        .{ ._, ._, .lzcnt, .dst0w, .src0w, ._, ._ },
+                        .{ ._, ._, .sub, .dst0b, .sia(16, .src0, .sub_bit_size), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .false_deps_lzcnt_tzcnt, .lzcnt, null, null },
+                    .src_constraints = .{ .{ .unsigned_int = .word }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lzcnt, .dst0w, .src0w, ._, ._ },
+                        .{ ._, ._, .sub, .dst0b, .sia(16, .src0, .sub_bit_size), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .lzcnt, null, null, null },
+                    .src_constraints = .{ .{ .unsigned_int = .word }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lzcnt, .dst0w, .src0w, ._, ._ },
+                        .{ ._, ._, .sub, .dst0b, .sia(16, .src0, .sub_bit_size), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .false_deps_lzcnt_tzcnt, .lzcnt, null, null },
+                    .src_constraints = .{ .{ .exact_int = 32 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lzcnt, .dst0d, .src0d, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .lzcnt, null, null, null },
+                    .src_constraints = .{ .{ .exact_int = 32 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lzcnt, .dst0d, .src0d, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .lzcnt, null, null, null },
+                    .src_constraints = .{ .{ .signed_int = .dword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .@"and", .src0d, .sa(.src0, .add_umax), ._, ._ },
+                        .{ ._, ._, .lzcnt, .dst0d, .src0d, ._, ._ },
+                        .{ ._, ._, .sub, .dst0b, .sia(32, .src0, .sub_bit_size), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .false_deps_lzcnt_tzcnt, .lzcnt, null, null },
+                    .src_constraints = .{ .{ .unsigned_int = .dword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lzcnt, .dst0d, .src0d, ._, ._ },
+                        .{ ._, ._, .sub, .dst0b, .sia(32, .src0, .sub_bit_size), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .lzcnt, null, null, null },
+                    .src_constraints = .{ .{ .unsigned_int = .dword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lzcnt, .dst0d, .src0d, ._, ._ },
+                        .{ ._, ._, .sub, .dst0b, .sia(32, .src0, .sub_bit_size), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .false_deps_lzcnt_tzcnt, .lzcnt, null },
+                    .src_constraints = .{ .{ .exact_int = 64 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lzcnt, .dst0q, .src0q, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .lzcnt, null, null },
+                    .src_constraints = .{ .{ .exact_int = 64 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lzcnt, .dst0q, .src0q, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .lzcnt, null, null },
+                    .src_constraints = .{ .{ .signed_int = .qword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .dst0q, .ua(.src0, .add_umax), ._, ._ },
+                        .{ ._, ._, .@"and", .dst0q, .src0q, ._, ._ },
+                        .{ ._, ._, .lzcnt, .dst0q, .dst0q, ._, ._ },
+                        .{ ._, ._, .sub, .dst0b, .sia(64, .src0, .sub_bit_size), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .false_deps_lzcnt_tzcnt, .lzcnt, null },
+                    .src_constraints = .{ .{ .unsigned_int = .qword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lzcnt, .dst0q, .src0q, ._, ._ },
+                        .{ ._, ._, .sub, .dst0b, .sia(64, .src0, .sub_bit_size), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .lzcnt, null, null },
+                    .src_constraints = .{ .{ .unsigned_int = .qword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lzcnt, .dst0q, .src0q, ._, ._ },
+                        .{ ._, ._, .sub, .dst0b, .sia(64, .src0, .sub_bit_size), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .cmov, .bsf_bsr_0_clobbers_result, null, null },
+                    .src_constraints = .{ .{ .unsigned_po2_or_exact_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .dst0d, .src0b, ._, ._ },
+                        .{ ._, ._r, .bs, .dst0d, .dst0d, ._, ._ },
+                        .{ ._, ._, .mov, .tmp0d, .sia(-1, .src0, .add_2_bit_size), ._, ._ },
+                        .{ ._, ._z, .cmov, .dst0d, .tmp0d, ._, ._ },
+                        .{ ._, ._, .xor, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .cmov, .bsf_bsr_0_clobbers_result, null, null },
+                    .src_constraints = .{ .{ .signed_po2_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .dst0d, .src0b, ._, ._ },
+                        .{ ._, ._, .@"and", .dst0d, .sa(.src0, .add_umax), ._, ._ },
+                        .{ ._, ._r, .bs, .dst0d, .dst0d, ._, ._ },
+                        .{ ._, ._, .mov, .tmp0d, .sia(-1, .src0, .add_2_bit_size), ._, ._ },
+                        .{ ._, ._z, .cmov, .dst0d, .tmp0d, ._, ._ },
+                        .{ ._, ._, .xor, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .cmov, .bsf_bsr_0_clobbers_result, null, null },
+                    .src_constraints = .{ .{ .signed_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .src0b, ._, ._ },
+                        .{ ._, ._, .@"and", .tmp0d, .sa(.src0, .add_umax), ._, ._ },
+                        .{ ._, ._r, .bs, .tmp0d, .tmp0d, ._, ._ },
+                        .{ ._, ._, .mov, .dst0d, .si(0xff), ._, ._ },
+                        .{ ._, ._z, .cmov, .tmp0d, .dst0d, ._, ._ },
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ },
+                        .{ ._, ._, .sub, .dst0b, .tmp0b, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .cmov, .bsf_bsr_0_clobbers_result, null, null },
+                    .src_constraints = .{ .{ .unsigned_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .src0b, ._, ._ },
+                        .{ ._, ._r, .bs, .tmp0d, .tmp0d, ._, ._ },
+                        .{ ._, ._, .mov, .dst0d, .si(0xff), ._, ._ },
+                        .{ ._, ._z, .cmov, .tmp0d, .dst0d, ._, ._ },
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ },
+                        .{ ._, ._, .sub, .dst0b, .tmp0b, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .bsf_bsr_0_clobbers_result, null, null, null },
+                    .src_constraints = .{ .{ .unsigned_po2_or_exact_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .dst0d, .src0b, ._, ._ },
+                        .{ ._, ._r, .bs, .dst0d, .dst0d, ._, ._ },
+                        .{ ._, ._nz, .j, .@"0f", ._, ._, ._ },
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_2_bit_size), ._, ._ },
+                        .{ .@"0:", ._, .xor, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .bsf_bsr_0_clobbers_result, null, null, null },
+                    .src_constraints = .{ .{ .signed_po2_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .dst0d, .src0b, ._, ._ },
+                        .{ ._, ._, .@"and", .dst0d, .sa(.src0, .add_umax), ._, ._ },
+                        .{ ._, ._r, .bs, .dst0d, .dst0d, ._, ._ },
+                        .{ ._, ._nz, .j, .@"0f", ._, ._, ._ },
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_2_bit_size), ._, ._ },
+                        .{ .@"0:", ._, .xor, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .bsf_bsr_0_clobbers_result, null, null, null },
+                    .src_constraints = .{ .{ .signed_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .src0b, ._, ._ },
+                        .{ ._, ._, .@"and", .tmp0d, .sa(.src0, .add_umax), ._, ._ },
+                        .{ ._, ._r, .bs, .tmp0d, .tmp0d, ._, ._ },
+                        .{ ._, ._, .mov, .dst0b, .sa(.src0, .add_bit_size), ._, ._ },
+                        .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
+                        .{ ._, ._c, .st, ._, ._, ._, ._ },
+                        .{ ._, ._, .sbb, .dst0b, .tmp0b, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .bsf_bsr_0_clobbers_result, null, null, null },
+                    .src_constraints = .{ .{ .unsigned_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .src0b, ._, ._ },
+                        .{ ._, ._r, .bs, .tmp0d, .tmp0d, ._, ._ },
+                        .{ ._, ._, .mov, .dst0b, .sa(.src0, .add_bit_size), ._, ._ },
+                        .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
+                        .{ ._, ._c, .st, ._, ._, ._, ._ },
+                        .{ ._, ._, .sbb, .dst0b, .tmp0b, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .unsigned_po2_or_exact_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .src0b, ._, ._ },
+                        .{ ._, ._, .mov, .dst0d, .sia(-1, .src0, .add_2_bit_size), ._, ._ },
+                        .{ ._, ._r, .bs, .dst0d, .tmp0d, ._, ._ },
+                        .{ ._, ._, .xor, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .signed_po2_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .src0b, ._, ._ },
+                        .{ ._, ._, .@"and", .tmp0d, .sa(.src0, .add_umax), ._, ._ },
+                        .{ ._, ._, .mov, .dst0d, .sia(-1, .src0, .add_2_bit_size), ._, ._ },
+                        .{ ._, ._r, .bs, .dst0d, .tmp0d, ._, ._ },
+                        .{ ._, ._, .xor, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .signed_int = .byte }, .any }, // Presumably a clz case (per the commit title) for byte-sized signed ints with no CPU feature requirements — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: receives the bsr result.
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .dst0d, .src0b, ._, ._ }, // Zero-extend the byte so a 32-bit bsr can be used.
+                        .{ ._, ._, .@"and", .dst0d, .sa(.src0, .add_umax), ._, ._ }, // Mask with what is presumably the unsigned maximum of the source type, clearing bits above its width — confirm.
+                        .{ ._, ._, .mov, .tmp0d, .si(0xff), ._, ._ }, // Preload 0xff (-1 as a byte); relies on bsr leaving tmp0 untouched for a zero input.
+                        .{ ._, ._r, .bs, .tmp0d, .dst0d, ._, ._ }, // bsr: tmp0 = index of the highest set bit of the masked value.
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // dst = presumably bit_size - 1.
+                        .{ ._, ._, .sub, .dst0b, .tmp0b, ._, ._ }, // dst = (bit_size - 1) - index; the preloaded -1 yields bit_size for a zero input.
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .unsigned_int = .byte }, .any }, // Presumably a clz case (per the commit title) for byte-sized unsigned ints with no CPU feature requirements — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: receives the bsr result.
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .dst0d, .src0b, ._, ._ }, // Zero-extend the byte so a 32-bit bsr can be used.
+                        .{ ._, ._, .mov, .tmp0d, .si(0xff), ._, ._ }, // Preload 0xff (-1 as a byte); relies on bsr leaving tmp0 untouched for a zero input.
+                        .{ ._, ._r, .bs, .tmp0d, .dst0d, ._, ._ }, // bsr: tmp0 = index of the highest set bit.
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // dst = presumably bit_size - 1.
+                        .{ ._, ._, .sub, .dst0b, .tmp0b, ._, ._ }, // dst = (bit_size - 1) - index; the preloaded -1 yields bit_size for a zero input.
+                    } },
+                }, .{
+                    .required_features = .{ .cmov, .bsf_bsr_0_clobbers_result, null, null }, // cmov variant for CPUs where bsr of zero presumably clobbers its destination (per the feature name).
+                    .src_constraints = .{ .{ .unsigned_po2_or_exact_int = .word }, .any }, // Presumably a clz case (per the commit title) for word-sized unsigned ints of power-of-two or exact width — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } }, // The sequence overwrites src0.
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .bs, .src0w, .src0w, ._, ._ }, // bsr in place: index of the highest set bit; ZF is set when the input is zero.
+                        .{ ._, ._, .mov, .dst0w, .sia(-1, .src0, .add_2_bit_size), ._, ._ }, // Zero-input value, presumably 2 * bit_size - 1; mov leaves the flags from bsr intact.
+                        .{ ._, ._nz, .cmov, .dst0w, .src0w, ._, ._ }, // Nonzero input: use the bsr index instead.
+                        .{ ._, ._, .xor, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // xor with (presumably) bit_size - 1: index -> bit_size - 1 - index, zero-input value -> bit_size; valid only for power-of-two widths.
+                    } },
+                }, .{
+                    .required_features = .{ .cmov, .bsf_bsr_0_clobbers_result, null, null }, // cmov variant for CPUs where bsr of zero presumably clobbers its destination (per the feature name).
+                    .src_constraints = .{ .{ .signed_int = .word }, .any }, // Presumably a clz case (per the commit title) for word-sized signed ints — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } }, // The sequence overwrites src0.
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .@"and", .src0w, .sa(.src0, .add_umax), ._, ._ }, // Mask with what is presumably the unsigned maximum of the source type, clearing bits above its width — confirm.
+                        .{ ._, ._r, .bs, .src0w, .src0w, ._, ._ }, // bsr in place: index of the highest set bit; ZF is set when the input is zero.
+                        .{ ._, ._, .mov, .dst0w, .si(0xff), ._, ._ }, // 0xff is -1 as a byte; mov leaves the flags from bsr intact.
+                        .{ ._, ._z, .cmov, .src0w, .dst0w, ._, ._ }, // Zero input: replace the index with -1.
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // dst = presumably bit_size - 1.
+                        .{ ._, ._, .sub, .dst0b, .src0b, ._, ._ }, // dst = (bit_size - 1) - index; an index of -1 yields bit_size.
+                    } },
+                }, .{
+                    .required_features = .{ .cmov, .bsf_bsr_0_clobbers_result, null, null }, // cmov variant for CPUs where bsr of zero presumably clobbers its destination (per the feature name).
+                    .src_constraints = .{ .{ .unsigned_int = .word }, .any }, // Presumably a clz case (per the commit title) for word-sized unsigned ints — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } }, // The sequence overwrites src0.
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .bs, .src0w, .src0w, ._, ._ }, // bsr in place: index of the highest set bit; ZF is set when the input is zero.
+                        .{ ._, ._, .mov, .dst0w, .si(0xff), ._, ._ }, // 0xff is -1 as a byte; mov leaves the flags from bsr intact.
+                        .{ ._, ._z, .cmov, .src0w, .dst0w, ._, ._ }, // Zero input: replace the index with -1.
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // dst = presumably bit_size - 1.
+                        .{ ._, ._, .sub, .dst0b, .src0b, ._, ._ }, // dst = (bit_size - 1) - index; an index of -1 yields bit_size.
+                    } },
+                }, .{
+                    .required_features = .{ .bsf_bsr_0_clobbers_result, null, null, null }, // Branching variant (no cmov required) for CPUs where bsr of zero presumably clobbers its destination (per the feature name).
+                    .src_constraints = .{ .{ .unsigned_po2_or_exact_int = .word }, .any }, // Presumably a clz case (per the commit title) for word-sized unsigned ints of power-of-two or exact width — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }}, // The result reuses the source operand.
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .bs, .dst0w, .src0w, ._, ._ }, // bsr: index of the highest set bit; ZF is set when the input is zero.
+                        .{ ._, ._nz, .j, .@"0f", ._, ._, ._ }, // Nonzero input: keep the index and skip the zero-input value.
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_2_bit_size), ._, ._ }, // Zero-input value, presumably 2 * bit_size - 1.
+                        .{ .@"0:", ._, .xor, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // xor with (presumably) bit_size - 1: index -> bit_size - 1 - index, zero-input value -> bit_size; valid only for power-of-two widths.
+                    } },
+                }, .{
+                    .required_features = .{ .bsf_bsr_0_clobbers_result, null, null, null }, // Branching variant (no cmov required) for CPUs where bsr of zero presumably clobbers its destination (per the feature name).
+                    .src_constraints = .{ .{ .signed_int = .word }, .any }, // Presumably a clz case (per the commit title) for word-sized signed ints — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } }, // The sequence overwrites src0.
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .@"and", .src0w, .sa(.src0, .add_umax), ._, ._ }, // Mask with what is presumably the unsigned maximum of the source type, clearing bits above its width — confirm.
+                        .{ ._, ._r, .bs, .src0w, .src0w, ._, ._ }, // bsr in place: index of the highest set bit; ZF is set when the input is zero.
+                        .{ ._, ._, .mov, .dst0b, .sa(.src0, .add_bit_size), ._, ._ }, // Preload the zero-input answer, presumably bit_size; mov leaves the flags from bsr intact.
+                        .{ ._, ._z, .j, .@"0f", ._, ._, ._ }, // Zero input: done. NOTE(review): no `0:` label is defined in this sequence; presumably an unresolved forward label resolves to the end of the sequence — confirm.
+                        .{ ._, ._c, .st, ._, ._, ._, ._ }, // stc: set carry so the sbb below subtracts one extra.
+                        .{ ._, ._, .sbb, .dst0b, .src0b, ._, ._ }, // dst = bit_size - index - 1.
+                    } },
+                }, .{
+                    .required_features = .{ .bsf_bsr_0_clobbers_result, null, null, null }, // Branching variant (no cmov required) for CPUs where bsr of zero presumably clobbers its destination (per the feature name).
+                    .src_constraints = .{ .{ .unsigned_int = .word }, .any }, // Presumably a clz case (per the commit title) for word-sized unsigned ints — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } }, // The sequence overwrites src0.
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .bs, .src0w, .src0w, ._, ._ }, // bsr in place: index of the highest set bit; ZF is set when the input is zero.
+                        .{ ._, ._, .mov, .dst0b, .sa(.src0, .add_bit_size), ._, ._ }, // Preload the zero-input answer, presumably bit_size; mov leaves the flags from bsr intact.
+                        .{ ._, ._z, .j, .@"0f", ._, ._, ._ }, // Zero input: done. NOTE(review): no `0:` label is defined in this sequence; presumably an unresolved forward label resolves to the end of the sequence — confirm.
+                        .{ ._, ._c, .st, ._, ._, ._, ._ }, // stc: set carry so the sbb below subtracts one extra.
+                        .{ ._, ._, .sbb, .dst0b, .src0b, ._, ._ }, // dst = bit_size - index - 1.
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .unsigned_po2_or_exact_int = .word }, .any }, // Presumably a clz case (per the commit title) for word-sized unsigned ints of power-of-two or exact width, with no CPU feature requirements — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .dst0w, .sia(-1, .src0, .add_2_bit_size), ._, ._ }, // Preload the zero-input value, presumably 2 * bit_size - 1; relies on bsr leaving dst untouched for a zero input.
+                        .{ ._, ._r, .bs, .dst0w, .src0w, ._, ._ }, // bsr: dst = index of the highest set bit when the input is nonzero.
+                        .{ ._, ._, .xor, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // xor with (presumably) bit_size - 1: index -> bit_size - 1 - index, zero-input value -> bit_size; valid only for power-of-two widths.
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .signed_int = .word }, .any }, // Presumably a clz case (per the commit title) for word-sized signed ints with no CPU feature requirements — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } }, // The sequence masks src0 in place.
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u16, .kind = .{ .rc = .general_purpose } }, // tmp0: receives the bsr result.
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .@"and", .src0w, .sa(.src0, .add_umax), ._, ._ }, // Mask with what is presumably the unsigned maximum of the source type, clearing bits above its width — confirm.
+                        .{ ._, ._, .mov, .tmp0w, .si(0xff), ._, ._ }, // Preload 0xff (-1 as a byte); relies on bsr leaving tmp0 untouched for a zero input.
+                        .{ ._, ._r, .bs, .tmp0w, .src0w, ._, ._ }, // bsr: tmp0 = index of the highest set bit.
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // dst = presumably bit_size - 1.
+                        .{ ._, ._, .sub, .dst0b, .tmp0b, ._, ._ }, // dst = (bit_size - 1) - index; the preloaded -1 yields bit_size for a zero input.
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .unsigned_int = .word }, .any }, // Presumably a clz case (per the commit title) for word-sized unsigned ints with no CPU feature requirements — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u16, .kind = .{ .rc = .general_purpose } }, // tmp0: receives the bsr result.
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0w, .si(0xff), ._, ._ }, // Preload 0xff (-1 as a byte); relies on bsr leaving tmp0 untouched for a zero input.
+                        .{ ._, ._r, .bs, .tmp0w, .src0w, ._, ._ }, // bsr: tmp0 = index of the highest set bit.
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // dst = presumably bit_size - 1.
+                        .{ ._, ._, .sub, .dst0b, .tmp0b, ._, ._ }, // dst = (bit_size - 1) - index; the preloaded -1 yields bit_size for a zero input.
+                    } },
+                }, .{
+                    .required_features = .{ .cmov, .bsf_bsr_0_clobbers_result, null, null }, // cmov variant for CPUs where bsr of zero presumably clobbers its destination (per the feature name).
+                    .src_constraints = .{ .{ .unsigned_po2_or_exact_int = .dword }, .any }, // Presumably a clz case (per the commit title) for dword-sized unsigned ints of power-of-two or exact width — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } }, // The sequence overwrites src0.
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .bs, .src0d, .src0d, ._, ._ }, // bsr in place: index of the highest set bit; ZF is set when the input is zero.
+                        .{ ._, ._, .mov, .dst0d, .sia(-1, .src0, .add_2_bit_size), ._, ._ }, // Zero-input value, presumably 2 * bit_size - 1; mov leaves the flags from bsr intact.
+                        .{ ._, ._nz, .cmov, .dst0d, .src0d, ._, ._ }, // Nonzero input: use the bsr index instead.
+                        .{ ._, ._, .xor, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // xor with (presumably) bit_size - 1: index -> bit_size - 1 - index, zero-input value -> bit_size; valid only for power-of-two widths.
+                    } },
+                }, .{
+                    .required_features = .{ .cmov, .bsf_bsr_0_clobbers_result, null, null }, // cmov variant for CPUs where bsr of zero presumably clobbers its destination (per the feature name).
+                    .src_constraints = .{ .{ .signed_int = .dword }, .any }, // Presumably a clz case (per the commit title) for dword-sized signed ints — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } }, // The sequence overwrites src0.
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .@"and", .src0d, .sa(.src0, .add_umax), ._, ._ }, // Mask with what is presumably the unsigned maximum of the source type, clearing bits above its width — confirm.
+                        .{ ._, ._r, .bs, .src0d, .src0d, ._, ._ }, // bsr in place: index of the highest set bit; ZF is set when the input is zero.
+                        .{ ._, ._, .mov, .dst0d, .si(0xff), ._, ._ }, // 0xff is -1 as a byte; mov leaves the flags from bsr intact.
+                        .{ ._, ._z, .cmov, .src0d, .dst0d, ._, ._ }, // Zero input: replace the index with -1.
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // dst = presumably bit_size - 1.
+                        .{ ._, ._, .sub, .dst0b, .src0b, ._, ._ }, // dst = (bit_size - 1) - index; an index of -1 yields bit_size.
+                    } },
+                }, .{
+                    .required_features = .{ .cmov, .bsf_bsr_0_clobbers_result, null, null }, // cmov variant for CPUs where bsr of zero presumably clobbers its destination (per the feature name).
+                    .src_constraints = .{ .{ .unsigned_int = .dword }, .any }, // Presumably a clz case (per the commit title) for dword-sized unsigned ints — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } }, // The sequence overwrites src0.
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .bs, .src0d, .src0d, ._, ._ }, // bsr in place: index of the highest set bit; ZF is set when the input is zero.
+                        .{ ._, ._, .mov, .dst0d, .si(0xff), ._, ._ }, // 0xff is -1 as a byte; mov leaves the flags from bsr intact.
+                        .{ ._, ._z, .cmov, .src0d, .dst0d, ._, ._ }, // Zero input: replace the index with -1.
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // dst = presumably bit_size - 1.
+                        .{ ._, ._, .sub, .dst0b, .src0b, ._, ._ }, // dst = (bit_size - 1) - index; an index of -1 yields bit_size.
+                    } },
+                }, .{
+                    .required_features = .{ .bsf_bsr_0_clobbers_result, null, null, null }, // Branching variant (no cmov required) for CPUs where bsr of zero presumably clobbers its destination (per the feature name).
+                    .src_constraints = .{ .{ .unsigned_po2_or_exact_int = .dword }, .any }, // Presumably a clz case (per the commit title) for dword-sized unsigned ints of power-of-two or exact width — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }}, // The result reuses the source operand.
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .bs, .dst0d, .src0d, ._, ._ }, // bsr: index of the highest set bit; ZF is set when the input is zero.
+                        .{ ._, ._nz, .j, .@"0f", ._, ._, ._ }, // Nonzero input: keep the index and skip the zero-input value.
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_2_bit_size), ._, ._ }, // Zero-input value, presumably 2 * bit_size - 1.
+                        .{ .@"0:", ._, .xor, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // xor with (presumably) bit_size - 1: index -> bit_size - 1 - index, zero-input value -> bit_size; valid only for power-of-two widths.
+                    } },
+                }, .{
+                    .required_features = .{ .bsf_bsr_0_clobbers_result, null, null, null }, // Branching variant (no cmov required) for CPUs where bsr of zero presumably clobbers its destination (per the feature name).
+                    .src_constraints = .{ .{ .signed_int = .dword }, .any }, // Presumably a clz case (per the commit title) for dword-sized signed ints — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } }, // The sequence overwrites src0.
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .@"and", .src0d, .sa(.src0, .add_umax), ._, ._ }, // Mask with what is presumably the unsigned maximum of the source type, clearing bits above its width — confirm.
+                        .{ ._, ._r, .bs, .src0d, .src0d, ._, ._ }, // bsr in place: index of the highest set bit; ZF is set when the input is zero.
+                        .{ ._, ._, .mov, .dst0b, .sa(.src0, .add_bit_size), ._, ._ }, // Preload the zero-input answer, presumably bit_size; mov leaves the flags from bsr intact.
+                        .{ ._, ._z, .j, .@"0f", ._, ._, ._ }, // Zero input: done. NOTE(review): no `0:` label is defined in this sequence; presumably an unresolved forward label resolves to the end of the sequence — confirm.
+                        .{ ._, ._c, .st, ._, ._, ._, ._ }, // stc: set carry so the sbb below subtracts one extra.
+                        .{ ._, ._, .sbb, .dst0b, .src0b, ._, ._ }, // dst = bit_size - index - 1.
+                    } },
+                }, .{
+                    .required_features = .{ .bsf_bsr_0_clobbers_result, null, null, null }, // Branching variant (no cmov required) for CPUs where bsr of zero presumably clobbers its destination (per the feature name).
+                    .src_constraints = .{ .{ .unsigned_int = .dword }, .any }, // Presumably a clz case (per the commit title) for dword-sized unsigned ints — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } }, // The sequence overwrites src0.
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .bs, .src0d, .src0d, ._, ._ }, // bsr in place: index of the highest set bit; ZF is set when the input is zero.
+                        .{ ._, ._, .mov, .dst0b, .sa(.src0, .add_bit_size), ._, ._ }, // Preload the zero-input answer, presumably bit_size; mov leaves the flags from bsr intact.
+                        .{ ._, ._z, .j, .@"0f", ._, ._, ._ }, // Zero input: done. NOTE(review): no `0:` label is defined in this sequence; presumably an unresolved forward label resolves to the end of the sequence — confirm.
+                        .{ ._, ._c, .st, ._, ._, ._, ._ }, // stc: set carry so the sbb below subtracts one extra.
+                        .{ ._, ._, .sbb, .dst0b, .src0b, ._, ._ }, // dst = bit_size - index - 1.
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .unsigned_po2_or_exact_int = .dword }, .any }, // Presumably a clz case (per the commit title) for dword-sized unsigned ints of power-of-two or exact width, with no CPU feature requirements — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .dst0d, .sia(-1, .src0, .add_2_bit_size), ._, ._ }, // Preload the zero-input value, presumably 2 * bit_size - 1; relies on bsr leaving dst untouched for a zero input.
+                        .{ ._, ._r, .bs, .dst0d, .src0d, ._, ._ }, // bsr: dst = index of the highest set bit when the input is nonzero.
+                        .{ ._, ._, .xor, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // xor with (presumably) bit_size - 1: index -> bit_size - 1 - index, zero-input value -> bit_size; valid only for power-of-two widths.
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .signed_int = .dword }, .any }, // Presumably a clz case (per the commit title) for dword-sized signed ints with no CPU feature requirements — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } }, // The sequence masks src0 in place.
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: receives the bsr result.
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .@"and", .src0d, .sa(.src0, .add_umax), ._, ._ }, // Mask with what is presumably the unsigned maximum of the source type, clearing bits above its width — confirm.
+                        .{ ._, ._, .mov, .tmp0d, .si(0xff), ._, ._ }, // Preload 0xff (-1 as a byte); relies on bsr leaving tmp0 untouched for a zero input.
+                        .{ ._, ._r, .bs, .tmp0d, .src0d, ._, ._ }, // bsr: tmp0 = index of the highest set bit.
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // dst = presumably bit_size - 1.
+                        .{ ._, ._, .sub, .dst0b, .tmp0b, ._, ._ }, // dst = (bit_size - 1) - index; the preloaded -1 yields bit_size for a zero input.
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .unsigned_int = .dword }, .any }, // Presumably a clz case (per the commit title) for dword-sized unsigned ints with no CPU feature requirements — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: receives the bsr result.
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .si(0xff), ._, ._ }, // Preload 0xff (-1 as a byte); relies on bsr leaving tmp0 untouched for a zero input.
+                        .{ ._, ._r, .bs, .tmp0d, .src0d, ._, ._ }, // bsr: tmp0 = index of the highest set bit.
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // dst = presumably bit_size - 1.
+                        .{ ._, ._, .sub, .dst0b, .tmp0b, ._, ._ }, // dst = (bit_size - 1) - index; the preloaded -1 yields bit_size for a zero input.
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .cmov, .bsf_bsr_0_clobbers_result, null }, // 64-bit cmov variant for CPUs where bsr of zero presumably clobbers its destination (per the feature name).
+                    .src_constraints = .{ .{ .unsigned_po2_or_exact_int = .qword }, .any }, // Presumably a clz case (per the commit title) for qword-sized unsigned ints of power-of-two or exact width — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } }, // The sequence overwrites src0.
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .bs, .src0q, .src0q, ._, ._ }, // 64-bit bsr in place: index of the highest set bit; ZF is set when the input is zero.
+                        .{ ._, ._, .mov, .dst0d, .sia(-1, .src0, .add_2_bit_size), ._, ._ }, // Zero-input value, presumably 2 * bit_size - 1; mov leaves the flags from bsr intact.
+                        .{ ._, ._nz, .cmov, .dst0d, .src0d, ._, ._ }, // Nonzero input: use the bsr index (it fits in the low dword).
+                        .{ ._, ._, .xor, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // xor with (presumably) bit_size - 1: index -> bit_size - 1 - index, zero-input value -> bit_size; valid only for power-of-two widths.
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .cmov, .bsf_bsr_0_clobbers_result, null }, // 64-bit cmov variant for CPUs where bsr of zero presumably clobbers its destination (per the feature name).
+                    .src_constraints = .{ .{ .signed_int = .qword }, .any }, // Presumably a clz case (per the commit title) for qword-sized signed ints — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp0: holds the masked source, then the bsr index.
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .ua(.src0, .add_umax), ._, ._ }, // Load what is presumably the unsigned maximum of the source type into a register; presumably needed because a 64-bit mask cannot be an `and` immediate — confirm.
+                        .{ ._, ._, .@"and", .tmp0q, .src0q, ._, ._ }, // tmp0 = source with bits above its width cleared; src0 itself is left unmodified.
+                        .{ ._, ._r, .bs, .tmp0q, .tmp0q, ._, ._ }, // 64-bit bsr in place: index of the highest set bit; ZF is set when the masked value is zero.
+                        .{ ._, ._, .mov, .dst0d, .si(0xff), ._, ._ }, // 0xff is -1 as a byte; mov leaves the flags from bsr intact.
+                        .{ ._, ._z, .cmov, .tmp0d, .dst0d, ._, ._ }, // Zero input: replace the index with -1.
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // dst = presumably bit_size - 1.
+                        .{ ._, ._, .sub, .dst0b, .tmp0b, ._, ._ }, // dst = (bit_size - 1) - index; an index of -1 yields bit_size.
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .cmov, .bsf_bsr_0_clobbers_result, null }, // 64-bit cmov variant for CPUs where bsr of zero presumably clobbers its destination (per the feature name).
+                    .src_constraints = .{ .{ .unsigned_int = .qword }, .any }, // Presumably a clz case (per the commit title) for qword-sized unsigned ints — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } }, // The sequence overwrites src0.
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .bs, .src0q, .src0q, ._, ._ }, // 64-bit bsr in place: index of the highest set bit; ZF is set when the input is zero.
+                        .{ ._, ._, .mov, .dst0d, .si(0xff), ._, ._ }, // 0xff is -1 as a byte; mov leaves the flags from bsr intact.
+                        .{ ._, ._z, .cmov, .src0d, .dst0d, ._, ._ }, // Zero input: replace the index with -1.
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // dst = presumably bit_size - 1.
+                        .{ ._, ._, .sub, .dst0b, .src0b, ._, ._ }, // dst = (bit_size - 1) - index; an index of -1 yields bit_size.
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .bsf_bsr_0_clobbers_result, null, null }, // 64-bit branching variant (no cmov required) for CPUs where bsr of zero presumably clobbers its destination (per the feature name).
+                    .src_constraints = .{ .{ .unsigned_po2_or_exact_int = .qword }, .any }, // Presumably a clz case (per the commit title) for qword-sized unsigned ints of power-of-two or exact width — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }}, // The result reuses the source operand.
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .bs, .dst0q, .src0q, ._, ._ }, // 64-bit bsr: index of the highest set bit; ZF is set when the input is zero.
+                        .{ ._, ._nz, .j, .@"0f", ._, ._, ._ }, // Nonzero input: keep the index and skip the zero-input value.
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_2_bit_size), ._, ._ }, // Zero-input value, presumably 2 * bit_size - 1.
+                        .{ .@"0:", ._, .xor, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // xor with (presumably) bit_size - 1: index -> bit_size - 1 - index, zero-input value -> bit_size; valid only for power-of-two widths.
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .bsf_bsr_0_clobbers_result, null, null }, // 64-bit branching variant (no cmov required) for CPUs where bsr of zero presumably clobbers its destination (per the feature name).
+                    .src_constraints = .{ .{ .signed_int = .qword }, .any }, // Presumably a clz case (per the commit title) for qword-sized signed ints — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp0: holds the masked source, then the bsr index.
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .ua(.src0, .add_umax), ._, ._ }, // Load what is presumably the unsigned maximum of the source type into a register; presumably needed because a 64-bit mask cannot be an `and` immediate — confirm.
+                        .{ ._, ._, .@"and", .tmp0q, .src0q, ._, ._ }, // tmp0 = source with bits above its width cleared; src0 itself is left unmodified.
+                        .{ ._, ._r, .bs, .tmp0q, .tmp0q, ._, ._ }, // 64-bit bsr in place: index of the highest set bit; ZF is set when the masked value is zero.
+                        .{ ._, ._, .mov, .dst0b, .sa(.src0, .add_bit_size), ._, ._ }, // Preload the zero-input answer, presumably bit_size; mov leaves the flags from bsr intact.
+                        .{ ._, ._z, .j, .@"0f", ._, ._, ._ }, // Zero input: done. NOTE(review): no `0:` label is defined in this sequence; presumably an unresolved forward label resolves to the end of the sequence — confirm.
+                        .{ ._, ._c, .st, ._, ._, ._, ._ }, // stc: set carry so the sbb below subtracts one extra.
+                        .{ ._, ._, .sbb, .dst0b, .tmp0b, ._, ._ }, // dst = bit_size - index - 1.
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .bsf_bsr_0_clobbers_result, null, null }, // 64-bit branching variant (no cmov required) for CPUs where bsr of zero presumably clobbers its destination (per the feature name).
+                    .src_constraints = .{ .{ .unsigned_int = .qword }, .any }, // Presumably a clz case (per the commit title) for qword-sized unsigned ints — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .none } }, // The sequence overwrites src0.
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .bs, .src0q, .src0q, ._, ._ }, // 64-bit bsr in place: index of the highest set bit; ZF is set when the input is zero.
+                        .{ ._, ._, .mov, .dst0b, .sa(.src0, .add_bit_size), ._, ._ }, // Preload the zero-input answer, presumably bit_size; mov leaves the flags from bsr intact.
+                        .{ ._, ._z, .j, .@"0f", ._, ._, ._ }, // Zero input: done. NOTE(review): no `0:` label is defined in this sequence; presumably an unresolved forward label resolves to the end of the sequence — confirm.
+                        .{ ._, ._c, .st, ._, ._, ._, ._ }, // stc: set carry so the sbb below subtracts one extra.
+                        .{ ._, ._, .sbb, .dst0b, .src0b, ._, ._ }, // dst = bit_size - index - 1.
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null }, // Only 64-bit mode is required; the sequence relies on bsr leaving its destination untouched for a zero input.
+                    .src_constraints = .{ .{ .unsigned_po2_or_exact_int = .qword }, .any }, // Presumably a clz case (per the commit title) for qword-sized unsigned ints of power-of-two or exact width — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .dst0d, .sia(-1, .src0, .add_2_bit_size), ._, ._ }, // Preload the zero-input value, presumably 2 * bit_size - 1.
+                        .{ ._, ._r, .bs, .dst0q, .src0q, ._, ._ }, // 64-bit bsr: dst = index of the highest set bit when the input is nonzero.
+                        .{ ._, ._, .xor, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // xor with (presumably) bit_size - 1: index -> bit_size - 1 - index, zero-input value -> bit_size; valid only for power-of-two widths.
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null }, // Only 64-bit mode is required; the sequence relies on bsr leaving its destination untouched for a zero input.
+                    .src_constraints = .{ .{ .signed_int = .qword }, .any }, // Presumably a clz case (per the commit title) for qword-sized signed ints — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp0: receives the bsr result.
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .dst0q, .ua(.src0, .add_umax), ._, ._ }, // Load what is presumably the unsigned maximum of the source type into a register; presumably needed because a 64-bit mask cannot be an `and` immediate — confirm.
+                        .{ ._, ._, .@"and", .dst0q, .src0q, ._, ._ }, // dst = source with bits above its width cleared; src0 itself is left unmodified.
+                        .{ ._, ._, .mov, .tmp0d, .si(0xff), ._, ._ }, // Preload 0xff (-1 as a byte) for the zero-input case.
+                        .{ ._, ._r, .bs, .tmp0q, .dst0q, ._, ._ }, // 64-bit bsr: tmp0 = index of the highest set bit of the masked value.
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // dst = presumably bit_size - 1 (the masked value is no longer needed).
+                        .{ ._, ._, .sub, .dst0b, .tmp0b, ._, ._ }, // dst = (bit_size - 1) - index; the preloaded -1 yields bit_size for a zero input.
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null }, // Only 64-bit mode is required; the sequence relies on bsr leaving its destination untouched for a zero input.
+                    .src_constraints = .{ .{ .unsigned_int = .qword }, .any }, // Presumably a clz case (per the commit title) for qword-sized unsigned ints — confirm.
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .none } },
+                        .{ .src = .{ .to_gpr, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp0: receives the bsr result.
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .si(0xff), ._, ._ }, // Preload 0xff (-1 as a byte) for the zero-input case.
+                        .{ ._, ._r, .bs, .tmp0q, .src0q, ._, ._ }, // 64-bit bsr: tmp0 = index of the highest set bit.
+                        .{ ._, ._, .mov, .dst0b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // dst = presumably bit_size - 1.
+                        .{ ._, ._, .sub, .dst0b, .tmp0b, ._, ._ }, // dst = (bit_size - 1) - index; the preloaded -1 yields bit_size for a zero input.
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .false_deps_lzcnt_tzcnt, .lzcnt, null }, // lzcnt variant for CPUs where lzcnt presumably has a false dependency on its destination (per the feature name).
+                    .src_constraints = .{ .{ .unsigned_or_exact_remainder_int = .{ .of = .xword, .is = .qword } }, .any }, // Presumably a clz case (per the commit title) for in-memory multi-limb ints — confirm the exact meaning of the remainder constraint.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: byte offset of the qword limb being examined.
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true }, // xor, lzcnt, sub and neg below all write EFLAGS; declared like every scalar case in this table.
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .sia(-16, .src0, .add_size), ._, ._ }, // Start offset, presumably size - 16, i.e. the most significant qword limb — confirm.
+                        .{ .@"0:", ._, .xor, .dst0d, .dst0d, ._, ._ }, // Loop head: zero dst first, presumably to break the false output dependency of lzcnt.
+                        .{ ._, ._, .lzcnt, .dst0q, .memi(.src0q, .tmp0), ._, ._ }, // Count leading zeros of the current limb; CF is set when the limb is zero.
+                        .{ ._, ._nc, .j, .@"0f", ._, ._, ._ }, // Nonzero limb found: leave the loop.
+                        .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ }, // Step to the next lower limb; borrows (CF set) once offset 0 has been examined.
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // No borrow: examine the next limb.
+                        .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ }, // All limbs were zero: use offset 0; dst still holds 64 from the last lzcnt.
+                        .{ .@"0:", ._, .neg, .tmp0d, ._, ._, ._ }, // Negate the offset so that the lea below subtracts it.
+                        .{ ._, ._, .lea, .dst0d, .leasiad(.none, .dst0, .@"8", .tmp0, .add_src0_bit_size, -64), ._, ._ }, // dst = lzcnt - 8 * offset + (presumably bit_size) - 64.
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .lzcnt, null, null }, // lzcnt variant without the false-dependency workaround.
+                    .src_constraints = .{ .{ .unsigned_or_exact_remainder_int = .{ .of = .xword, .is = .qword } }, .any }, // Presumably a clz case (per the commit title) for in-memory multi-limb ints — confirm the exact meaning of the remainder constraint.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: byte offset of the qword limb being examined.
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true }, // lzcnt, sub, xor and neg below all write EFLAGS; declared like every scalar case in this table.
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .sia(-16, .src0, .add_size), ._, ._ }, // Start offset, presumably size - 16, i.e. the most significant qword limb — confirm.
+                        .{ .@"0:", ._, .lzcnt, .dst0q, .memi(.src0q, .tmp0), ._, ._ }, // Loop head: count leading zeros of the current limb; CF is set when the limb is zero.
+                        .{ ._, ._nc, .j, .@"0f", ._, ._, ._ }, // Nonzero limb found: leave the loop.
+                        .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ }, // Step to the next lower limb; borrows (CF set) once offset 0 has been examined.
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // No borrow: examine the next limb.
+                        .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ }, // All limbs were zero: use offset 0; dst still holds 64 from the last lzcnt.
+                        .{ .@"0:", ._, .neg, .tmp0d, ._, ._, ._ }, // Negate the offset so that the lea below subtracts it.
+                        .{ ._, ._, .lea, .dst0d, .leasiad(.none, .dst0, .@"8", .tmp0, .add_src0_bit_size, -64), ._, ._ }, // dst = lzcnt - 8 * offset + (presumably bit_size) - 64.
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .bsf_bsr_0_clobbers_result, null, null }, // bsr variant for CPUs where bsr of zero presumably clobbers its destination (per the feature name).
+                    .src_constraints = .{ .{ .unsigned_or_exact_remainder_int = .{ .of = .xword, .is = .qword } }, .any }, // Presumably a clz case (per the commit title) for in-memory multi-limb ints — confirm the exact meaning of the remainder constraint.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: byte offset of the qword limb being examined.
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true }, // xor, bsr, sub and neg below all write EFLAGS; declared like every scalar case in this table.
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .sia(-16, .src0, .add_size), ._, ._ }, // Start offset, presumably size - 16, i.e. the most significant qword limb — confirm.
+                        .{ .@"0:", ._, .xor, .dst0d, .dst0d, ._, ._ }, // Loop head: zero dst before each bsr (purpose not evident from this sequence — confirm).
+                        .{ ._, ._r, .bs, .dst0q, .memi(.src0q, .tmp0), ._, ._ }, // bsr on the current limb: index of its highest set bit; ZF is set when the limb is zero.
+                        .{ ._, ._nz, .j, .@"0f", ._, ._, ._ }, // Nonzero limb found: leave the loop.
+                        .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ }, // Step to the next lower limb; borrows (CF set) once offset 0 has been examined.
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // No borrow: examine the next limb.
+                        .{ ._, ._, .mov, .dst0d, .si(-1), ._, ._ }, // All limbs were zero: use index -1 ...
+                        .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ }, // ... at offset 0.
+                        .{ .@"0:", ._, .lea, .dst0d, .leasiad(.none, .dst0, .@"8", .tmp0, .sub_src0_bit_size, 1), ._, ._ }, // dst = index + 8 * offset - (presumably bit_size) + 1.
+                        .{ ._, ._, .neg, .dst0d, ._, ._, ._ }, // Negate: dst = bit_size - 1 - (index + 8 * offset).
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null }, // Only 64-bit mode is required; the sequence relies on bsr leaving its destination untouched for a zero input.
+                    .src_constraints = .{ .{ .unsigned_or_exact_remainder_int = .{ .of = .xword, .is = .qword } }, .any }, // Presumably a clz case (per the commit title) for in-memory multi-limb ints — confirm the exact meaning of the remainder constraint.
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: byte offset of the qword limb being examined.
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .clobbers = .{ .eflags = true }, // bsr, sub, xor and neg below all write EFLAGS; declared like every scalar case in this table.
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .sia(-16, .src0, .add_size), ._, ._ }, // Start offset, presumably size - 16, i.e. the most significant qword limb — confirm.
+                        .{ .@"0:", ._, .mov, .dst0d, .si(-1), ._, ._ }, // Loop head: preload index -1, which survives the bsr below when the limb is zero.
+                        .{ ._, ._r, .bs, .dst0q, .memi(.src0q, .tmp0), ._, ._ }, // bsr on the current limb: index of its highest set bit; ZF is set when the limb is zero.
+                        .{ ._, ._nz, .j, .@"0f", ._, ._, ._ }, // Nonzero limb found: leave the loop.
+                        .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ }, // Step to the next lower limb; borrows (CF set) once offset 0 has been examined.
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // No borrow: examine the next limb.
+                        .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ }, // All limbs were zero: use offset 0 with the preloaded index -1.
+                        .{ .@"0:", ._, .lea, .dst0d, .leasiad(.none, .dst0, .@"8", .tmp0, .sub_src0_bit_size, 1), ._, ._ }, // dst = index + 8 * offset - (presumably bit_size) + 1.
+                        .{ ._, ._, .neg, .dst0d, ._, ._, ._ }, // Negate: dst = bit_size - 1 - (index + 8 * offset).
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .false_deps_lzcnt_tzcnt, .lzcnt, null }, // lzcnt limb scan; this variant zeroes dst0 before each lzcnt
+                    .src_constraints = .{ .{ .unsigned_or_exact_remainder_int = .{ .of = .xword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: byte offset of the qword limb being scanned
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .sia(-8, .src0, .add_size), ._, ._ }, // start offset; presumably size of src0 minus 8 (confirm .sia/.add_size)
+                        .{ .@"0:", ._, .xor, .dst0d, .dst0d, ._, ._ }, // loop head: zero dst0; presumably breaks lzcnt's dependency on its old destination (per the feature name)
+                        .{ ._, ._, .lzcnt, .dst0q, .memi(.src0q, .tmp0), ._, ._ }, // leading zeros of the qword of src0 indexed by tmp0
+                        .{ ._, ._nc, .j, .@"0f", ._, ._, ._ }, // carry clear: the limb was nonzero, stop scanning
+                        .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ }, // step down to the next lower qword
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the offset borrows below zero
+                        .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ }, // every scanned limb was zero: use offset 0; dst0 keeps the last lzcnt result
+                        .{ .@"0:", ._, .neg, .tmp0d, ._, ._, ._ }, // negate the offset so the lea below effectively subtracts tmp0 * 8
+                        .{ ._, ._, .lea, .dst0d, .leasiad(.none, .dst0, .@"8", .tmp0, .add_src0_bit_size, -64), ._, ._ }, // presumably dst0 + tmp0 * 8 + bit size of src0 - 64 (confirm .leasiad)
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .lzcnt, null, null }, // lzcnt limb scan without the extra dst0 zeroing
+                    .src_constraints = .{ .{ .unsigned_or_exact_remainder_int = .{ .of = .xword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: byte offset of the qword limb being scanned
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .sia(-8, .src0, .add_size), ._, ._ }, // start offset; presumably size of src0 minus 8 (confirm .sia/.add_size)
+                        .{ .@"0:", ._, .lzcnt, .dst0q, .memi(.src0q, .tmp0), ._, ._ }, // loop head: leading zeros of the qword of src0 indexed by tmp0
+                        .{ ._, ._nc, .j, .@"0f", ._, ._, ._ }, // carry clear: the limb was nonzero, stop scanning
+                        .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ }, // step down to the next lower qword
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the offset borrows below zero
+                        .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ }, // every scanned limb was zero: use offset 0; dst0 keeps the last lzcnt result
+                        .{ .@"0:", ._, .neg, .tmp0d, ._, ._, ._ }, // negate the offset so the lea below effectively subtracts tmp0 * 8
+                        .{ ._, ._, .lea, .dst0d, .leasiad(.none, .dst0, .@"8", .tmp0, .add_src0_bit_size, -64), ._, ._ }, // presumably dst0 + tmp0 * 8 + bit size of src0 - 64 (confirm .leasiad)
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .bsf_bsr_0_clobbers_result, null, null }, // bsr limb scan that reloads -1 itself instead of trusting bsr to keep dst0 for a zero limb
+                    .src_constraints = .{ .{ .unsigned_or_exact_remainder_int = .{ .of = .xword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: byte offset of the qword limb being scanned
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .sia(-8, .src0, .add_size), ._, ._ }, // start offset; presumably size of src0 minus 8 (confirm .sia/.add_size)
+                        .{ .@"0:", ._, .xor, .dst0d, .dst0d, ._, ._ }, // loop head: clear dst0 before the scan
+                        .{ ._, ._r, .bs, .dst0q, .memi(.src0q, .tmp0), ._, ._ }, // bsr on the qword of src0 indexed by tmp0
+                        .{ ._, ._nz, .j, .@"0f", ._, ._, ._ }, // nonzero limb: leave the loop with its highest set bit index in dst0
+                        .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ }, // step down to the next lower qword
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the offset borrows below zero
+                        .{ ._, ._, .mov, .dst0d, .si(-1), ._, ._ }, // every scanned limb was zero: use -1 as the bit index
+                        .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ }, // ... together with offset 0
+                        .{ .@"0:", ._, .lea, .dst0d, .leasiad(.none, .dst0, .@"8", .tmp0, .sub_src0_bit_size, 1), ._, ._ }, // presumably dst0 + tmp0 * 8 + 1 - bit size of src0 (confirm .leasiad)
+                        .{ ._, ._, .neg, .dst0d, ._, ._, ._ }, // negate to turn the highest-set-bit position into a leading-zero count
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null }, // bsr limb scan that preloads dst0 with -1 before every bsr
+                    .src_constraints = .{ .{ .unsigned_or_exact_remainder_int = .{ .of = .xword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: byte offset of the qword limb being scanned
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .sia(-8, .src0, .add_size), ._, ._ }, // start offset; presumably size of src0 minus 8 (confirm .sia/.add_size)
+                        .{ .@"0:", ._, .mov, .dst0d, .si(-1), ._, ._ }, // loop head: preload -1; the all-zero exit below reads dst0 without rewriting it, so it relies on bsr keeping dst0 for a zero limb
+                        .{ ._, ._r, .bs, .dst0q, .memi(.src0q, .tmp0), ._, ._ }, // bsr on the qword of src0 indexed by tmp0
+                        .{ ._, ._nz, .j, .@"0f", ._, ._, ._ }, // nonzero limb: leave the loop with its highest set bit index in dst0
+                        .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ }, // step down to the next lower qword
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the offset borrows below zero
+                        .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ }, // every scanned limb was zero: use offset 0
+                        .{ .@"0:", ._, .lea, .dst0d, .leasiad(.none, .dst0, .@"8", .tmp0, .sub_src0_bit_size, 1), ._, ._ }, // presumably dst0 + tmp0 * 8 + 1 - bit size of src0 (confirm .leasiad)
+                        .{ ._, ._, .neg, .dst0d, ._, ._, ._ }, // negate to turn the highest-set-bit position into a leading-zero count
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .false_deps_lzcnt_tzcnt, .lzcnt, null }, // masked lzcnt limb scan; this variant zeroes dst0 before each lzcnt
+                    .src_constraints = .{ .{ .remainder_int = .{ .of = .xword, .is = .qword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: byte offset of the qword limb being scanned
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp1: mask, then the masked limb value
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .sia(-16, .src0, .add_size), ._, ._ }, // start offset; presumably size of src0 minus 16 (confirm .sia/.add_size)
+                        .{ ._, ._, .mov, .tmp1q, .ua(.src0, .add_umax), ._, ._ }, // mask for the first limb scanned; presumably its value bits only (confirm .ua/.add_umax)
+                        .{ .@"0:", ._, .xor, .dst0d, .dst0d, ._, ._ }, // loop head: zero dst0; presumably breaks lzcnt's dependency on its old destination (per the feature name)
+                        .{ ._, ._, .@"and", .tmp1q, .memi(.src0q, .tmp0), ._, ._ }, // tmp1 = mask & qword of src0 indexed by tmp0
+                        .{ ._, ._, .lzcnt, .dst0q, .tmp1q, ._, ._ }, // leading zeros of the masked limb
+                        .{ ._, ._nc, .j, .@"0f", ._, ._, ._ }, // carry clear: the masked limb was nonzero, stop scanning
+                        .{ ._, ._, .mov, .tmp1q, .si(-1), ._, ._ }, // lower limbs are scanned unmasked (all-ones mask)
+                        .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ }, // step down to the next lower qword
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the offset borrows below zero
+                        .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ }, // every scanned limb was zero: use offset 0; dst0 keeps the last lzcnt result
+                        .{ .@"0:", ._, .neg, .tmp0d, ._, ._, ._ }, // negate the offset so the lea below effectively subtracts tmp0 * 8
+                        .{ ._, ._, .lea, .dst0d, .leasiad(.none, .dst0, .@"8", .tmp0, .add_src0_bit_size, -64), ._, ._ }, // presumably dst0 + tmp0 * 8 + bit size of src0 - 64 (confirm .leasiad)
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .lzcnt, null, null }, // masked lzcnt limb scan without the extra dst0 zeroing
+                    .src_constraints = .{ .{ .remainder_int = .{ .of = .xword, .is = .qword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: byte offset of the qword limb being scanned
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp1: mask, then the masked limb value
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .sia(-16, .src0, .add_size), ._, ._ }, // start offset; presumably size of src0 minus 16 (confirm .sia/.add_size)
+                        .{ ._, ._, .mov, .tmp1q, .ua(.src0, .add_umax), ._, ._ }, // mask for the first limb scanned; presumably its value bits only (confirm .ua/.add_umax)
+                        .{ .@"0:", ._, .@"and", .tmp1q, .memi(.src0q, .tmp0), ._, ._ }, // loop head: tmp1 = mask & qword of src0 indexed by tmp0
+                        .{ ._, ._, .lzcnt, .dst0q, .tmp1q, ._, ._ }, // leading zeros of the masked limb
+                        .{ ._, ._nc, .j, .@"0f", ._, ._, ._ }, // carry clear: the masked limb was nonzero, stop scanning
+                        .{ ._, ._, .mov, .tmp1q, .si(-1), ._, ._ }, // lower limbs are scanned unmasked (all-ones mask)
+                        .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ }, // step down to the next lower qword
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the offset borrows below zero
+                        .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ }, // every scanned limb was zero: use offset 0; dst0 keeps the last lzcnt result
+                        .{ .@"0:", ._, .neg, .tmp0d, ._, ._, ._ }, // negate the offset so the lea below effectively subtracts tmp0 * 8
+                        .{ ._, ._, .lea, .dst0d, .leasiad(.none, .dst0, .@"8", .tmp0, .add_src0_bit_size, -64), ._, ._ }, // presumably dst0 + tmp0 * 8 + bit size of src0 - 64 (confirm .leasiad)
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null }, // masked bsr limb scan; dst0 doubles as the mask register
+                    .src_constraints = .{ .{ .remainder_int = .{ .of = .xword, .is = .qword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: byte offset of the qword limb being scanned
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .sia(-16, .src0, .add_size), ._, ._ }, // start offset; presumably size of src0 minus 16 (confirm .sia/.add_size)
+                        .{ ._, ._, .mov, .dst0q, .ua(.src0, .add_umax), ._, ._ }, // mask for the first limb scanned; presumably its value bits only (confirm .ua/.add_umax)
+                        .{ .@"0:", ._, .@"and", .dst0q, .memi(.src0q, .tmp0), ._, ._ }, // loop head: dst0 = mask & qword of src0 indexed by tmp0
+                        .{ ._, ._r, .bs, .dst0q, .dst0q, ._, ._ }, // index of the highest set bit, if the masked limb is nonzero
+                        .{ ._, ._nz, .j, .@"0f", ._, ._, ._ }, // nonzero limb: leave the loop with the bit index in dst0
+                        .{ ._, ._, .mov, .dst0q, .si(-1), ._, ._ }, // zero limb: all-ones is both the mask for the next limb and the -1 index on the all-zero exit
+                        .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ }, // step down to the next lower qword
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the offset borrows below zero
+                        .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ }, // every scanned limb was zero: use offset 0
+                        .{ .@"0:", ._, .lea, .dst0d, .leasiad(.none, .dst0, .@"8", .tmp0, .sub_src0_bit_size, 1), ._, ._ }, // presumably dst0 + tmp0 * 8 + 1 - bit size of src0 (confirm .leasiad)
+                        .{ ._, ._, .neg, .dst0d, ._, ._, ._ }, // negate to turn the highest-set-bit position into a leading-zero count
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .false_deps_lzcnt_tzcnt, .lzcnt, null }, // masked lzcnt limb scan; this variant zeroes dst0 before each lzcnt
+                    .src_constraints = .{ .{ .remainder_int = .{ .of = .xword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: byte offset of the qword limb being scanned
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp1: mask, then the masked limb value
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .sia(-8, .src0, .add_size), ._, ._ }, // start offset; presumably size of src0 minus 8 (confirm .sia/.add_size)
+                        .{ ._, ._, .mov, .tmp1q, .ua(.src0, .add_umax), ._, ._ }, // mask for the first limb scanned; presumably its value bits only (confirm .ua/.add_umax)
+                        .{ .@"0:", ._, .xor, .dst0d, .dst0d, ._, ._ }, // loop head: zero dst0; presumably breaks lzcnt's dependency on its old destination (per the feature name)
+                        .{ ._, ._, .@"and", .tmp1q, .memi(.src0q, .tmp0), ._, ._ }, // tmp1 = mask & qword of src0 indexed by tmp0
+                        .{ ._, ._, .lzcnt, .dst0q, .tmp1q, ._, ._ }, // leading zeros of the masked limb
+                        .{ ._, ._nc, .j, .@"0f", ._, ._, ._ }, // carry clear: the masked limb was nonzero, stop scanning
+                        .{ ._, ._, .mov, .tmp1q, .si(-1), ._, ._ }, // lower limbs are scanned unmasked (all-ones mask)
+                        .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ }, // step down to the next lower qword
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the offset borrows below zero
+                        .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ }, // every scanned limb was zero: use offset 0; dst0 keeps the last lzcnt result
+                        .{ .@"0:", ._, .neg, .tmp0d, ._, ._, ._ }, // negate the offset so the lea below effectively subtracts tmp0 * 8
+                        .{ ._, ._, .lea, .dst0d, .leasiad(.none, .dst0, .@"8", .tmp0, .add_src0_bit_size, -64), ._, ._ }, // presumably dst0 + tmp0 * 8 + bit size of src0 - 64 (confirm .leasiad)
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .lzcnt, null, null }, // masked lzcnt limb scan without the extra dst0 zeroing
+                    .src_constraints = .{ .{ .remainder_int = .{ .of = .xword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: byte offset of the qword limb being scanned
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp1: mask, then the masked limb value
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .sia(-8, .src0, .add_size), ._, ._ }, // start offset; presumably size of src0 minus 8 (confirm .sia/.add_size)
+                        .{ ._, ._, .mov, .tmp1q, .ua(.src0, .add_umax), ._, ._ }, // mask for the first limb scanned; presumably its value bits only (confirm .ua/.add_umax)
+                        .{ .@"0:", ._, .@"and", .tmp1q, .memi(.src0q, .tmp0), ._, ._ }, // loop head: tmp1 = mask & qword of src0 indexed by tmp0
+                        .{ ._, ._, .lzcnt, .dst0q, .tmp1q, ._, ._ }, // leading zeros of the masked limb
+                        .{ ._, ._nc, .j, .@"0f", ._, ._, ._ }, // carry clear: the masked limb was nonzero, stop scanning
+                        .{ ._, ._, .mov, .tmp1q, .si(-1), ._, ._ }, // lower limbs are scanned unmasked (all-ones mask)
+                        .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ }, // step down to the next lower qword
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the offset borrows below zero
+                        .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ }, // every scanned limb was zero: use offset 0; dst0 keeps the last lzcnt result
+                        .{ .@"0:", ._, .neg, .tmp0d, ._, ._, ._ }, // negate the offset so the lea below effectively subtracts tmp0 * 8
+                        .{ ._, ._, .lea, .dst0d, .leasiad(.none, .dst0, .@"8", .tmp0, .add_src0_bit_size, -64), ._, ._ }, // presumably dst0 + tmp0 * 8 + bit size of src0 - 64 (confirm .leasiad)
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null }, // masked bsr limb scan; dst0 doubles as the mask register
+                    .src_constraints = .{ .{ .remainder_int = .{ .of = .xword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: byte offset of the qword limb being scanned
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .rc = .general_purpose }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .sia(-8, .src0, .add_size), ._, ._ }, // start offset; presumably size of src0 minus 8 (confirm .sia/.add_size)
+                        .{ ._, ._, .mov, .dst0q, .ua(.src0, .add_umax), ._, ._ }, // mask for the first limb scanned; presumably its value bits only (confirm .ua/.add_umax)
+                        .{ .@"0:", ._, .@"and", .dst0q, .memi(.src0q, .tmp0), ._, ._ }, // loop head: dst0 = mask & qword of src0 indexed by tmp0
+                        .{ ._, ._r, .bs, .dst0q, .dst0q, ._, ._ }, // index of the highest set bit, if the masked limb is nonzero
+                        .{ ._, ._nz, .j, .@"0f", ._, ._, ._ }, // nonzero limb: leave the loop with the bit index in dst0
+                        .{ ._, ._, .mov, .dst0q, .si(-1), ._, ._ }, // zero limb: all-ones is both the mask for the next limb and the -1 index on the all-zero exit
+                        .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ }, // step down to the next lower qword
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the offset borrows below zero
+                        .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ }, // every scanned limb was zero: use offset 0
+                        .{ .@"0:", ._, .lea, .dst0d, .leasiad(.none, .dst0, .@"8", .tmp0, .sub_src0_bit_size, 1), ._, ._ }, // presumably dst0 + tmp0 * 8 + 1 - bit size of src0 (confirm .leasiad)
+                        .{ ._, ._, .neg, .dst0d, ._, ._, ._ }, // negate to turn the highest-set-bit position into a leading-zero count
+                    } },
+                }, .{
+                    .required_features = .{ .lzcnt, .slow_incdec, null, null }, // per-element lzcnt loop over byte elements; steps the index with add rather than inc
+                    .src_constraints = .{ .{ .scalar_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: loop index, stepped by 1 until the add carries out
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: current element, then its result
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -(length of src0) (confirm .sa/.sub_len)
+                        .{ .@"0:", ._, .movzx, .tmp1d, .memia(.src0b, .tmp0, .add_len), ._, ._ }, // loop head: zero-extend the current source byte
+                        .{ ._, ._, .@"and", .tmp1d, .sa(.src0, .add_umax), ._, ._ }, // mask the element; presumably keeps only its value bits (confirm .add_umax)
+                        .{ ._, ._, .lzcnt, .tmp1d, .tmp1d, ._, ._ }, // 32-bit leading-zero count
+                        .{ ._, ._, .sub, .tmp1b, .sia(32, .src0, .sub_bit_size), ._, ._ }, // presumably subtracts 32 - bit size to rebase the count to the element width
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp1b, ._, ._ }, // store one result byte per element
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets carry when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // loop while there was no carry
+                    } },
+                }, .{
+                    .required_features = .{ .lzcnt, null, null, null }, // per-element lzcnt loop over byte elements; steps the index with inc
+                    .src_constraints = .{ .{ .scalar_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: loop index, stepped by 1 until it reaches zero
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: current element, then its result
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -(length of src0) (confirm .sa/.sub_len)
+                        .{ .@"0:", ._, .movzx, .tmp1d, .memia(.src0b, .tmp0, .add_len), ._, ._ }, // loop head: zero-extend the current source byte
+                        .{ ._, ._, .@"and", .tmp1d, .sa(.src0, .add_umax), ._, ._ }, // mask the element; presumably keeps only its value bits (confirm .add_umax)
+                        .{ ._, ._, .lzcnt, .tmp1d, .tmp1d, ._, ._ }, // 32-bit leading-zero count
+                        .{ ._, ._, .sub, .tmp1b, .sia(32, .src0, .sub_bit_size), ._, ._ }, // presumably subtracts 32 - bit size to rebase the count to the element width
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp1b, ._, ._ }, // store one result byte per element
+                        .{ ._, ._, .inc, .tmp0p, ._, ._, ._ }, // advance the index; inc does not touch carry, so the exit test uses the zero flag
+                        .{ ._, ._nz, .j, .@"0b", ._, ._, ._ }, // loop until the index reaches zero
+                    } },
+                }, .{
+                    .required_features = .{ .lzcnt, .slow_incdec, null, null }, // per-element lzcnt loop over word elements; steps the index with add rather than inc
+                    .src_constraints = .{ .{ .scalar_int = .word }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: loop index, stepped by 1 until the add carries out
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: current element, then its result
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -(length of src0) (confirm .sa/.sub_len)
+                        .{ .@"0:", ._, .movzx, .tmp1d, .memsia(.src0w, .@"2", .tmp0, .add_2_len), ._, ._ }, // loop head: zero-extend the current source word (index scaled by 2)
+                        .{ ._, ._, .@"and", .tmp1d, .sa(.src0, .add_umax), ._, ._ }, // mask the element; presumably keeps only its value bits (confirm .add_umax)
+                        .{ ._, ._, .lzcnt, .tmp1d, .tmp1d, ._, ._ }, // 32-bit leading-zero count
+                        .{ ._, ._, .sub, .tmp1b, .sia(32, .src0, .sub_bit_size), ._, ._ }, // presumably subtracts 32 - bit size to rebase the count to the element width
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp1b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets carry when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // loop while there was no carry
+                    } },
+                }, .{
+                    .required_features = .{ .lzcnt, null, null, null }, // per-element lzcnt loop over word elements; steps the index with inc
+                    .src_constraints = .{ .{ .scalar_int = .word }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: loop index, stepped by 1 until it reaches zero
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: current element, then its result
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -(length of src0) (confirm .sa/.sub_len)
+                        .{ .@"0:", ._, .movzx, .tmp1d, .memsia(.src0w, .@"2", .tmp0, .add_2_len), ._, ._ }, // loop head: zero-extend the current source word (index scaled by 2)
+                        .{ ._, ._, .@"and", .tmp1d, .sa(.src0, .add_umax), ._, ._ }, // mask the element; presumably keeps only its value bits (confirm .add_umax)
+                        .{ ._, ._, .lzcnt, .tmp1d, .tmp1d, ._, ._ }, // 32-bit leading-zero count
+                        .{ ._, ._, .sub, .tmp1b, .sia(32, .src0, .sub_bit_size), ._, ._ }, // presumably subtracts 32 - bit size to rebase the count to the element width
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp1b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .inc, .tmp0p, ._, ._, ._ }, // advance the index; inc does not touch carry, so the exit test uses the zero flag
+                        .{ ._, ._nz, .j, .@"0b", ._, ._, ._ }, // loop until the index reaches zero
+                    } },
+                }, .{
+                    .required_features = .{ .lzcnt, .slow_incdec, null, null }, // per-element lzcnt loop over dword elements; steps the index with add rather than inc
+                    .src_constraints = .{ .{ .scalar_int = .dword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: loop index, stepped by 1 until the add carries out
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: current element, then its result
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -(length of src0) (confirm .sa/.sub_len)
+                        .{ .@"0:", ._, .mov, .tmp1d, .memsia(.src0d, .@"4", .tmp0, .add_4_len), ._, ._ }, // loop head: load the current source dword (index scaled by 4)
+                        .{ ._, ._, .@"and", .tmp1d, .sa(.src0, .add_umax), ._, ._ }, // mask the element; presumably keeps only its value bits (confirm .add_umax)
+                        .{ ._, ._, .lzcnt, .tmp1d, .tmp1d, ._, ._ }, // 32-bit leading-zero count
+                        .{ ._, ._, .sub, .tmp1b, .sia(32, .src0, .sub_bit_size), ._, ._ }, // presumably subtracts 32 - bit size to rebase the count to the element width
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp1b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets carry when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // loop while there was no carry
+                    } },
+                }, .{
+                    .required_features = .{ .lzcnt, null, null, null }, // per-element lzcnt loop over dword elements; steps the index with inc
+                    .src_constraints = .{ .{ .scalar_int = .dword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: loop index, stepped by 1 until it reaches zero
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: current element, then its result
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -(length of src0) (confirm .sa/.sub_len)
+                        .{ .@"0:", ._, .mov, .tmp1d, .memsia(.src0d, .@"4", .tmp0, .add_4_len), ._, ._ }, // loop head: load the current source dword (index scaled by 4)
+                        .{ ._, ._, .@"and", .tmp1d, .sa(.src0, .add_umax), ._, ._ }, // mask the element; presumably keeps only its value bits (confirm .add_umax)
+                        .{ ._, ._, .lzcnt, .tmp1d, .tmp1d, ._, ._ }, // 32-bit leading-zero count
+                        .{ ._, ._, .sub, .tmp1b, .sia(32, .src0, .sub_bit_size), ._, ._ }, // presumably subtracts 32 - bit size to rebase the count to the element width
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp1b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .inc, .tmp0p, ._, ._, ._ }, // advance the index; inc does not touch carry, so the exit test uses the zero flag
+                        .{ ._, ._nz, .j, .@"0b", ._, ._, ._ }, // loop until the index reaches zero
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .lzcnt, .slow_incdec, null }, // per-element lzcnt loop over qword elements; steps the index with add rather than inc
+                    .src_constraints = .{ .{ .scalar_int = .qword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: loop index, stepped by 1 until the add carries out
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp1: mask, masked element, then its result
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -(length of src0) (confirm .sa/.sub_len)
+                        .{ .@"0:", ._, .mov, .tmp1q, .ua(.src0, .add_umax), ._, ._ }, // loop head: reload the element mask each iteration (tmp1 is overwritten below)
+                        .{ ._, ._, .@"and", .tmp1q, .memsia(.src0q, .@"8", .tmp0, .add_8_len), ._, ._ }, // tmp1 = mask & current source qword (index scaled by 8)
+                        .{ ._, ._, .lzcnt, .tmp1q, .tmp1q, ._, ._ }, // 64-bit leading-zero count
+                        .{ ._, ._, .sub, .tmp1b, .sia(64, .src0, .sub_bit_size), ._, ._ }, // presumably subtracts 64 - bit size to rebase the count to the element width
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp1b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets carry when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // loop while there was no carry
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .lzcnt, null, null }, // per-element lzcnt loop over qword elements; steps the index with inc
+                    .src_constraints = .{ .{ .scalar_int = .qword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: loop index, stepped by 1 until it reaches zero
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp1: mask, masked element, then its result
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -(length of src0) (confirm .sa/.sub_len)
+                        .{ .@"0:", ._, .mov, .tmp1q, .ua(.src0, .add_umax), ._, ._ }, // loop head: reload the element mask each iteration (tmp1 is overwritten below)
+                        .{ ._, ._, .@"and", .tmp1q, .memsia(.src0q, .@"8", .tmp0, .add_8_len), ._, ._ }, // tmp1 = mask & current source qword (index scaled by 8)
+                        .{ ._, ._, .lzcnt, .tmp1q, .tmp1q, ._, ._ }, // 64-bit leading-zero count
+                        .{ ._, ._, .sub, .tmp1b, .sia(64, .src0, .sub_bit_size), ._, ._ }, // presumably subtracts 64 - bit size to rebase the count to the element width
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp1b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .inc, .tmp0p, ._, ._, ._ }, // advance the index; inc does not touch carry, so the exit test uses the zero flag
+                        .{ ._, ._nz, .j, .@"0b", ._, ._, ._ }, // loop until the index reaches zero
+                    } },
+                }, .{
+                    .required_features = .{ .cmov, .bsf_bsr_0_clobbers_result, .slow_incdec, null }, // per-element bsr loop over byte elements; cmov patches the zero case, index stepped with add
+                    .src_constraints = .{ .{ .scalar_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: loop index, stepped by 1 until the add carries out
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: constant 0xff used as the bit index of a zero element
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: current element, then its highest set bit index
+                        .{ .type = .u8, .kind = .{ .rc = .general_purpose } }, // tmp3: result byte
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -(length of src0) (confirm .sa/.sub_len)
+                        .{ ._, ._, .mov, .tmp1d, .si(0xff), ._, ._ }, // loop-invariant substitute index for zero elements
+                        .{ .@"0:", ._, .movzx, .tmp2d, .memia(.src0b, .tmp0, .add_len), ._, ._ }, // loop head: zero-extend the current source byte
+                        .{ ._, ._, .@"and", .tmp2d, .sa(.src0, .add_umax), ._, ._ }, // mask the element; presumably keeps only its value bits (confirm .add_umax)
+                        .{ ._, ._r, .bs, .tmp2d, .tmp2d, ._, ._ }, // index of the highest set bit; sets the zero flag for a zero element
+                        .{ ._, ._z, .cmov, .tmp2d, .tmp1d, ._, ._ }, // zero element: use 0xff instead of whatever bsr left in tmp2
+                        .{ ._, ._, .mov, .tmp3b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // presumably tmp3 = bit size - 1 (confirm .sia/.add_bit_size)
+                        .{ ._, ._, .sub, .tmp3b, .tmp2b, ._, ._ }, // tmp3 -= index; subtracting 0xff wraps in a byte to tmp3 + 1
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp3b, ._, ._ }, // store one result byte per element
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets carry when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // loop while there was no carry
+                    } },
+                }, .{
+                    .required_features = .{ .cmov, .bsf_bsr_0_clobbers_result, null, null }, // per-element bsr loop over byte elements; cmov patches the zero case, index stepped with inc
+                    .src_constraints = .{ .{ .scalar_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: loop index, stepped by 1 until it reaches zero
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: constant 0xff used as the bit index of a zero element
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: current element, then its highest set bit index
+                        .{ .type = .u8, .kind = .{ .rc = .general_purpose } }, // tmp3: result byte
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -(length of src0) (confirm .sa/.sub_len)
+                        .{ ._, ._, .mov, .tmp1d, .si(0xff), ._, ._ }, // loop-invariant substitute index for zero elements
+                        .{ .@"0:", ._, .movzx, .tmp2d, .memia(.src0b, .tmp0, .add_len), ._, ._ }, // loop head: zero-extend the current source byte
+                        .{ ._, ._, .@"and", .tmp2d, .sa(.src0, .add_umax), ._, ._ }, // mask the element; presumably keeps only its value bits (confirm .add_umax)
+                        .{ ._, ._r, .bs, .tmp2d, .tmp2d, ._, ._ }, // index of the highest set bit; sets the zero flag for a zero element
+                        .{ ._, ._z, .cmov, .tmp2d, .tmp1d, ._, ._ }, // zero element: use 0xff instead of whatever bsr left in tmp2
+                        .{ ._, ._, .mov, .tmp3b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // presumably tmp3 = bit size - 1 (confirm .sia/.add_bit_size)
+                        .{ ._, ._, .sub, .tmp3b, .tmp2b, ._, ._ }, // tmp3 -= index; subtracting 0xff wraps in a byte to tmp3 + 1
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp3b, ._, ._ }, // store one result byte per element
+                        .{ ._, ._, .inc, .tmp0p, ._, ._, ._ }, // advance the index; inc does not touch carry, so the exit test uses the zero flag
+                        .{ ._, ._nz, .j, .@"0b", ._, ._, ._ }, // loop until the index reaches zero
+                    } },
+                }, .{
+                    .required_features = .{ .bsf_bsr_0_clobbers_result, .slow_incdec, null, null }, // byte elements, zero input handled with a branch, add used for stepping; NOTE(review): looks like a clz loop (per commit title) - confirm
+                    .src_constraints = .{ .{ .scalar_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0 (presumably, by position): loop index
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: loaded element, then bit-scan result
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: result; only its low byte is used
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -len so the index counts up to zero - TODO confirm `.sub_len`
+                        .{ .@"0:", ._, .movzx, .tmp1d, .memia(.src0b, .tmp0, .add_len), ._, ._ }, // loop head: load the next source byte, zero-extended
+                        .{ ._, ._, .@"and", .tmp1d, .sa(.src0, .add_umax), ._, ._ }, // presumably clears bits above the element's bit width - TODO confirm `.add_umax`
+                        .{ ._, ._r, .bs, .tmp1d, .tmp1d, ._, ._ }, // presumably `bsr`: index of the highest set bit; ZF set when the input is zero
+                        .{ ._, ._, .mov, .tmp2b, .sa(.src0, .add_bit_size), ._, ._ }, // presumably tmp2 = bit_size, the zero-input result; mov keeps the scan's flags
+                        .{ ._, ._z, .j, .@"1f", ._, ._, ._ }, // zero input: skip the subtraction and store the preloaded value
+                        .{ ._, ._c, .st, ._, ._, ._, ._ }, // presumably `stc`: set CF so the sbb below subtracts one extra
+                        .{ ._, ._, .sbb, .tmp2b, .tmp1b, ._, ._ }, // tmp2 = tmp2 - scan index - 1
+                        .{ .@"1:", ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp2b, ._, ._ }, // store one result byte at the same index
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // presumably `jnc`: repeat until that carry occurs
+                    } },
+                }, .{
+                    .required_features = .{ .bsf_bsr_0_clobbers_result, null, null, null }, // byte elements, zero input handled with a branch; NOTE(review): looks like a clz loop (per commit title) - confirm
+                    .src_constraints = .{ .{ .scalar_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0 (presumably, by position): loop index
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: loaded element, then bit-scan result
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: result; only its low byte is used
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -len so the index counts up to zero - TODO confirm `.sub_len`
+                        .{ .@"0:", ._, .movzx, .tmp1d, .memia(.src0b, .tmp0, .add_len), ._, ._ }, // loop head: load the next source byte, zero-extended
+                        .{ ._, ._, .@"and", .tmp1d, .sa(.src0, .add_umax), ._, ._ }, // presumably clears bits above the element's bit width - TODO confirm `.add_umax`
+                        .{ ._, ._r, .bs, .tmp1d, .tmp1d, ._, ._ }, // presumably `bsr`: index of the highest set bit; ZF set when the input is zero
+                        .{ ._, ._, .mov, .tmp2b, .sa(.src0, .add_bit_size), ._, ._ }, // presumably tmp2 = bit_size, the zero-input result; mov keeps the scan's flags
+                        .{ ._, ._z, .j, .@"1f", ._, ._, ._ }, // zero input: skip the subtraction and store the preloaded value
+                        .{ ._, ._c, .st, ._, ._, ._, ._ }, // presumably `stc`: set CF so the sbb below subtracts one extra
+                        .{ ._, ._, .sbb, .tmp2b, .tmp1b, ._, ._ }, // tmp2 = tmp2 - scan index - 1
+                        .{ .@"1:", ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp2b, ._, ._ }, // store one result byte at the same index
+                        .{ ._, ._, .inc, .tmp0p, ._, ._, ._ }, // advance the index
+                        .{ ._, ._nz, .j, .@"0b", ._, ._, ._ }, // presumably `jnz`: repeat until the index reaches zero
+                    } },
+                }, .{
+                    .required_features = .{ .slow_incdec, null, null, null }, // byte elements, branch-free with a preloaded scan destination, add used for stepping; NOTE(review): looks like a clz loop (per commit title) - confirm
+                    .src_constraints = .{ .{ .scalar_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0 (presumably, by position): loop index
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: loaded element, later reused for the result byte
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: bit-scan destination, preloaded with 0xff
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -len so the index counts up to zero - TODO confirm `.sub_len`
+                        .{ .@"0:", ._, .movzx, .tmp1d, .memia(.src0b, .tmp0, .add_len), ._, ._ }, // loop head: load the next source byte, zero-extended
+                        .{ ._, ._, .@"and", .tmp1d, .sa(.src0, .add_umax), ._, ._ }, // presumably clears bits above the element's bit width - TODO confirm `.add_umax`
+                        .{ ._, ._, .mov, .tmp2d, .si(0xff), ._, ._ }, // preload 0xff; NOTE(review): relies on the scan leaving its destination untouched for a zero input - confirm
+                        .{ ._, ._r, .bs, .tmp2d, .tmp1d, ._, ._ }, // presumably `bsr`: tmp2 = index of the highest set bit of tmp1
+                        .{ ._, ._, .mov, .tmp1b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // presumably tmp1 = bit_size - 1
+                        .{ ._, ._, .sub, .tmp1b, .tmp2b, ._, ._ }, // tmp1 -= scan index; subtracting a leftover 0xff wraps to +1 in a byte
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp1b, ._, ._ }, // store one result byte at the same index
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // presumably `jnc`: repeat until that carry occurs
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .scalar_int = .byte }, .any }, // byte elements, no feature requirements, branch-free with a preloaded scan destination; NOTE(review): looks like a clz loop (per commit title) - confirm
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0 (presumably, by position): loop index
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: loaded element, later reused for the result byte
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: bit-scan destination, preloaded with 0xff
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -len so the index counts up to zero - TODO confirm `.sub_len`
+                        .{ .@"0:", ._, .movzx, .tmp1d, .memia(.src0b, .tmp0, .add_len), ._, ._ }, // loop head: load the next source byte, zero-extended
+                        .{ ._, ._, .@"and", .tmp1d, .sa(.src0, .add_umax), ._, ._ }, // presumably clears bits above the element's bit width - TODO confirm `.add_umax`
+                        .{ ._, ._, .mov, .tmp2d, .si(0xff), ._, ._ }, // preload 0xff; NOTE(review): relies on the scan leaving its destination untouched for a zero input - confirm
+                        .{ ._, ._r, .bs, .tmp2d, .tmp1d, ._, ._ }, // presumably `bsr`: tmp2 = index of the highest set bit of tmp1
+                        .{ ._, ._, .mov, .tmp1b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // presumably tmp1 = bit_size - 1
+                        .{ ._, ._, .sub, .tmp1b, .tmp2b, ._, ._ }, // tmp1 -= scan index; subtracting a leftover 0xff wraps to +1 in a byte
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp1b, ._, ._ }, // store one result byte at the same index
+                        .{ ._, ._, .inc, .tmp0p, ._, ._, ._ }, // advance the index
+                        .{ ._, ._nz, .j, .@"0b", ._, ._, ._ }, // presumably `jnz`: repeat until the index reaches zero
+                    } },
+                }, .{
+                    .required_features = .{ .cmov, .bsf_bsr_0_clobbers_result, .slow_incdec, null }, // word elements, zero input fixed up with cmov, add used for stepping; NOTE(review): looks like a clz loop (per commit title) - confirm
+                    .src_constraints = .{ .{ .scalar_int = .word }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0 (presumably, by position): loop index
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: holds the constant 0xff
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: loaded element, then bit-scan result
+                        .{ .type = .u8, .kind = .{ .rc = .general_purpose } }, // tmp3: byte written to the destination
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -len so the index counts up to zero - TODO confirm `.sub_len`
+                        .{ ._, ._, .mov, .tmp1d, .si(0xff), ._, ._ }, // loop-invariant fallback used when the scan reports a zero input
+                        .{ .@"0:", ._, .movzx, .tmp2d, .memsia(.src0w, .@"2", .tmp0, .add_2_len), ._, ._ }, // loop head: load the next source word (index scaled by 2), zero-extended
+                        .{ ._, ._, .@"and", .tmp2d, .sa(.src0, .add_umax), ._, ._ }, // presumably clears bits above the element's bit width - TODO confirm `.add_umax`
+                        .{ ._, ._r, .bs, .tmp2d, .tmp2d, ._, ._ }, // presumably `bsr`: index of the highest set bit; ZF set when the input is zero
+                        .{ ._, ._z, .cmov, .tmp2d, .tmp1d, ._, ._ }, // zero input: replace the scan result with 0xff
+                        .{ ._, ._, .mov, .tmp3b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // presumably tmp3 = bit_size - 1
+                        .{ ._, ._, .sub, .tmp3b, .tmp2b, ._, ._ }, // tmp3 -= scan index; subtracting the 0xff fallback wraps to +1 in a byte
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp3b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // presumably `jnc`: repeat until that carry occurs
+                    } },
+                }, .{
+                    .required_features = .{ .cmov, .bsf_bsr_0_clobbers_result, null, null }, // word elements, zero input fixed up with cmov; NOTE(review): looks like a clz loop (per commit title) - confirm
+                    .src_constraints = .{ .{ .scalar_int = .word }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0 (presumably, by position): loop index
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: holds the constant 0xff
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: loaded element, then bit-scan result
+                        .{ .type = .u8, .kind = .{ .rc = .general_purpose } }, // tmp3: byte written to the destination
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -len so the index counts up to zero - TODO confirm `.sub_len`
+                        .{ ._, ._, .mov, .tmp1d, .si(0xff), ._, ._ }, // loop-invariant fallback used when the scan reports a zero input
+                        .{ .@"0:", ._, .movzx, .tmp2d, .memsia(.src0w, .@"2", .tmp0, .add_2_len), ._, ._ }, // loop head: load the next source word (index scaled by 2), zero-extended
+                        .{ ._, ._, .@"and", .tmp2d, .sa(.src0, .add_umax), ._, ._ }, // presumably clears bits above the element's bit width - TODO confirm `.add_umax`
+                        .{ ._, ._r, .bs, .tmp2d, .tmp2d, ._, ._ }, // presumably `bsr`: index of the highest set bit; ZF set when the input is zero
+                        .{ ._, ._z, .cmov, .tmp2d, .tmp1d, ._, ._ }, // zero input: replace the scan result with 0xff
+                        .{ ._, ._, .mov, .tmp3b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // presumably tmp3 = bit_size - 1
+                        .{ ._, ._, .sub, .tmp3b, .tmp2b, ._, ._ }, // tmp3 -= scan index; subtracting the 0xff fallback wraps to +1 in a byte
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp3b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .inc, .tmp0p, ._, ._, ._ }, // advance the index
+                        .{ ._, ._nz, .j, .@"0b", ._, ._, ._ }, // presumably `jnz`: repeat until the index reaches zero
+                    } },
+                }, .{
+                    .required_features = .{ .bsf_bsr_0_clobbers_result, .slow_incdec, null, null }, // word elements, zero input handled with a branch, add used for stepping; NOTE(review): looks like a clz loop (per commit title) - confirm
+                    .src_constraints = .{ .{ .scalar_int = .word }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0 (presumably, by position): loop index
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: loaded element, then bit-scan result
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: result; only its low byte is used
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -len so the index counts up to zero - TODO confirm `.sub_len`
+                        .{ .@"0:", ._, .movzx, .tmp1d, .memsia(.src0w, .@"2", .tmp0, .add_2_len), ._, ._ }, // loop head: load the next source word (index scaled by 2), zero-extended
+                        .{ ._, ._, .@"and", .tmp1d, .sa(.src0, .add_umax), ._, ._ }, // presumably clears bits above the element's bit width - TODO confirm `.add_umax`
+                        .{ ._, ._r, .bs, .tmp1d, .tmp1d, ._, ._ }, // presumably `bsr`: index of the highest set bit; ZF set when the input is zero
+                        .{ ._, ._, .mov, .tmp2b, .sa(.src0, .add_bit_size), ._, ._ }, // presumably tmp2 = bit_size, the zero-input result; mov keeps the scan's flags
+                        .{ ._, ._z, .j, .@"1f", ._, ._, ._ }, // zero input: skip the subtraction and store the preloaded value
+                        .{ ._, ._c, .st, ._, ._, ._, ._ }, // presumably `stc`: set CF so the sbb below subtracts one extra
+                        .{ ._, ._, .sbb, .tmp2b, .tmp1b, ._, ._ }, // tmp2 = tmp2 - scan index - 1
+                        .{ .@"1:", ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp2b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // presumably `jnc`: repeat until that carry occurs
+                    } },
+                }, .{
+                    .required_features = .{ .bsf_bsr_0_clobbers_result, null, null, null }, // word elements, zero input handled with a branch; NOTE(review): looks like a clz loop (per commit title) - confirm
+                    .src_constraints = .{ .{ .scalar_int = .word }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0 (presumably, by position): loop index
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: loaded element, then bit-scan result
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: result; only its low byte is used
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -len so the index counts up to zero - TODO confirm `.sub_len`
+                        .{ .@"0:", ._, .movzx, .tmp1d, .memsia(.src0w, .@"2", .tmp0, .add_2_len), ._, ._ }, // loop head: load the next source word (index scaled by 2), zero-extended
+                        .{ ._, ._, .@"and", .tmp1d, .sa(.src0, .add_umax), ._, ._ }, // presumably clears bits above the element's bit width - TODO confirm `.add_umax`
+                        .{ ._, ._r, .bs, .tmp1d, .tmp1d, ._, ._ }, // presumably `bsr`: index of the highest set bit; ZF set when the input is zero
+                        .{ ._, ._, .mov, .tmp2b, .sa(.src0, .add_bit_size), ._, ._ }, // presumably tmp2 = bit_size, the zero-input result; mov keeps the scan's flags
+                        .{ ._, ._z, .j, .@"1f", ._, ._, ._ }, // zero input: skip the subtraction and store the preloaded value
+                        .{ ._, ._c, .st, ._, ._, ._, ._ }, // presumably `stc`: set CF so the sbb below subtracts one extra
+                        .{ ._, ._, .sbb, .tmp2b, .tmp1b, ._, ._ }, // tmp2 = tmp2 - scan index - 1
+                        .{ .@"1:", ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp2b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .inc, .tmp0p, ._, ._, ._ }, // advance the index
+                        .{ ._, ._nz, .j, .@"0b", ._, ._, ._ }, // presumably `jnz`: repeat until the index reaches zero
+                    } },
+                }, .{
+                    .required_features = .{ .slow_incdec, null, null, null }, // word elements, branch-free with a preloaded scan destination, add used for stepping; NOTE(review): looks like a clz loop (per commit title) - confirm
+                    .src_constraints = .{ .{ .scalar_int = .word }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0 (presumably, by position): loop index
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: loaded element, later reused for the result byte
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: bit-scan destination, preloaded with 0xff
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -len so the index counts up to zero - TODO confirm `.sub_len`
+                        .{ .@"0:", ._, .movzx, .tmp1d, .memsia(.src0w, .@"2", .tmp0, .add_2_len), ._, ._ }, // loop head: load the next source word (index scaled by 2), zero-extended
+                        .{ ._, ._, .@"and", .tmp1d, .sa(.src0, .add_umax), ._, ._ }, // presumably clears bits above the element's bit width - TODO confirm `.add_umax`
+                        .{ ._, ._, .mov, .tmp2d, .si(0xff), ._, ._ }, // preload 0xff; NOTE(review): relies on the scan leaving its destination untouched for a zero input - confirm
+                        .{ ._, ._r, .bs, .tmp2d, .tmp1d, ._, ._ }, // presumably `bsr`: tmp2 = index of the highest set bit of tmp1
+                        .{ ._, ._, .mov, .tmp1b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // presumably tmp1 = bit_size - 1
+                        .{ ._, ._, .sub, .tmp1b, .tmp2b, ._, ._ }, // tmp1 -= scan index; subtracting a leftover 0xff wraps to +1 in a byte
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp1b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // presumably `jnc`: repeat until that carry occurs
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .scalar_int = .word }, .any }, // word elements, no feature requirements, branch-free with a preloaded scan destination; NOTE(review): looks like a clz loop (per commit title) - confirm
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0 (presumably, by position): loop index
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: loaded element, later reused for the result byte
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: bit-scan destination, preloaded with 0xff
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -len so the index counts up to zero - TODO confirm `.sub_len`
+                        .{ .@"0:", ._, .movzx, .tmp1d, .memsia(.src0w, .@"2", .tmp0, .add_2_len), ._, ._ }, // loop head: load the next source word (index scaled by 2), zero-extended
+                        .{ ._, ._, .@"and", .tmp1d, .sa(.src0, .add_umax), ._, ._ }, // presumably clears bits above the element's bit width - TODO confirm `.add_umax`
+                        .{ ._, ._, .mov, .tmp2d, .si(0xff), ._, ._ }, // preload 0xff; NOTE(review): relies on the scan leaving its destination untouched for a zero input - confirm
+                        .{ ._, ._r, .bs, .tmp2d, .tmp1d, ._, ._ }, // presumably `bsr`: tmp2 = index of the highest set bit of tmp1
+                        .{ ._, ._, .mov, .tmp1b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // presumably tmp1 = bit_size - 1
+                        .{ ._, ._, .sub, .tmp1b, .tmp2b, ._, ._ }, // tmp1 -= scan index; subtracting a leftover 0xff wraps to +1 in a byte
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp1b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .inc, .tmp0p, ._, ._, ._ }, // advance the index
+                        .{ ._, ._nz, .j, .@"0b", ._, ._, ._ }, // presumably `jnz`: repeat until the index reaches zero
+                    } },
+                }, .{
+                    .required_features = .{ .cmov, .bsf_bsr_0_clobbers_result, .slow_incdec, null }, // dword elements, zero input fixed up with cmov, add used for stepping; NOTE(review): looks like a clz loop (per commit title) - confirm
+                    .src_constraints = .{ .{ .scalar_int = .dword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0 (presumably, by position): loop index
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: holds the constant 0xff
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: loaded element, then bit-scan result
+                        .{ .type = .u8, .kind = .{ .rc = .general_purpose } }, // tmp3: byte written to the destination
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -len so the index counts up to zero - TODO confirm `.sub_len`
+                        .{ ._, ._, .mov, .tmp1d, .si(0xff), ._, ._ }, // loop-invariant fallback used when the scan reports a zero input
+                        .{ .@"0:", ._, .mov, .tmp2d, .memsia(.src0d, .@"4", .tmp0, .add_4_len), ._, ._ }, // loop head: load the next source dword (index scaled by 4)
+                        .{ ._, ._, .@"and", .tmp2d, .sa(.src0, .add_umax), ._, ._ }, // presumably clears bits above the element's bit width - TODO confirm `.add_umax` fits `.sa` for 32-bit elements
+                        .{ ._, ._r, .bs, .tmp2d, .tmp2d, ._, ._ }, // presumably `bsr`: index of the highest set bit; ZF set when the input is zero
+                        .{ ._, ._z, .cmov, .tmp2d, .tmp1d, ._, ._ }, // zero input: replace the scan result with 0xff
+                        .{ ._, ._, .mov, .tmp3b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // presumably tmp3 = bit_size - 1
+                        .{ ._, ._, .sub, .tmp3b, .tmp2b, ._, ._ }, // tmp3 -= scan index; subtracting the 0xff fallback wraps to +1 in a byte
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp3b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // presumably `jnc`: repeat until that carry occurs
+                    } },
+                }, .{
+                    .required_features = .{ .cmov, .bsf_bsr_0_clobbers_result, null, null }, // dword elements, zero input fixed up with cmov; NOTE(review): looks like a clz loop (per commit title) - confirm
+                    .src_constraints = .{ .{ .scalar_int = .dword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0 (presumably, by position): loop index
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: holds the constant 0xff
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: loaded element, then bit-scan result
+                        .{ .type = .u8, .kind = .{ .rc = .general_purpose } }, // tmp3: byte written to the destination
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -len so the index counts up to zero - TODO confirm `.sub_len`
+                        .{ ._, ._, .mov, .tmp1d, .si(0xff), ._, ._ }, // loop-invariant fallback used when the scan reports a zero input
+                        .{ .@"0:", ._, .mov, .tmp2d, .memsia(.src0d, .@"4", .tmp0, .add_4_len), ._, ._ }, // loop head: load the next source dword (index scaled by 4)
+                        .{ ._, ._, .@"and", .tmp2d, .sa(.src0, .add_umax), ._, ._ }, // presumably clears bits above the element's bit width - TODO confirm `.add_umax` fits `.sa` for 32-bit elements
+                        .{ ._, ._r, .bs, .tmp2d, .tmp2d, ._, ._ }, // presumably `bsr`: index of the highest set bit; ZF set when the input is zero
+                        .{ ._, ._z, .cmov, .tmp2d, .tmp1d, ._, ._ }, // zero input: replace the scan result with 0xff
+                        .{ ._, ._, .mov, .tmp3b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // presumably tmp3 = bit_size - 1
+                        .{ ._, ._, .sub, .tmp3b, .tmp2b, ._, ._ }, // tmp3 -= scan index; subtracting the 0xff fallback wraps to +1 in a byte
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp3b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .inc, .tmp0p, ._, ._, ._ }, // advance the index
+                        .{ ._, ._nz, .j, .@"0b", ._, ._, ._ }, // presumably `jnz`: repeat until the index reaches zero
+                    } },
+                }, .{
+                    .required_features = .{ .bsf_bsr_0_clobbers_result, .slow_incdec, null, null }, // dword elements, zero input handled with a branch, add used for stepping; NOTE(review): looks like a clz loop (per commit title) - confirm
+                    .src_constraints = .{ .{ .scalar_int = .dword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0 (presumably, by position): loop index
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: loaded element, then bit-scan result
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: result; only its low byte is used
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -len so the index counts up to zero - TODO confirm `.sub_len`
+                        .{ .@"0:", ._, .mov, .tmp1d, .memsia(.src0d, .@"4", .tmp0, .add_4_len), ._, ._ }, // loop head: load the next source dword (index scaled by 4)
+                        .{ ._, ._, .@"and", .tmp1d, .sa(.src0, .add_umax), ._, ._ }, // presumably clears bits above the element's bit width - TODO confirm `.add_umax` fits `.sa` for 32-bit elements
+                        .{ ._, ._r, .bs, .tmp1d, .tmp1d, ._, ._ }, // presumably `bsr`: index of the highest set bit; ZF set when the input is zero
+                        .{ ._, ._, .mov, .tmp2b, .sa(.src0, .add_bit_size), ._, ._ }, // presumably tmp2 = bit_size, the zero-input result; mov keeps the scan's flags
+                        .{ ._, ._z, .j, .@"1f", ._, ._, ._ }, // zero input: skip the subtraction and store the preloaded value
+                        .{ ._, ._c, .st, ._, ._, ._, ._ }, // presumably `stc`: set CF so the sbb below subtracts one extra
+                        .{ ._, ._, .sbb, .tmp2b, .tmp1b, ._, ._ }, // tmp2 = tmp2 - scan index - 1
+                        .{ .@"1:", ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp2b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // presumably `jnc`: repeat until that carry occurs
+                    } },
+                }, .{
+                    .required_features = .{ .bsf_bsr_0_clobbers_result, null, null, null }, // dword elements, zero input handled with a branch; NOTE(review): looks like a clz loop (per commit title) - confirm
+                    .src_constraints = .{ .{ .scalar_int = .dword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0 (presumably, by position): loop index
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: loaded element, then bit-scan result
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: result; only its low byte is used
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -len so the index counts up to zero - TODO confirm `.sub_len`
+                        .{ .@"0:", ._, .mov, .tmp1d, .memsia(.src0d, .@"4", .tmp0, .add_4_len), ._, ._ }, // loop head: load the next source dword (index scaled by 4)
+                        .{ ._, ._, .@"and", .tmp1d, .sa(.src0, .add_umax), ._, ._ }, // presumably clears bits above the element's bit width - TODO confirm `.add_umax` fits `.sa` for 32-bit elements
+                        .{ ._, ._r, .bs, .tmp1d, .tmp1d, ._, ._ }, // presumably `bsr`: index of the highest set bit; ZF set when the input is zero
+                        .{ ._, ._, .mov, .tmp2b, .sa(.src0, .add_bit_size), ._, ._ }, // presumably tmp2 = bit_size, the zero-input result; mov keeps the scan's flags
+                        .{ ._, ._z, .j, .@"1f", ._, ._, ._ }, // zero input: skip the subtraction and store the preloaded value
+                        .{ ._, ._c, .st, ._, ._, ._, ._ }, // presumably `stc`: set CF so the sbb below subtracts one extra
+                        .{ ._, ._, .sbb, .tmp2b, .tmp1b, ._, ._ }, // tmp2 = tmp2 - scan index - 1
+                        .{ .@"1:", ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp2b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .inc, .tmp0p, ._, ._, ._ }, // advance the index
+                        .{ ._, ._nz, .j, .@"0b", ._, ._, ._ }, // presumably `jnz`: repeat until the index reaches zero
+                    } },
+                }, .{
+                    .required_features = .{ .slow_incdec, null, null, null }, // dword elements, branch-free with a preloaded scan destination, add used for stepping; NOTE(review): looks like a clz loop (per commit title) - confirm
+                    .src_constraints = .{ .{ .scalar_int = .dword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0 (presumably, by position): loop index
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: loaded element, later reused for the result byte
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: bit-scan destination, preloaded with 0xff
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -len so the index counts up to zero - TODO confirm `.sub_len`
+                        .{ .@"0:", ._, .mov, .tmp1d, .memsia(.src0d, .@"4", .tmp0, .add_4_len), ._, ._ }, // loop head: load the next source dword (index scaled by 4)
+                        .{ ._, ._, .@"and", .tmp1d, .sa(.src0, .add_umax), ._, ._ }, // presumably clears bits above the element's bit width - TODO confirm `.add_umax` fits `.sa` for 32-bit elements
+                        .{ ._, ._, .mov, .tmp2d, .si(0xff), ._, ._ }, // preload 0xff; NOTE(review): relies on the scan leaving its destination untouched for a zero input - confirm
+                        .{ ._, ._r, .bs, .tmp2d, .tmp1d, ._, ._ }, // presumably `bsr`: tmp2 = index of the highest set bit of tmp1
+                        .{ ._, ._, .mov, .tmp1b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // presumably tmp1 = bit_size - 1
+                        .{ ._, ._, .sub, .tmp1b, .tmp2b, ._, ._ }, // tmp1 -= scan index; subtracting a leftover 0xff wraps to +1 in a byte
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp1b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // presumably `jnc`: repeat until that carry occurs
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .scalar_int = .dword }, .any }, // dword elements, no feature requirements, branch-free with a preloaded scan destination; NOTE(review): looks like a clz loop (per commit title) - confirm
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0 (presumably, by position): loop index
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: loaded element, later reused for the result byte
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: bit-scan destination, preloaded with 0xff
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -len so the index counts up to zero - TODO confirm `.sub_len`
+                        .{ .@"0:", ._, .mov, .tmp1d, .memsia(.src0d, .@"4", .tmp0, .add_4_len), ._, ._ }, // loop head: load the next source dword (index scaled by 4)
+                        .{ ._, ._, .@"and", .tmp1d, .sa(.src0, .add_umax), ._, ._ }, // presumably clears bits above the element's bit width - TODO confirm `.add_umax` fits `.sa` for 32-bit elements
+                        .{ ._, ._, .mov, .tmp2d, .si(0xff), ._, ._ }, // preload 0xff; NOTE(review): relies on the scan leaving its destination untouched for a zero input - confirm
+                        .{ ._, ._r, .bs, .tmp2d, .tmp1d, ._, ._ }, // presumably `bsr`: tmp2 = index of the highest set bit of tmp1
+                        .{ ._, ._, .mov, .tmp1b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // presumably tmp1 = bit_size - 1
+                        .{ ._, ._, .sub, .tmp1b, .tmp2b, ._, ._ }, // tmp1 -= scan index; subtracting a leftover 0xff wraps to +1 in a byte
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp1b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .inc, .tmp0p, ._, ._, ._ }, // advance the index
+                        .{ ._, ._nz, .j, .@"0b", ._, ._, ._ }, // presumably `jnz`: repeat until the index reaches zero
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .cmov, .bsf_bsr_0_clobbers_result, .slow_incdec }, // qword elements, zero input fixed up with cmov, add used for stepping; NOTE(review): looks like a clz loop (per commit title) - confirm
+                    .src_constraints = .{ .{ .scalar_int = .qword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0 (presumably, by position): loop index
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: holds the constant 0xff
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp2: mask, masked element, then bit-scan result
+                        .{ .type = .u8, .kind = .{ .rc = .general_purpose } }, // tmp3: byte written to the destination
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -len so the index counts up to zero - TODO confirm `.sub_len`
+                        .{ ._, ._, .mov, .tmp1d, .si(0xff), ._, ._ }, // loop-invariant fallback used when the scan reports a zero input
+                        .{ .@"0:", ._, .mov, .tmp2q, .ua(.src0, .add_umax), ._, ._ }, // loop head: presumably loads the element's all-ones mask into a 64-bit register - TODO confirm `.add_umax`
+                        .{ ._, ._, .@"and", .tmp2q, .memsia(.src0q, .@"8", .tmp0, .add_8_len), ._, ._ }, // tmp2 = mask & next source qword (index scaled by 8)
+                        .{ ._, ._r, .bs, .tmp2q, .tmp2q, ._, ._ }, // presumably `bsr`: index of the highest set bit; ZF set when the input is zero
+                        .{ ._, ._z, .cmov, .tmp2d, .tmp1d, ._, ._ }, // zero input: replace the scan result with 0xff
+                        .{ ._, ._, .mov, .tmp3b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // presumably tmp3 = bit_size - 1
+                        .{ ._, ._, .sub, .tmp3b, .tmp2b, ._, ._ }, // tmp3 -= scan index; subtracting the 0xff fallback wraps to +1 in a byte
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp3b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // presumably `jnc`: repeat until that carry occurs
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .cmov, .bsf_bsr_0_clobbers_result, null }, // qword elements, zero input fixed up with cmov; NOTE(review): looks like a clz loop (per commit title) - confirm
+                    .src_constraints = .{ .{ .scalar_int = .qword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0 (presumably, by position): loop index
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: holds the constant 0xff
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp2: mask, masked element, then bit-scan result
+                        .{ .type = .u8, .kind = .{ .rc = .general_purpose } }, // tmp3: byte written to the destination
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -len so the index counts up to zero - TODO confirm `.sub_len`
+                        .{ ._, ._, .mov, .tmp1d, .si(0xff), ._, ._ }, // loop-invariant fallback used when the scan reports a zero input
+                        .{ .@"0:", ._, .mov, .tmp2q, .ua(.src0, .add_umax), ._, ._ }, // loop head: presumably loads the element's all-ones mask into a 64-bit register - TODO confirm `.add_umax`
+                        .{ ._, ._, .@"and", .tmp2q, .memsia(.src0q, .@"8", .tmp0, .add_8_len), ._, ._ }, // tmp2 = mask & next source qword (index scaled by 8)
+                        .{ ._, ._r, .bs, .tmp2q, .tmp2q, ._, ._ }, // presumably `bsr`: index of the highest set bit; ZF set when the input is zero
+                        .{ ._, ._z, .cmov, .tmp2d, .tmp1d, ._, ._ }, // zero input: replace the scan result with 0xff
+                        .{ ._, ._, .mov, .tmp3b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // presumably tmp3 = bit_size - 1
+                        .{ ._, ._, .sub, .tmp3b, .tmp2b, ._, ._ }, // tmp3 -= scan index; subtracting the 0xff fallback wraps to +1 in a byte
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp3b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .inc, .tmp0p, ._, ._, ._ }, // advance the index
+                        .{ ._, ._nz, .j, .@"0b", ._, ._, ._ }, // presumably `jnz`: repeat until the index reaches zero
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .bsf_bsr_0_clobbers_result, .slow_incdec, null }, // qword elements, zero input handled with a branch, add used for stepping; NOTE(review): looks like a clz loop (per commit title) - confirm
+                    .src_constraints = .{ .{ .scalar_int = .qword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0 (presumably, by position): loop index
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp1: mask, masked element, then bit-scan result
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp2: result; only its low byte is used
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // presumably tmp0 = -len so the index counts up to zero - TODO confirm `.sub_len`
+                        .{ .@"0:", ._, .mov, .tmp1q, .ua(.src0, .add_umax), ._, ._ }, // loop head: presumably loads the element's all-ones mask into a 64-bit register - TODO confirm `.add_umax`
+                        .{ ._, ._, .@"and", .tmp1q, .memsia(.src0q, .@"8", .tmp0, .add_8_len), ._, ._ }, // tmp1 = mask & next source qword (index scaled by 8)
+                        .{ ._, ._r, .bs, .tmp1q, .tmp1q, ._, ._ }, // presumably `bsr`: index of the highest set bit; ZF set when the input is zero
+                        .{ ._, ._, .mov, .tmp2b, .sa(.src0, .add_bit_size), ._, ._ }, // presumably tmp2 = bit_size, the zero-input result; mov keeps the scan's flags
+                        .{ ._, ._z, .j, .@"1f", ._, ._, ._ }, // zero input: skip the subtraction and store the preloaded value
+                        .{ ._, ._c, .st, ._, ._, ._, ._ }, // presumably `stc`: set CF so the sbb below subtracts one extra
+                        .{ ._, ._, .sbb, .tmp2b, .tmp1b, ._, ._ }, // tmp2 = tmp2 - scan index - 1
+                        .{ .@"1:", ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp2b, ._, ._ }, // store one result byte per element (unscaled index)
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // presumably `jnc`: repeat until that carry occurs
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .bsf_bsr_0_clobbers_result, null, null }, // Variant: per-element leading-zero count of qword-class integers via bsr, branching around the zero case; steps the loop with inc.
+                    .src_constraints = .{ .{ .scalar_int = .qword }, .any }, // presumably: scalar integer type in the qword size class - confirm against the constraint definition
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } }, // the source is addressed through memory operands below
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: loop index
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp1: masked element, then its highest set bit index
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp2: result for the current element
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // tmp0 = presumably -len, so the index counts up towards zero (confirm .sub_len)
+                        .{ .@"0:", ._, .mov, .tmp1q, .ua(.src0, .add_umax), ._, ._ }, // 0: tmp1 = presumably the element type's unsigned-max mask (confirm .add_umax)
+                        .{ ._, ._, .@"and", .tmp1q, .memsia(.src0q, .@"8", .tmp0, .add_8_len), ._, ._ }, // tmp1 &= qword source element selected by tmp0 (scale 8)
+                        .{ ._, ._r, .bs, .tmp1q, .tmp1q, ._, ._ }, // bsr: tmp1 = index of the highest set bit; ZF is set when the element is zero
+                        .{ ._, ._, .mov, .tmp2b, .sa(.src0, .add_bit_size), ._, ._ }, // tmp2 = presumably the element bit size, the answer for a zero element; mov leaves ZF intact
+                        .{ ._, ._z, .j, .@"1f", ._, ._, ._ }, // zero element: keep tmp2 and skip the subtraction (tmp1 is not trusted here)
+                        .{ ._, ._c, .st, ._, ._, ._, ._ }, // stc: set CF so the sbb below subtracts one extra
+                        .{ ._, ._, .sbb, .tmp2b, .tmp1b, ._, ._ }, // tmp2 = tmp2 - tmp1 - 1
+                        .{ .@"1:", ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp2b, ._, ._ }, // 1: store the byte-sized count to the destination slot for index tmp0
+                        .{ ._, ._, .inc, .tmp0p, ._, ._, ._ }, // advance the index; inc sets ZF when it reaches zero
+                        .{ ._, ._nz, .j, .@"0b", ._, ._, ._ }, // repeat while the index is non-zero
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .slow_incdec, null, null }, // Variant: per-element leading-zero count of qword-class integers via branch-free bsr with a preloaded destination; steps the loop with add instead of inc.
+                    .src_constraints = .{ .{ .scalar_int = .qword }, .any }, // presumably: scalar integer type in the qword size class - confirm against the constraint definition
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } }, // the source is addressed through memory operands below
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: loop index
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp1: masked element, then the result
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp2: highest set bit index (or the 0xff preload)
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // tmp0 = presumably -len, so the index counts up towards zero (confirm .sub_len)
+                        .{ .@"0:", ._, .mov, .tmp1q, .ua(.src0, .add_umax), ._, ._ }, // 0: tmp1 = presumably the element type's unsigned-max mask (confirm .add_umax)
+                        .{ ._, ._, .@"and", .tmp1q, .memsia(.src0q, .@"8", .tmp0, .add_8_len), ._, ._ }, // tmp1 &= qword source element selected by tmp0 (scale 8)
+                        .{ ._, ._, .mov, .tmp2d, .si(0xff), ._, ._ }, // preload tmp2 = 0xff; presumably relies on bsr leaving its destination unchanged for a zero source on these targets - confirm
+                        .{ ._, ._r, .bs, .tmp2q, .tmp1q, ._, ._ }, // bsr: tmp2 = index of the highest set bit of tmp1
+                        .{ ._, ._, .mov, .tmp1b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // tmp1 = presumably bit size - 1 (confirm .sia / .add_bit_size)
+                        .{ ._, ._, .sub, .tmp1b, .tmp2b, ._, ._ }, // tmp1 -= tmp2; if tmp2 is still 0xff the byte subtraction wraps to tmp1 + 1
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp1b, ._, ._ }, // store the byte-sized count to the destination slot for index tmp0
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the index wraps
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null }, // Variant: per-element leading-zero count of qword-class integers via branch-free bsr with a preloaded destination; steps the loop with inc.
+                    .src_constraints = .{ .{ .scalar_int = .qword }, .any }, // presumably: scalar integer type in the qword size class - confirm against the constraint definition
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } }, // the source is addressed through memory operands below
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: loop index
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: masked element, then the result. NOTE(review): declared u32 but accessed as tmp1q below; the slow_incdec sibling declares u64 - confirm intentional
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: highest set bit index (or the 0xff preload). NOTE(review): declared u32 but accessed as tmp2q below - confirm intentional
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // tmp0 = presumably -len, so the index counts up towards zero (confirm .sub_len)
+                        .{ .@"0:", ._, .mov, .tmp1q, .ua(.src0, .add_umax), ._, ._ }, // 0: tmp1 = presumably the element type's unsigned-max mask (confirm .add_umax)
+                        .{ ._, ._, .@"and", .tmp1q, .memsia(.src0q, .@"8", .tmp0, .add_8_len), ._, ._ }, // tmp1 &= qword source element selected by tmp0 (scale 8)
+                        .{ ._, ._, .mov, .tmp2d, .si(0xff), ._, ._ }, // preload tmp2 = 0xff; presumably relies on bsr leaving its destination unchanged for a zero source on these targets - confirm
+                        .{ ._, ._r, .bs, .tmp2q, .tmp1q, ._, ._ }, // bsr: tmp2 = index of the highest set bit of tmp1
+                        .{ ._, ._, .mov, .tmp1b, .sia(-1, .src0, .add_bit_size), ._, ._ }, // tmp1 = presumably bit size - 1 (confirm .sia / .add_bit_size)
+                        .{ ._, ._, .sub, .tmp1b, .tmp2b, ._, ._ }, // tmp1 -= tmp2; if tmp2 is still 0xff the byte subtraction wraps to tmp1 + 1
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp1b, ._, ._ }, // store the byte-sized count to the destination slot for index tmp0
+                        .{ ._, ._, .inc, .tmp0p, ._, ._, ._ }, // advance the index; inc sets ZF when it reaches zero
+                        .{ ._, ._nz, .j, .@"0b", ._, ._, ._ }, // repeat while the index is non-zero
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .false_deps_lzcnt_tzcnt, .lzcnt, null }, // Variant: leading-zero count of multi-limb integer elements using lzcnt, scanning limbs from most to least significant; zeroes the lzcnt destination first to break a false output dependency.
+                    .dst_constraints = .{.{ .scalar_int = .byte }}, // result elements are byte-class integers
+                    .src_constraints = .{ .{ .scalar_remainder_int = .{ .of = .xword, .is = .qword } }, .any }, // presumably: element size leaves a qword remainder modulo an xword - confirm against the constraint definition
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } }, // the source is addressed through memory operands below
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: element loop index
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp1: address of the current source element
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: byte offset of the current limb inside the element
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp3: limb mask / masked limb, later the result
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp4: lzcnt output
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // tmp0 = presumably -len, so the index counts up towards zero (confirm .sub_len)
+                        .{ ._, ._, .lea, .tmp1q, .mem(.src0), ._, ._ }, // tmp1 = address of the source operand
+                        .{ .@"0:", ._, .mov, .tmp2d, .sia(-16, .none, .add_src0_elem_size), ._, ._ }, // 0: tmp2 = presumably element size - 16, taken to be the offset of the most significant limb (confirm)
+                        .{ ._, ._, .mov, .tmp3q, .ua(.src0, .add_umax), ._, ._ }, // tmp3 = presumably a mask of the valid bits in the top limb (confirm .add_umax)
+                        .{ .@"1:", ._, .@"and", .tmp3q, .leai(.qword, .tmp1, .tmp2), ._, ._ }, // 1: tmp3 &= qword limb at tmp1 + tmp2
+                        .{ ._, ._, .xor, .tmp4d, .tmp4d, ._, ._ }, // zero tmp4 so lzcnt does not wait on its previous value
+                        .{ ._, ._, .lzcnt, .tmp4q, .tmp3q, ._, ._ }, // tmp4 = leading zeros in the limb; CF is set when the limb is zero
+                        .{ ._, ._nc, .j, .@"1f", ._, ._, ._ }, // non-zero limb found: compute the result
+                        .{ ._, ._, .mov, .tmp3q, .si(-1), ._, ._ }, // lower limbs are taken in full: reset the mask to all ones
+                        .{ ._, ._, .sub, .tmp2d, .si(8), ._, ._ }, // step to the next lower limb; borrows (CF) once the offset would pass zero
+                        .{ ._, ._nc, .j, .@"1b", ._, ._, ._ }, // more limbs left to scan
+                        .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ }, // every limb was zero: use offset 0 (tmp4 holds 64 from the last lzcnt)
+                        .{ .@"1:", ._, .neg, .tmp2d, ._, ._, ._ }, // 1: tmp2 = -offset
+                        .{ ._, ._, .lea, .tmp3d, .leasiad(.none, .tmp4, .@"8", .tmp2, .add_src0_bit_size, -64), ._, ._ }, // tmp3 = tmp4 + tmp2 * 8 + (presumably bit size - 64): zeros above the limb plus zeros inside it
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp3b, ._, ._ }, // store the byte-sized count to the destination slot for index tmp0
+                        .{ ._, ._, .lea, .tmp1q, .leaa(.none, .tmp1, .add_src0_elem_size), ._, ._ }, // tmp1 += presumably the element size: advance to the next source element
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the index wraps
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .lzcnt, null, null }, // Variant: leading-zero count of multi-limb integer elements using lzcnt, scanning limbs from most to least significant.
+                    .dst_constraints = .{.{ .scalar_int = .byte }}, // result elements are byte-class integers
+                    .src_constraints = .{ .{ .scalar_remainder_int = .{ .of = .xword, .is = .qword } }, .any }, // presumably: element size leaves a qword remainder modulo an xword - confirm against the constraint definition
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } }, // the source is addressed through memory operands below
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: element loop index
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp1: address of the current source element
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: byte offset of the current limb inside the element
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp3: limb mask / masked limb, later the result
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp4: lzcnt output
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // tmp0 = presumably -len, so the index counts up towards zero (confirm .sub_len)
+                        .{ ._, ._, .lea, .tmp1q, .mem(.src0), ._, ._ }, // tmp1 = address of the source operand
+                        .{ .@"0:", ._, .mov, .tmp2d, .sia(-16, .none, .add_src0_elem_size), ._, ._ }, // 0: tmp2 = presumably element size - 16, taken to be the offset of the most significant limb (confirm)
+                        .{ ._, ._, .mov, .tmp3q, .ua(.src0, .add_umax), ._, ._ }, // tmp3 = presumably a mask of the valid bits in the top limb (confirm .add_umax)
+                        .{ .@"1:", ._, .@"and", .tmp3q, .leai(.qword, .tmp1, .tmp2), ._, ._ }, // 1: tmp3 &= qword limb at tmp1 + tmp2
+                        .{ ._, ._, .lzcnt, .tmp4q, .tmp3q, ._, ._ }, // tmp4 = leading zeros in the limb; CF is set when the limb is zero
+                        .{ ._, ._nc, .j, .@"1f", ._, ._, ._ }, // non-zero limb found: compute the result
+                        .{ ._, ._, .mov, .tmp3q, .si(-1), ._, ._ }, // lower limbs are taken in full: reset the mask to all ones
+                        .{ ._, ._, .sub, .tmp2d, .si(8), ._, ._ }, // step to the next lower limb; borrows (CF) once the offset would pass zero
+                        .{ ._, ._nc, .j, .@"1b", ._, ._, ._ }, // more limbs left to scan
+                        .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ }, // every limb was zero: use offset 0 (tmp4 holds 64 from the last lzcnt)
+                        .{ .@"1:", ._, .neg, .tmp2d, ._, ._, ._ }, // 1: tmp2 = -offset
+                        .{ ._, ._, .lea, .tmp3d, .leasiad(.none, .tmp4, .@"8", .tmp2, .add_src0_bit_size, -64), ._, ._ }, // tmp3 = tmp4 + tmp2 * 8 + (presumably bit size - 64): zeros above the limb plus zeros inside it
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp3b, ._, ._ }, // store the byte-sized count to the destination slot for index tmp0
+                        .{ ._, ._, .lea, .tmp1q, .leaa(.none, .tmp1, .add_src0_elem_size), ._, ._ }, // tmp1 += presumably the element size: advance to the next source element
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the index wraps
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null }, // Variant: leading-zero count of multi-limb integer elements using bsr, scanning limbs from most to least significant.
+                    .dst_constraints = .{.{ .scalar_int = .byte }}, // result elements are byte-class integers
+                    .src_constraints = .{ .{ .scalar_remainder_int = .{ .of = .xword, .is = .qword } }, .any }, // presumably: element size leaves a qword remainder modulo an xword - confirm against the constraint definition
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } }, // the source is addressed through memory operands below
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: element loop index
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp1: address of the current source element
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: byte offset of the current limb inside the element
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp3: limb mask / masked limb, bsr output, later the result
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // tmp0 = presumably -len, so the index counts up towards zero (confirm .sub_len)
+                        .{ ._, ._, .lea, .tmp1q, .mem(.src0), ._, ._ }, // tmp1 = address of the source operand
+                        .{ .@"0:", ._, .mov, .tmp2d, .sia(-16, .none, .add_src0_elem_size), ._, ._ }, // 0: tmp2 = presumably element size - 16, taken to be the offset of the most significant limb (confirm)
+                        .{ ._, ._, .mov, .tmp3q, .ua(.src0, .add_umax), ._, ._ }, // tmp3 = presumably a mask of the valid bits in the top limb (confirm .add_umax)
+                        .{ .@"1:", ._, .@"and", .tmp3q, .leai(.qword, .tmp1, .tmp2), ._, ._ }, // 1: tmp3 &= qword limb at tmp1 + tmp2
+                        .{ ._, ._r, .bs, .tmp3q, .tmp3q, ._, ._ }, // bsr: tmp3 = index of the highest set bit in the limb; ZF is set when the limb is zero
+                        .{ ._, ._nz, .j, .@"1f", ._, ._, ._ }, // non-zero limb found: compute the result
+                        .{ ._, ._, .mov, .tmp3q, .si(-1), ._, ._ }, // lower limbs are taken in full: reset the mask to all ones (also leaves tmp3 == -1 if the scan ends here)
+                        .{ ._, ._, .sub, .tmp2d, .si(8), ._, ._ }, // step to the next lower limb; borrows (CF) once the offset would pass zero
+                        .{ ._, ._nc, .j, .@"1b", ._, ._, ._ }, // more limbs left to scan
+                        .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ }, // every limb was zero: use offset 0 with tmp3 still -1
+                        .{ .@"1:", ._, .lea, .tmp3d, .leasiad(.none, .tmp3, .@"8", .tmp2, .sub_src0_bit_size, 1), ._, ._ }, // 1: tmp3 = tmp3 + tmp2 * 8 + (presumably 1 - bit size)
+                        .{ ._, ._, .neg, .tmp3b, ._, ._, ._ }, // negate the low byte: bit size - 1 - (offset * 8 + index); the all-zero case yields the bit size
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp3b, ._, ._ }, // store the byte-sized count to the destination slot for index tmp0
+                        .{ ._, ._, .lea, .tmp1q, .leaa(.none, .tmp1, .add_src0_elem_size), ._, ._ }, // tmp1 += presumably the element size: advance to the next source element
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the index wraps
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .false_deps_lzcnt_tzcnt, .lzcnt, null }, // Variant: leading-zero count of multi-limb integer elements using lzcnt, scanning limbs from most to least significant; zeroes the lzcnt destination first to break a false output dependency.
+                    .dst_constraints = .{.{ .scalar_int = .byte }}, // result elements are byte-class integers
+                    .src_constraints = .{ .{ .scalar_remainder_int = .{ .of = .xword, .is = .xword } }, .any }, // presumably: element size is a whole number of xwords - confirm against the constraint definition
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } }, // the source is addressed through memory operands below
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: element loop index
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp1: address of the current source element
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: byte offset of the current limb inside the element
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp3: limb mask / masked limb, later the result
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp4: lzcnt output
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // tmp0 = presumably -len, so the index counts up towards zero (confirm .sub_len)
+                        .{ ._, ._, .lea, .tmp1q, .mem(.src0), ._, ._ }, // tmp1 = address of the source operand
+                        .{ .@"0:", ._, .mov, .tmp2d, .sia(-8, .none, .add_src0_elem_size), ._, ._ }, // 0: tmp2 = presumably element size - 8, the offset of the last qword limb (confirm)
+                        .{ ._, ._, .mov, .tmp3q, .ua(.src0, .add_umax), ._, ._ }, // tmp3 = presumably a mask of the valid bits in the top limb (confirm .add_umax)
+                        .{ .@"1:", ._, .@"and", .tmp3q, .leai(.qword, .tmp1, .tmp2), ._, ._ }, // 1: tmp3 &= qword limb at tmp1 + tmp2
+                        .{ ._, ._, .xor, .tmp4d, .tmp4d, ._, ._ }, // zero tmp4 so lzcnt does not wait on its previous value
+                        .{ ._, ._, .lzcnt, .tmp4q, .tmp3q, ._, ._ }, // tmp4 = leading zeros in the limb; CF is set when the limb is zero
+                        .{ ._, ._nc, .j, .@"1f", ._, ._, ._ }, // non-zero limb found: compute the result
+                        .{ ._, ._, .mov, .tmp3q, .si(-1), ._, ._ }, // lower limbs are taken in full: reset the mask to all ones
+                        .{ ._, ._, .sub, .tmp2d, .si(8), ._, ._ }, // step to the next lower limb; borrows (CF) once the offset would pass zero
+                        .{ ._, ._nc, .j, .@"1b", ._, ._, ._ }, // more limbs left to scan
+                        .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ }, // every limb was zero: use offset 0 (tmp4 holds 64 from the last lzcnt)
+                        .{ .@"1:", ._, .neg, .tmp2d, ._, ._, ._ }, // 1: tmp2 = -offset
+                        .{ ._, ._, .lea, .tmp3d, .leasiad(.none, .tmp4, .@"8", .tmp2, .add_src0_bit_size, -64), ._, ._ }, // tmp3 = tmp4 + tmp2 * 8 + (presumably bit size - 64): zeros above the limb plus zeros inside it
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp3b, ._, ._ }, // store the byte-sized count to the destination slot for index tmp0
+                        .{ ._, ._, .lea, .tmp1q, .leaa(.none, .tmp1, .add_src0_elem_size), ._, ._ }, // tmp1 += presumably the element size: advance to the next source element
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the index wraps
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .lzcnt, null, null }, // Variant: leading-zero count of multi-limb integer elements using lzcnt, scanning limbs from most to least significant.
+                    .dst_constraints = .{.{ .scalar_int = .byte }}, // result elements are byte-class integers
+                    .src_constraints = .{ .{ .scalar_remainder_int = .{ .of = .xword, .is = .xword } }, .any }, // presumably: element size is a whole number of xwords - confirm against the constraint definition
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } }, // the source is addressed through memory operands below
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: element loop index
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp1: address of the current source element
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: byte offset of the current limb inside the element
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp3: limb mask / masked limb, later the result
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp4: lzcnt output
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // tmp0 = presumably -len, so the index counts up towards zero (confirm .sub_len)
+                        .{ ._, ._, .lea, .tmp1q, .mem(.src0), ._, ._ }, // tmp1 = address of the source operand
+                        .{ .@"0:", ._, .mov, .tmp2d, .sia(-8, .none, .add_src0_elem_size), ._, ._ }, // 0: tmp2 = presumably element size - 8, the offset of the last qword limb (confirm)
+                        .{ ._, ._, .mov, .tmp3q, .ua(.src0, .add_umax), ._, ._ }, // tmp3 = presumably a mask of the valid bits in the top limb (confirm .add_umax)
+                        .{ .@"1:", ._, .@"and", .tmp3q, .leai(.qword, .tmp1, .tmp2), ._, ._ }, // 1: tmp3 &= qword limb at tmp1 + tmp2
+                        .{ ._, ._, .lzcnt, .tmp4q, .tmp3q, ._, ._ }, // tmp4 = leading zeros in the limb; CF is set when the limb is zero
+                        .{ ._, ._nc, .j, .@"1f", ._, ._, ._ }, // non-zero limb found: compute the result
+                        .{ ._, ._, .mov, .tmp3q, .si(-1), ._, ._ }, // lower limbs are taken in full: reset the mask to all ones
+                        .{ ._, ._, .sub, .tmp2d, .si(8), ._, ._ }, // step to the next lower limb; borrows (CF) once the offset would pass zero
+                        .{ ._, ._nc, .j, .@"1b", ._, ._, ._ }, // more limbs left to scan
+                        .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ }, // every limb was zero: use offset 0 (tmp4 holds 64 from the last lzcnt)
+                        .{ .@"1:", ._, .neg, .tmp2d, ._, ._, ._ }, // 1: tmp2 = -offset
+                        .{ ._, ._, .lea, .tmp3d, .leasiad(.none, .tmp4, .@"8", .tmp2, .add_src0_bit_size, -64), ._, ._ }, // tmp3 = tmp4 + tmp2 * 8 + (presumably bit size - 64): zeros above the limb plus zeros inside it
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp3b, ._, ._ }, // store the byte-sized count to the destination slot for index tmp0
+                        .{ ._, ._, .lea, .tmp1q, .leaa(.none, .tmp1, .add_src0_elem_size), ._, ._ }, // tmp1 += presumably the element size: advance to the next source element
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the index wraps
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null }, // Variant: leading-zero count of multi-limb integer elements using bsr, scanning limbs from most to least significant.
+                    .dst_constraints = .{.{ .scalar_int = .byte }}, // result elements are byte-class integers
+                    .src_constraints = .{ .{ .scalar_remainder_int = .{ .of = .xword, .is = .xword } }, .any }, // presumably: element size is a whole number of xwords - confirm against the constraint definition
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } }, // the source is addressed through memory operands below
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: element loop index
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp1: address of the current source element
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: byte offset of the current limb inside the element
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp3: limb mask / masked limb, bsr output, later the result
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // tmp0 = presumably -len, so the index counts up towards zero (confirm .sub_len)
+                        .{ ._, ._, .lea, .tmp1q, .mem(.src0), ._, ._ }, // tmp1 = address of the source operand
+                        .{ .@"0:", ._, .mov, .tmp2d, .sia(-8, .none, .add_src0_elem_size), ._, ._ }, // 0: tmp2 = presumably element size - 8, the offset of the last qword limb (confirm)
+                        .{ ._, ._, .mov, .tmp3q, .ua(.src0, .add_umax), ._, ._ }, // tmp3 = presumably a mask of the valid bits in the top limb (confirm .add_umax)
+                        .{ .@"1:", ._, .@"and", .tmp3q, .leai(.qword, .tmp1, .tmp2), ._, ._ }, // 1: tmp3 &= qword limb at tmp1 + tmp2
+                        .{ ._, ._r, .bs, .tmp3q, .tmp3q, ._, ._ }, // bsr: tmp3 = index of the highest set bit in the limb; ZF is set when the limb is zero
+                        .{ ._, ._nz, .j, .@"1f", ._, ._, ._ }, // non-zero limb found: compute the result
+                        .{ ._, ._, .mov, .tmp3q, .si(-1), ._, ._ }, // lower limbs are taken in full: reset the mask to all ones (also leaves tmp3 == -1 if the scan ends here)
+                        .{ ._, ._, .sub, .tmp2d, .si(8), ._, ._ }, // step to the next lower limb; borrows (CF) once the offset would pass zero
+                        .{ ._, ._nc, .j, .@"1b", ._, ._, ._ }, // more limbs left to scan
+                        .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ }, // every limb was zero: use offset 0 with tmp3 still -1
+                        .{ .@"1:", ._, .lea, .tmp3d, .leasiad(.none, .tmp3, .@"8", .tmp2, .sub_src0_bit_size, 1), ._, ._ }, // 1: tmp3 = tmp3 + tmp2 * 8 + (presumably 1 - bit size)
+                        .{ ._, ._, .neg, .tmp3b, ._, ._, ._ }, // negate the low byte: bit size - 1 - (offset * 8 + index); the all-zero case yields the bit size
+                        .{ ._, ._, .mov, .memia(.dst0b, .tmp0, .add_len), .tmp3b, ._, ._ }, // store the byte-sized count to the destination slot for index tmp0
+                        .{ ._, ._, .lea, .tmp1q, .leaa(.none, .tmp1, .add_src0_elem_size), ._, ._ }, // tmp1 += presumably the element size: advance to the next source element
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the index wraps
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .false_deps_lzcnt_tzcnt, .lzcnt, null }, // Variant: leading-zero count of multi-limb integer elements using lzcnt, scanning limbs from most to least significant; zeroes the lzcnt destination first to break a false output dependency.
+                    .dst_constraints = .{.{ .scalar_int = .word }}, // result elements are word-class integers
+                    .src_constraints = .{ .{ .scalar_remainder_int = .{ .of = .xword, .is = .qword } }, .any }, // presumably: element size leaves a qword remainder modulo an xword - confirm against the constraint definition
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } }, // the source is addressed through memory operands below
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: element loop index
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp1: address of the current source element
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: byte offset of the current limb inside the element
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp3: limb mask / masked limb, later the result
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp4: lzcnt output
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // tmp0 = presumably -len, so the index counts up towards zero (confirm .sub_len)
+                        .{ ._, ._, .lea, .tmp1q, .mem(.src0), ._, ._ }, // tmp1 = address of the source operand
+                        .{ .@"0:", ._, .mov, .tmp2d, .sia(-16, .none, .add_src0_elem_size), ._, ._ }, // 0: tmp2 = presumably element size - 16, taken to be the offset of the most significant limb (confirm)
+                        .{ ._, ._, .mov, .tmp3q, .ua(.src0, .add_umax), ._, ._ }, // tmp3 = presumably a mask of the valid bits in the top limb (confirm .add_umax)
+                        .{ .@"1:", ._, .@"and", .tmp3q, .leai(.qword, .tmp1, .tmp2), ._, ._ }, // 1: tmp3 &= qword limb at tmp1 + tmp2
+                        .{ ._, ._, .xor, .tmp4d, .tmp4d, ._, ._ }, // zero tmp4 so lzcnt does not wait on its previous value
+                        .{ ._, ._, .lzcnt, .tmp4q, .tmp3q, ._, ._ }, // tmp4 = leading zeros in the limb; CF is set when the limb is zero
+                        .{ ._, ._nc, .j, .@"1f", ._, ._, ._ }, // non-zero limb found: compute the result
+                        .{ ._, ._, .mov, .tmp3q, .si(-1), ._, ._ }, // lower limbs are taken in full: reset the mask to all ones
+                        .{ ._, ._, .sub, .tmp2d, .si(8), ._, ._ }, // step to the next lower limb; borrows (CF) once the offset would pass zero
+                        .{ ._, ._nc, .j, .@"1b", ._, ._, ._ }, // more limbs left to scan
+                        .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ }, // every limb was zero: use offset 0 (tmp4 holds 64 from the last lzcnt)
+                        .{ .@"1:", ._, .neg, .tmp2d, ._, ._, ._ }, // 1: tmp2 = -offset
+                        .{ ._, ._, .lea, .tmp3d, .leasiad(.none, .tmp4, .@"8", .tmp2, .add_src0_bit_size, -64), ._, ._ }, // tmp3 = tmp4 + tmp2 * 8 + (presumably bit size - 64): zeros above the limb plus zeros inside it
+                        .{ ._, ._, .mov, .memsia(.dst0w, .@"2", .tmp0, .add_2_len), .tmp3w, ._, ._ }, // store the word-sized count to the destination slot for index tmp0 (scale 2)
+                        .{ ._, ._, .lea, .tmp1q, .leaa(.none, .tmp1, .add_src0_elem_size), ._, ._ }, // tmp1 += presumably the element size: advance to the next source element
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the index wraps
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .lzcnt, null, null }, // Variant: leading-zero count of multi-limb integer elements using lzcnt, scanning limbs from most to least significant.
+                    .dst_constraints = .{.{ .scalar_int = .word }}, // result elements are word-class integers
+                    .src_constraints = .{ .{ .scalar_remainder_int = .{ .of = .xword, .is = .qword } }, .any }, // presumably: element size leaves a qword remainder modulo an xword - confirm against the constraint definition
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } }, // the source is addressed through memory operands below
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: element loop index
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp1: address of the current source element
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: byte offset of the current limb inside the element
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp3: limb mask / masked limb, later the result
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp4: lzcnt output
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // tmp0 = presumably -len, so the index counts up towards zero (confirm .sub_len)
+                        .{ ._, ._, .lea, .tmp1q, .mem(.src0), ._, ._ }, // tmp1 = address of the source operand
+                        .{ .@"0:", ._, .mov, .tmp2d, .sia(-16, .none, .add_src0_elem_size), ._, ._ }, // 0: tmp2 = presumably element size - 16, taken to be the offset of the most significant limb (confirm)
+                        .{ ._, ._, .mov, .tmp3q, .ua(.src0, .add_umax), ._, ._ }, // tmp3 = presumably a mask of the valid bits in the top limb (confirm .add_umax)
+                        .{ .@"1:", ._, .@"and", .tmp3q, .leai(.qword, .tmp1, .tmp2), ._, ._ }, // 1: tmp3 &= qword limb at tmp1 + tmp2
+                        .{ ._, ._, .lzcnt, .tmp4q, .tmp3q, ._, ._ }, // tmp4 = leading zeros in the limb; CF is set when the limb is zero
+                        .{ ._, ._nc, .j, .@"1f", ._, ._, ._ }, // non-zero limb found: compute the result
+                        .{ ._, ._, .mov, .tmp3q, .si(-1), ._, ._ }, // lower limbs are taken in full: reset the mask to all ones
+                        .{ ._, ._, .sub, .tmp2d, .si(8), ._, ._ }, // step to the next lower limb; borrows (CF) once the offset would pass zero
+                        .{ ._, ._nc, .j, .@"1b", ._, ._, ._ }, // more limbs left to scan
+                        .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ }, // every limb was zero: use offset 0 (tmp4 holds 64 from the last lzcnt)
+                        .{ .@"1:", ._, .neg, .tmp2d, ._, ._, ._ }, // 1: tmp2 = -offset
+                        .{ ._, ._, .lea, .tmp3d, .leasiad(.none, .tmp4, .@"8", .tmp2, .add_src0_bit_size, -64), ._, ._ }, // tmp3 = tmp4 + tmp2 * 8 + (presumably bit size - 64): zeros above the limb plus zeros inside it
+                        .{ ._, ._, .mov, .memsia(.dst0w, .@"2", .tmp0, .add_2_len), .tmp3w, ._, ._ }, // store the word-sized count to the destination slot for index tmp0 (scale 2)
+                        .{ ._, ._, .lea, .tmp1q, .leaa(.none, .tmp1, .add_src0_elem_size), ._, ._ }, // tmp1 += presumably the element size: advance to the next source element
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the index wraps
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null }, // Variant: leading-zero count of multi-limb integer elements using bsr, scanning limbs from most to least significant.
+                    .dst_constraints = .{.{ .scalar_int = .word }}, // result elements are word-class integers
+                    .src_constraints = .{ .{ .scalar_remainder_int = .{ .of = .xword, .is = .qword } }, .any }, // presumably: element size leaves a qword remainder modulo an xword - confirm against the constraint definition
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } }, // the source is addressed through memory operands below
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: element loop index
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp1: address of the current source element
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: byte offset of the current limb inside the element
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp3: limb mask / masked limb, bsr output, later the result
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // tmp0 = presumably -len, so the index counts up towards zero (confirm .sub_len)
+                        .{ ._, ._, .lea, .tmp1q, .mem(.src0), ._, ._ }, // tmp1 = address of the source operand
+                        .{ .@"0:", ._, .mov, .tmp2d, .sia(-16, .none, .add_src0_elem_size), ._, ._ }, // 0: tmp2 = presumably element size - 16, taken to be the offset of the most significant limb (confirm)
+                        .{ ._, ._, .mov, .tmp3q, .ua(.src0, .add_umax), ._, ._ }, // tmp3 = presumably a mask of the valid bits in the top limb (confirm .add_umax)
+                        .{ .@"1:", ._, .@"and", .tmp3q, .leai(.qword, .tmp1, .tmp2), ._, ._ }, // 1: tmp3 &= qword limb at tmp1 + tmp2
+                        .{ ._, ._r, .bs, .tmp3q, .tmp3q, ._, ._ }, // bsr: tmp3 = index of the highest set bit in the limb; ZF is set when the limb is zero
+                        .{ ._, ._nz, .j, .@"1f", ._, ._, ._ }, // non-zero limb found: compute the result
+                        .{ ._, ._, .mov, .tmp3q, .si(-1), ._, ._ }, // lower limbs are taken in full: reset the mask to all ones (also leaves tmp3 == -1 if the scan ends here)
+                        .{ ._, ._, .sub, .tmp2d, .si(8), ._, ._ }, // step to the next lower limb; borrows (CF) once the offset would pass zero
+                        .{ ._, ._nc, .j, .@"1b", ._, ._, ._ }, // more limbs left to scan
+                        .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ }, // every limb was zero: use offset 0 with tmp3 still -1
+                        .{ .@"1:", ._, .lea, .tmp3d, .leasiad(.none, .tmp3, .@"8", .tmp2, .sub_src0_bit_size, 1), ._, ._ }, // 1: tmp3 = tmp3 + tmp2 * 8 + (presumably 1 - bit size)
+                        .{ ._, ._, .neg, .tmp3d, ._, ._, ._ }, // negate: bit size - 1 - (offset * 8 + index); the all-zero case yields the bit size
+                        .{ ._, ._, .mov, .memsia(.dst0w, .@"2", .tmp0, .add_2_len), .tmp3w, ._, ._ }, // store the word-sized count to the destination slot for index tmp0 (scale 2)
+                        .{ ._, ._, .lea, .tmp1q, .leaa(.none, .tmp1, .add_src0_elem_size), ._, ._ }, // tmp1 += presumably the element size: advance to the next source element
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the index wraps
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .false_deps_lzcnt_tzcnt, .lzcnt, null }, // Variant: leading-zero count of multi-limb integer elements using lzcnt, scanning limbs from most to least significant; zeroes the lzcnt destination first to break a false output dependency.
+                    .dst_constraints = .{.{ .scalar_int = .word }}, // result elements are word-class integers
+                    .src_constraints = .{ .{ .scalar_remainder_int = .{ .of = .xword, .is = .xword } }, .any }, // presumably: element size is a whole number of xwords - confirm against the constraint definition
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } }, // the source is addressed through memory operands below
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: element loop index
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp1: address of the current source element
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: byte offset of the current limb inside the element
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp3: limb mask / masked limb, later the result
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp4: lzcnt output
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // tmp0 = presumably -len, so the index counts up towards zero (confirm .sub_len)
+                        .{ ._, ._, .lea, .tmp1q, .mem(.src0), ._, ._ }, // tmp1 = address of the source operand
+                        .{ .@"0:", ._, .mov, .tmp2d, .sia(-8, .none, .add_src0_elem_size), ._, ._ }, // 0: tmp2 = presumably element size - 8, the offset of the last qword limb (confirm)
+                        .{ ._, ._, .mov, .tmp3q, .ua(.src0, .add_umax), ._, ._ }, // tmp3 = presumably a mask of the valid bits in the top limb (confirm .add_umax)
+                        .{ .@"1:", ._, .@"and", .tmp3q, .leai(.qword, .tmp1, .tmp2), ._, ._ }, // 1: tmp3 &= qword limb at tmp1 + tmp2
+                        .{ ._, ._, .xor, .tmp4d, .tmp4d, ._, ._ }, // zero tmp4 so lzcnt does not wait on its previous value
+                        .{ ._, ._, .lzcnt, .tmp4q, .tmp3q, ._, ._ }, // tmp4 = leading zeros in the limb; CF is set when the limb is zero
+                        .{ ._, ._nc, .j, .@"1f", ._, ._, ._ }, // non-zero limb found: compute the result
+                        .{ ._, ._, .mov, .tmp3q, .si(-1), ._, ._ }, // lower limbs are taken in full: reset the mask to all ones
+                        .{ ._, ._, .sub, .tmp2d, .si(8), ._, ._ }, // step to the next lower limb; borrows (CF) once the offset would pass zero
+                        .{ ._, ._nc, .j, .@"1b", ._, ._, ._ }, // more limbs left to scan
+                        .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ }, // every limb was zero: use offset 0 (tmp4 holds 64 from the last lzcnt)
+                        .{ .@"1:", ._, .neg, .tmp2d, ._, ._, ._ }, // 1: tmp2 = -offset
+                        .{ ._, ._, .lea, .tmp3d, .leasiad(.none, .tmp4, .@"8", .tmp2, .add_src0_bit_size, -64), ._, ._ }, // tmp3 = tmp4 + tmp2 * 8 + (presumably bit size - 64): zeros above the limb plus zeros inside it
+                        .{ ._, ._, .mov, .memsia(.dst0w, .@"2", .tmp0, .add_2_len), .tmp3w, ._, ._ }, // store the word-sized count to the destination slot for index tmp0 (scale 2)
+                        .{ ._, ._, .lea, .tmp1q, .leaa(.none, .tmp1, .add_src0_elem_size), ._, ._ }, // tmp1 += presumably the element size: advance to the next source element
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the index wraps
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .lzcnt, null, null }, // Variant: leading-zero count of multi-limb integer elements using lzcnt, scanning limbs from most to least significant.
+                    .dst_constraints = .{.{ .scalar_int = .word }}, // result elements are word-class integers
+                    .src_constraints = .{ .{ .scalar_remainder_int = .{ .of = .xword, .is = .xword } }, .any }, // presumably: element size is a whole number of xwords - confirm against the constraint definition
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } }, // the source is addressed through memory operands below
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: element loop index
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp1: address of the current source element
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: byte offset of the current limb inside the element
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp3: limb mask / masked limb, later the result
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp4: lzcnt output
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // tmp0 = presumably -len, so the index counts up towards zero (confirm .sub_len)
+                        .{ ._, ._, .lea, .tmp1q, .mem(.src0), ._, ._ }, // tmp1 = address of the source operand
+                        .{ .@"0:", ._, .mov, .tmp2d, .sia(-8, .none, .add_src0_elem_size), ._, ._ }, // 0: tmp2 = presumably element size - 8, the offset of the last qword limb (confirm)
+                        .{ ._, ._, .mov, .tmp3q, .ua(.src0, .add_umax), ._, ._ }, // tmp3 = presumably a mask of the valid bits in the top limb (confirm .add_umax)
+                        .{ .@"1:", ._, .@"and", .tmp3q, .leai(.qword, .tmp1, .tmp2), ._, ._ }, // 1: tmp3 &= qword limb at tmp1 + tmp2
+                        .{ ._, ._, .lzcnt, .tmp4q, .tmp3q, ._, ._ }, // tmp4 = leading zeros in the limb; CF is set when the limb is zero
+                        .{ ._, ._nc, .j, .@"1f", ._, ._, ._ }, // non-zero limb found: compute the result
+                        .{ ._, ._, .mov, .tmp3q, .si(-1), ._, ._ }, // lower limbs are taken in full: reset the mask to all ones
+                        .{ ._, ._, .sub, .tmp2d, .si(8), ._, ._ }, // step to the next lower limb; borrows (CF) once the offset would pass zero
+                        .{ ._, ._nc, .j, .@"1b", ._, ._, ._ }, // more limbs left to scan
+                        .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ }, // every limb was zero: use offset 0 (tmp4 holds 64 from the last lzcnt)
+                        .{ .@"1:", ._, .neg, .tmp2d, ._, ._, ._ }, // 1: tmp2 = -offset
+                        .{ ._, ._, .lea, .tmp3d, .leasiad(.none, .tmp4, .@"8", .tmp2, .add_src0_bit_size, -64), ._, ._ }, // tmp3 = tmp4 + tmp2 * 8 + (presumably bit size - 64): zeros above the limb plus zeros inside it
+                        .{ ._, ._, .mov, .memsia(.dst0w, .@"2", .tmp0, .add_2_len), .tmp3w, ._, ._ }, // store the word-sized count to the destination slot for index tmp0 (scale 2)
+                        .{ ._, ._, .lea, .tmp1q, .leaa(.none, .tmp1, .add_src0_elem_size), ._, ._ }, // tmp1 += presumably the element size: advance to the next source element
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the index wraps
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null }, // Variant: leading-zero count of multi-limb integer elements using bsr, scanning limbs from most to least significant.
+                    .dst_constraints = .{.{ .scalar_int = .word }}, // result elements are word-class integers
+                    .src_constraints = .{ .{ .scalar_remainder_int = .{ .of = .xword, .is = .xword } }, .any }, // presumably: element size is a whole number of xwords - confirm against the constraint definition
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .none } }, // the source is addressed through memory operands below
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, // tmp0: element loop index
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp1: address of the current source element
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp2: byte offset of the current limb inside the element
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp3: limb mask / masked limb, bsr output, later the result
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_len), ._, ._ }, // tmp0 = presumably -len, so the index counts up towards zero (confirm .sub_len)
+                        .{ ._, ._, .lea, .tmp1q, .mem(.src0), ._, ._ }, // tmp1 = address of the source operand
+                        .{ .@"0:", ._, .mov, .tmp2d, .sia(-8, .none, .add_src0_elem_size), ._, ._ }, // 0: tmp2 = presumably element size - 8, the offset of the last qword limb (confirm)
+                        .{ ._, ._, .mov, .tmp3q, .ua(.src0, .add_umax), ._, ._ }, // tmp3 = presumably a mask of the valid bits in the top limb (confirm .add_umax)
+                        .{ .@"1:", ._, .@"and", .tmp3q, .leai(.qword, .tmp1, .tmp2), ._, ._ }, // 1: tmp3 &= qword limb at tmp1 + tmp2
+                        .{ ._, ._r, .bs, .tmp3q, .tmp3q, ._, ._ }, // bsr: tmp3 = index of the highest set bit in the limb; ZF is set when the limb is zero
+                        .{ ._, ._nz, .j, .@"1f", ._, ._, ._ }, // non-zero limb found: compute the result
+                        .{ ._, ._, .mov, .tmp3q, .si(-1), ._, ._ }, // lower limbs are taken in full: reset the mask to all ones (also leaves tmp3 == -1 if the scan ends here)
+                        .{ ._, ._, .sub, .tmp2d, .si(8), ._, ._ }, // step to the next lower limb; borrows (CF) once the offset would pass zero
+                        .{ ._, ._nc, .j, .@"1b", ._, ._, ._ }, // more limbs left to scan
+                        .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ }, // every limb was zero: use offset 0 with tmp3 still -1
+                        .{ .@"1:", ._, .lea, .tmp3d, .leasiad(.none, .tmp3, .@"8", .tmp2, .sub_src0_bit_size, 1), ._, ._ }, // 1: tmp3 = tmp3 + tmp2 * 8 + (presumably 1 - bit size)
+                        .{ ._, ._, .neg, .tmp3d, ._, ._, ._ }, // negate: bit size - 1 - (offset * 8 + index); the all-zero case yields the bit size
+                        .{ ._, ._, .mov, .memsia(.dst0w, .@"2", .tmp0, .add_2_len), .tmp3w, ._, ._ }, // store the word-sized count to the destination slot for index tmp0 (scale 2)
+                        .{ ._, ._, .lea, .tmp1q, .leaa(.none, .tmp1, .add_src0_elem_size), ._, ._ }, // tmp1 += presumably the element size: advance to the next source element
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ }, // advance the index; add sets CF when it wraps to zero
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // repeat until the index wraps
+                    } },
+                } }) catch |err| switch (err) {
+                    error.SelectFailed => return cg.fail("failed to select {s} {} {}", .{
+                        @tagName(air_tag),
+                        cg.typeOf(ty_op.operand).fmt(pt),
+                        ops[0].tracking(cg),
+                    }),
+                    else => |e| return e,
+                };
+                if (ops[0].index != res[0].index) try ops[0].die(cg);
+                try res[0].moveTo(inst, cg);
+            },
+
+            .cmp_vector, .cmp_vector_optimized => |air_tag| if (use_old) try cg.airCmpVector(inst) else fallback: {
+                const ty_pl = air_datas[@intFromEnum(inst)].ty_pl;
+                const extra = cg.air.extraData(Air.VectorCmp, ty_pl.payload).data;
+                switch (extra.compareOperator()) {
+                    .eq, .neq => {},
+                    else => break :fallback try cg.airCmpVector(inst),
+                }
+                var ops = try cg.tempsFromOperands(inst, .{ extra.lhs, extra.rhs });
+                var res: [1]Temp = undefined;
+                switch (extra.compareOperator()) {
+                    .lt => unreachable,
+                    .lte => unreachable,
+                    .eq, .neq => |cmp_op| cg.select(&res, &.{cg.typeOfIndex(inst)}, &ops, switch (@as(Condition, switch (cmp_op) {
+                        else => unreachable,
+                        .eq => .e,
+                        .neq => .ne,
+                    })) {
+                        else => unreachable,
+                        inline .e, .ne => |cc| comptime &.{ .{
+                            .required_features = .{ .avx2, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .byte }, .{ .scalar_int = .byte } },
+                            .patterns = &.{
+                                .{ .src = .{ .to_ymm, .mem } },
+                                .{ .src = .{ .mem, .to_ymm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_ymm, .to_ymm } },
+                            },
+                            .dst_temps = .{.{ .rc_mask = .{ .rc = .sse, .info = .{
+                                .kind = .all,
+                                .inverted = switch (cc) {
+                                    else => unreachable,
+                                    .e => false,
+                                    .ne => true,
+                                },
+                                .scalar = .byte,
+                            } } }},
+                            .each = .{ .once = &.{
+                                .{ ._, .vp_b, .cmpeq, .dst0y, .src0y, .src1y, ._ },
+                            } },
+                        }, .{
+                            .required_features = .{ .avx2, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .word }, .{ .scalar_int = .word } },
+                            .patterns = &.{
+                                .{ .src = .{ .to_ymm, .mem } },
+                                .{ .src = .{ .mem, .to_ymm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_ymm, .to_ymm } },
+                            },
+                            .dst_temps = .{.{ .rc_mask = .{ .rc = .sse, .info = .{
+                                .kind = .all,
+                                .inverted = switch (cc) {
+                                    else => unreachable,
+                                    .e => false,
+                                    .ne => true,
+                                },
+                                .scalar = .word,
+                            } } }},
+                            .each = .{ .once = &.{
+                                .{ ._, .vp_w, .cmpeq, .dst0y, .src0y, .src1y, ._ },
+                            } },
+                        }, .{
+                            .required_features = .{ .avx2, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .dword }, .{ .scalar_int = .dword } },
+                            .patterns = &.{
+                                .{ .src = .{ .to_ymm, .mem } },
+                                .{ .src = .{ .mem, .to_ymm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_ymm, .to_ymm } },
+                            },
+                            .dst_temps = .{.{ .rc_mask = .{ .rc = .sse, .info = .{
+                                .kind = .all,
+                                .inverted = switch (cc) {
+                                    else => unreachable,
+                                    .e => false,
+                                    .ne => true,
+                                },
+                                .scalar = .dword,
+                            } } }},
+                            .each = .{ .once = &.{
+                                .{ ._, .vp_d, .cmpeq, .dst0y, .src0y, .src1y, ._ },
+                            } },
+                        }, .{
+                            .required_features = .{ .avx2, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .qword }, .{ .scalar_int = .qword } },
+                            .patterns = &.{
+                                .{ .src = .{ .to_ymm, .mem } },
+                                .{ .src = .{ .mem, .to_ymm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_ymm, .to_ymm } },
                             },
                             .dst_temps = .{.{ .rc_mask = .{ .rc = .sse, .info = .{
                                 .kind = .all,
@@ -2979,12 +6746,12 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, .vp_q, .cmpeq, .dst0y, .src0y, .src1y, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .avx, null },
-                            .src_constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
+                            .required_features = .{ .avx, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .byte }, .{ .scalar_int = .byte } },
                             .patterns = &.{
-                                .{ .src = .{ .xmm, .mem } },
-                                .{ .src = .{ .mem, .xmm }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .xmm, .xmm } },
+                                .{ .src = .{ .to_xmm, .mem } },
+                                .{ .src = .{ .mem, .to_xmm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_xmm, .to_xmm } },
                             },
                             .dst_temps = .{.{ .rc_mask = .{ .rc = .sse, .info = .{
                                 .kind = .all,
@@ -2999,12 +6766,12 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, .vp_b, .cmpeq, .dst0x, .src0x, .src1x, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .avx, null },
-                            .src_constraints = .{ .{ .int = .word }, .{ .int = .word } },
+                            .required_features = .{ .avx, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .word }, .{ .scalar_int = .word } },
                             .patterns = &.{
-                                .{ .src = .{ .xmm, .mem } },
-                                .{ .src = .{ .mem, .xmm }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .xmm, .xmm } },
+                                .{ .src = .{ .to_xmm, .mem } },
+                                .{ .src = .{ .mem, .to_xmm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_xmm, .to_xmm } },
                             },
                             .dst_temps = .{.{ .rc_mask = .{ .rc = .sse, .info = .{
                                 .kind = .all,
@@ -3019,12 +6786,12 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, .vp_w, .cmpeq, .dst0x, .src0x, .src1x, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .avx, null },
-                            .src_constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
+                            .required_features = .{ .avx, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .dword }, .{ .scalar_int = .dword } },
                             .patterns = &.{
-                                .{ .src = .{ .xmm, .mem } },
-                                .{ .src = .{ .mem, .xmm }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .xmm, .xmm } },
+                                .{ .src = .{ .to_xmm, .mem } },
+                                .{ .src = .{ .mem, .to_xmm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_xmm, .to_xmm } },
                             },
                             .dst_temps = .{.{ .rc_mask = .{ .rc = .sse, .info = .{
                                 .kind = .all,
@@ -3039,12 +6806,12 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, .vp_d, .cmpeq, .dst0x, .src0x, .src1x, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .avx, null },
-                            .src_constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
+                            .required_features = .{ .avx, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .qword }, .{ .scalar_int = .qword } },
                             .patterns = &.{
-                                .{ .src = .{ .xmm, .mem } },
-                                .{ .src = .{ .mem, .xmm }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .xmm, .xmm } },
+                                .{ .src = .{ .to_xmm, .mem } },
+                                .{ .src = .{ .mem, .to_xmm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_xmm, .to_xmm } },
                             },
                             .dst_temps = .{.{ .rc_mask = .{ .rc = .sse, .info = .{
                                 .kind = .all,
@@ -3059,12 +6826,12 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, .vp_q, .cmpeq, .dst0x, .src0x, .src1x, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .sse2, null },
-                            .src_constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
+                            .required_features = .{ .sse2, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .byte }, .{ .scalar_int = .byte } },
                             .patterns = &.{
-                                .{ .src = .{ .mut_xmm, .mem } },
-                                .{ .src = .{ .mem, .mut_xmm }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_xmm, .xmm } },
+                                .{ .src = .{ .to_mut_xmm, .mem } },
+                                .{ .src = .{ .mem, .to_mut_xmm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_mut_xmm, .to_xmm } },
                             },
                             .dst_temps = .{.{ .ref_mask = .{ .ref = .src0, .info = .{
                                 .kind = .all,
@@ -3079,12 +6846,12 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, .p_b, .cmpeq, .dst0x, .src1x, ._, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .sse2, null },
-                            .src_constraints = .{ .{ .int = .word }, .{ .int = .word } },
+                            .required_features = .{ .sse2, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .word }, .{ .scalar_int = .word } },
                             .patterns = &.{
-                                .{ .src = .{ .mut_xmm, .mem } },
-                                .{ .src = .{ .mem, .mut_xmm }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_xmm, .xmm } },
+                                .{ .src = .{ .to_mut_xmm, .mem } },
+                                .{ .src = .{ .mem, .to_mut_xmm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_mut_xmm, .to_xmm } },
                             },
                             .dst_temps = .{.{ .ref_mask = .{ .ref = .src0, .info = .{
                                 .kind = .all,
@@ -3099,12 +6866,12 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, .p_w, .cmpeq, .dst0x, .src1x, ._, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .sse2, null },
-                            .src_constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
+                            .required_features = .{ .sse2, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .dword }, .{ .scalar_int = .dword } },
                             .patterns = &.{
-                                .{ .src = .{ .mut_xmm, .mem } },
-                                .{ .src = .{ .mem, .mut_xmm }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_xmm, .xmm } },
+                                .{ .src = .{ .to_mut_xmm, .mem } },
+                                .{ .src = .{ .mem, .to_mut_xmm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_mut_xmm, .to_xmm } },
                             },
                             .dst_temps = .{.{ .ref_mask = .{ .ref = .src0, .info = .{
                                 .kind = .all,
@@ -3119,12 +6886,12 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, .p_d, .cmpeq, .dst0x, .src1x, ._, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .sse4_1, null },
-                            .src_constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
+                            .required_features = .{ .sse4_1, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .qword }, .{ .scalar_int = .qword } },
                             .patterns = &.{
-                                .{ .src = .{ .mut_xmm, .mem } },
-                                .{ .src = .{ .mem, .mut_xmm }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_xmm, .xmm } },
+                                .{ .src = .{ .to_mut_xmm, .mem } },
+                                .{ .src = .{ .mem, .to_mut_xmm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_mut_xmm, .to_xmm } },
                             },
                             .dst_temps = .{.{ .ref_mask = .{ .ref = .src0, .info = .{
                                 .kind = .all,
@@ -3139,12 +6906,12 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, .p_q, .cmpeq, .dst0x, .src1x, ._, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .mmx, null },
-                            .src_constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
+                            .required_features = .{ .mmx, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .byte }, .{ .scalar_int = .byte } },
                             .patterns = &.{
-                                .{ .src = .{ .mut_mm, .mem } },
-                                .{ .src = .{ .mem, .mut_mm }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_mm, .mm } },
+                                .{ .src = .{ .to_mut_mm, .mem } },
+                                .{ .src = .{ .mem, .to_mut_mm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_mut_mm, .to_mm } },
                             },
                             .dst_temps = .{.{ .ref_mask = .{ .ref = .src0, .info = .{
                                 .kind = .all,
@@ -3159,12 +6926,12 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, .p_b, .cmpeq, .dst0q, .src1q, ._, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .mmx, null },
-                            .src_constraints = .{ .{ .int = .word }, .{ .int = .word } },
+                            .required_features = .{ .mmx, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .word }, .{ .scalar_int = .word } },
                             .patterns = &.{
-                                .{ .src = .{ .mut_mm, .mem } },
-                                .{ .src = .{ .mem, .mut_mm }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_mm, .mm } },
+                                .{ .src = .{ .to_mut_mm, .mem } },
+                                .{ .src = .{ .mem, .to_mut_mm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_mut_mm, .to_mm } },
                             },
                             .dst_temps = .{.{ .ref_mask = .{ .ref = .src0, .info = .{
                                 .kind = .all,
@@ -3179,12 +6946,12 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, .p_w, .cmpeq, .dst0q, .src1q, ._, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .mmx, null },
-                            .src_constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
+                            .required_features = .{ .mmx, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .dword }, .{ .scalar_int = .dword } },
                             .patterns = &.{
-                                .{ .src = .{ .mut_mm, .mem } },
-                                .{ .src = .{ .mem, .mut_mm }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_mm, .mm } },
+                                .{ .src = .{ .to_mut_mm, .mem } },
+                                .{ .src = .{ .mem, .to_mut_mm }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_mut_mm, .to_mm } },
                             },
                             .dst_temps = .{.{ .ref_mask = .{ .ref = .src0, .info = .{
                                 .kind = .all,
@@ -3203,550 +6970,182 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .patterns = &.{
                                 .{ .src = .{ .mut_mem, .imm8 } },
                                 .{ .src = .{ .imm8, .mut_mem }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_gpr, .imm8 } },
-                                .{ .src = .{ .imm8, .mut_gpr }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_mem, .gpr } },
-                                .{ .src = .{ .gpr, .mut_mem }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_gpr, .mem } },
-                                .{ .src = .{ .mem, .mut_gpr }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_gpr, .gpr } },
-                            },
-                            .clobbers = .{ .eflags = true },
-                            .dst_temps = .{.{ .ref = .src0 }},
-                            .each = .{ .once = switch (cc) {
-                                else => unreachable,
-                                .e => &.{
-                                    .{ ._, ._, .xor, .dst0b, .src1b, ._, ._ },
-                                    .{ ._, ._, .not, .dst0b, ._, ._, ._ },
-                                },
-                                .ne => &.{
-                                    .{ ._, ._, .xor, .dst0b, .src1b, ._, ._ },
-                                },
-                            } },
-                        }, .{
-                            .src_constraints = .{ .{ .bool_vec = .word }, .{ .bool_vec = .word } },
-                            .patterns = &.{
-                                .{ .src = .{ .mut_mem, .imm16 } },
-                                .{ .src = .{ .imm16, .mut_mem }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_gpr, .imm16 } },
-                                .{ .src = .{ .imm16, .mut_gpr }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_mem, .gpr } },
-                                .{ .src = .{ .gpr, .mut_mem }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_gpr, .mem } },
-                                .{ .src = .{ .mem, .mut_gpr }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_gpr, .gpr } },
-                            },
-                            .clobbers = .{ .eflags = true },
-                            .dst_temps = .{.{ .ref = .src0 }},
-                            .each = .{ .once = switch (cc) {
-                                else => unreachable,
-                                .e => &.{
-                                    .{ ._, ._, .xor, .dst0w, .src1w, ._, ._ },
-                                    .{ ._, ._, .not, .dst0w, ._, ._, ._ },
-                                },
-                                .ne => &.{
-                                    .{ ._, ._, .xor, .dst0w, .src1w, ._, ._ },
-                                },
-                            } },
-                        }, .{
-                            .src_constraints = .{ .{ .bool_vec = .dword }, .{ .bool_vec = .dword } },
-                            .patterns = &.{
-                                .{ .src = .{ .mut_mem, .imm32 } },
-                                .{ .src = .{ .imm32, .mut_mem }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_gpr, .imm32 } },
-                                .{ .src = .{ .imm32, .mut_gpr }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_mem, .gpr } },
-                                .{ .src = .{ .gpr, .mut_mem }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_gpr, .mem } },
-                                .{ .src = .{ .mem, .mut_gpr }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_gpr, .gpr } },
-                            },
-                            .clobbers = .{ .eflags = true },
-                            .dst_temps = .{.{ .ref = .src0 }},
-                            .each = .{ .once = switch (cc) {
-                                else => unreachable,
-                                .e => &.{
-                                    .{ ._, ._, .xor, .dst0d, .src1d, ._, ._ },
-                                    .{ ._, ._, .not, .dst0d, ._, ._, ._ },
-                                },
-                                .ne => &.{
-                                    .{ ._, ._, .xor, .dst0d, .src1d, ._, ._ },
-                                },
-                            } },
-                        }, .{
-                            .required_features = .{ .@"64bit", null },
-                            .src_constraints = .{ .{ .bool_vec = .qword }, .{ .bool_vec = .qword } },
-                            .patterns = &.{
-                                .{ .src = .{ .mut_mem, .simm32 } },
-                                .{ .src = .{ .simm32, .mut_mem }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_gpr, .simm32 } },
-                                .{ .src = .{ .simm32, .mut_gpr }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_mem, .gpr } },
-                                .{ .src = .{ .gpr, .mut_mem }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_gpr, .mem } },
-                                .{ .src = .{ .mem, .mut_gpr }, .commute = .{ 0, 1 } },
-                                .{ .src = .{ .mut_gpr, .gpr } },
+                                .{ .src = .{ .to_mut_gpr, .imm8 } },
+                                .{ .src = .{ .imm8, .to_mut_gpr }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_mem, .to_gpr } },
+                                .{ .src = .{ .to_gpr, .mut_mem }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_mut_gpr, .mem } },
+                                .{ .src = .{ .mem, .to_mut_gpr }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_mut_gpr, .to_gpr } },
                             },
-                            .clobbers = .{ .eflags = true },
                             .dst_temps = .{.{ .ref = .src0 }},
-                            .each = .{ .once = switch (cc) {
-                                else => unreachable,
-                                .e => &.{
-                                    .{ ._, ._, .xor, .dst0q, .src1q, ._, ._ },
-                                    .{ ._, ._, .not, .dst0q, ._, ._, ._ },
-                                },
-                                .ne => &.{
-                                    .{ ._, ._, .xor, .dst0q, .src1q, ._, ._ },
-                                },
-                            } },
-                        }, .{
-                            .src_constraints = .{ .any_bool_vec, .any_bool_vec },
-                            .patterns = &.{
-                                .{ .src = .{ .to_mem, .to_mem } },
-                            },
                             .clobbers = .{ .eflags = true },
-                            .extra_temps = .{
-                                .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                                .unused,
-                                .unused,
-                                .unused,
-                                .unused,
-                            },
-                            .dst_temps = .{.mem},
-                            .each = .{ .once = switch (cc) {
-                                else => unreachable,
-                                .e => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                    .{ .@"0:", ._, .mov, .tmp1p, .memia(.src0p, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, ._, .xor, .tmp1p, .memia(.src1p, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, ._, .not, .tmp1p, ._, ._, ._ },
-                                    .{ ._, ._, .mov, .memia(.dst0p, .tmp0, .add_size), .tmp1p, ._, ._ },
-                                    .{ ._, ._, .add, .tmp0p, .a(.tmp1, .add_size), ._, ._ },
-                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                },
-                                .ne => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                    .{ .@"0:", ._, .mov, .tmp1p, .memia(.src0p, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, ._, .xor, .tmp1p, .memia(.src1p, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, ._, .mov, .memia(.dst0p, .tmp0, .add_size), .tmp1p, ._, ._ },
-                                    .{ ._, ._, .add, .tmp0p, .a(.tmp1, .add_size), ._, ._ },
-                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                },
-                            } },
-                        }, .{
-                            .required_features = .{ .avx2, null },
-                            .src_constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
-                            .patterns = &.{
-                                .{ .src = .{ .to_mem, .to_mem } },
-                            },
-                            .extra_temps = .{
-                                .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                                .{ .kind = .{ .rc = .sse } },
-                                .unused,
-                                .unused,
-                            },
-                            .dst_temps = .{.mem},
-                            .each = .{ .once = switch (cc) {
-                                else => unreachable,
-                                .e => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ .@"0:", .v_dqu, .mov, .tmp3y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .vp_b, .cmpeq, .tmp3y, .tmp3y, .memia(.src1y, .tmp0, .add_size), ._ },
-                                    .{ ._, .vp_b, .movmsk, .tmp2d, .tmp3y, ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0d, .tmp1), .tmp2d, ._, ._ },
-                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 4), ._, ._ },
-                                    .{ ._, ._, .add, .tmp0p, .i(32), ._, ._ },
-                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                },
-                                .ne => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ .@"0:", .v_dqu, .mov, .tmp3y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .vp_b, .cmpeq, .tmp3y, .tmp3y, .memia(.src1y, .tmp0, .add_size), ._ },
-                                    .{ ._, .vp_b, .movmsk, .tmp2d, .tmp3y, ._, ._ },
-                                    .{ ._, ._, .not, .tmp2d, ._, ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0d, .tmp1), .tmp2d, ._, ._ },
-                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 4), ._, ._ },
-                                    .{ ._, ._, .add, .tmp0p, .i(32), ._, ._ },
-                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                },
-                            } },
-                        }, .{
-                            .required_features = .{ .avx2, null },
-                            .src_constraints = .{ .{ .int = .word }, .{ .int = .word } },
-                            .patterns = &.{
-                                .{ .src = .{ .to_mem, .to_mem } },
-                            },
-                            .extra_temps = .{
-                                .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u16, .kind = .{ .rc = .general_purpose } },
-                                .{ .kind = .{ .rc = .sse } },
-                                .unused,
-                                .unused,
-                            },
-                            .dst_temps = .{.mem},
-                            .each = .{ .once = switch (cc) {
-                                else => unreachable,
-                                .e => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ .@"0:", .v_dqu, .mov, .tmp3y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .vp_w, .cmpeq, .tmp3y, .tmp3y, .memia(.src1y, .tmp0, .add_size), ._ },
-                                    .{ ._, .vp_b, .ackssw, .tmp3y, .tmp3y, .tmp3y, ._ },
-                                    .{ ._, .vp_b, .movmsk, .tmp2d, .tmp3y, ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0w, .tmp1), .tmp2w, ._, ._ },
-                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 2), ._, ._ },
-                                    .{ ._, ._, .add, .tmp0p, .i(32), ._, ._ },
-                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                },
-                                .ne => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ .@"0:", .v_dqu, .mov, .tmp3y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .vp_w, .cmpeq, .tmp3y, .tmp3y, .memia(.src1y, .tmp0, .add_size), ._ },
-                                    .{ ._, .vp_b, .ackssw, .tmp3y, .tmp3y, .tmp3y, ._ },
-                                    .{ ._, .vp_b, .movmsk, .tmp2d, .tmp3y, ._, ._ },
-                                    .{ ._, ._, .not, .tmp2d, ._, ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0w, .tmp1), .tmp2w, ._, ._ },
-                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 2), ._, ._ },
-                                    .{ ._, ._, .add, .tmp0p, .i(32), ._, ._ },
-                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                },
-                            } },
-                        }, .{
-                            .required_features = .{ .avx2, null },
-                            .src_constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
-                            .patterns = &.{
-                                .{ .src = .{ .to_mem, .to_mem } },
-                            },
-                            .extra_temps = .{
-                                .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
-                                .{ .kind = .{ .rc = .sse } },
-                                .unused,
-                                .unused,
-                            },
-                            .dst_temps = .{.mem},
-                            .each = .{ .once = switch (cc) {
-                                else => unreachable,
-                                .e => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ .@"0:", .v_dqu, .mov, .tmp3y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .vp_d, .cmpeq, .tmp3y, .tmp3y, .memia(.src1y, .tmp0, .add_size), ._ },
-                                    .{ ._, .v_ps, .movmsk, .tmp2d, .tmp3y, ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp1), .tmp2b, ._, ._ },
-                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 1), ._, ._ },
-                                    .{ ._, ._, .add, .tmp0p, .i(32), ._, ._ },
-                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                },
-                                .ne => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ .@"0:", .v_dqu, .mov, .tmp3y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .vp_d, .cmpeq, .tmp3y, .tmp3y, .memia(.src1y, .tmp0, .add_size), ._ },
-                                    .{ ._, .v_ps, .movmsk, .tmp2d, .tmp3y, ._, ._ },
-                                    .{ ._, ._, .not, .tmp2b, ._, ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp1), .tmp2b, ._, ._ },
-                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 1), ._, ._ },
-                                    .{ ._, ._, .add, .tmp0p, .i(32), ._, ._ },
-                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                            .each = .{ .once = switch (cc) {
+                                else => unreachable,
+                                .e => &.{
+                                    .{ ._, ._, .xor, .dst0b, .src1b, ._, ._ },
+                                    .{ ._, ._, .not, .dst0b, ._, ._, ._ },
+                                },
+                                .ne => &.{
+                                    .{ ._, ._, .xor, .dst0b, .src1b, ._, ._ },
                                 },
                             } },
                         }, .{
-                            .required_features = .{ .avx2, null },
-                            .src_constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
+                            .src_constraints = .{ .{ .bool_vec = .word }, .{ .bool_vec = .word } },
                             .patterns = &.{
-                                .{ .src = .{ .to_mem, .to_mem } },
-                            },
-                            .extra_temps = .{
-                                .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u32, .kind = .{ .reg = .rcx } },
-                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
-                                .{ .kind = .{ .rc = .sse } },
-                                .unused,
+                                .{ .src = .{ .mut_mem, .imm16 } },
+                                .{ .src = .{ .imm16, .mut_mem }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_mut_gpr, .imm16 } },
+                                .{ .src = .{ .imm16, .to_mut_gpr }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_mem, .to_gpr } },
+                                .{ .src = .{ .to_gpr, .mut_mem }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_mut_gpr, .mem } },
+                                .{ .src = .{ .mem, .to_mut_gpr }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_mut_gpr, .to_gpr } },
                             },
-                            .dst_temps = .{.mem},
+                            .dst_temps = .{.{ .ref = .src0 }},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = switch (cc) {
                                 else => unreachable,
                                 .e => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"0:", .v_dqu, .mov, .tmp4y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .vp_q, .cmpeq, .tmp4y, .tmp4y, .memia(.src1y, .tmp0, .add_size), ._ },
-                                    .{ ._, .v_pd, .movmsk, .tmp3d, .tmp4y, ._, ._ },
-                                    .{ ._, ._l, .ro, .tmp3b, .tmp1b, ._, ._ },
-                                    .{ ._, ._, .@"or", .tmp2b, .tmp3b, ._, ._ },
-                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 4), ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
-                                    .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp3, -1), .tmp2b, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"1:", ._, .add, .tmp0p, .i(32), ._, ._ },
-                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
-                                    .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp3), .tmp2b, ._, ._ },
+                                    .{ ._, ._, .xor, .dst0w, .src1w, ._, ._ },
+                                    .{ ._, ._, .not, .dst0w, ._, ._, ._ },
                                 },
                                 .ne => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"0:", .v_dqu, .mov, .tmp4y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .vp_q, .cmpeq, .tmp4y, .tmp4y, .memia(.src1y, .tmp0, .add_size), ._ },
-                                    .{ ._, .v_pd, .movmsk, .tmp3d, .tmp4y, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp3b, .i(0b1111), ._, ._ },
-                                    .{ ._, ._l, .ro, .tmp3b, .tmp1b, ._, ._ },
-                                    .{ ._, ._, .@"or", .tmp2b, .tmp3b, ._, ._ },
-                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 4), ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
-                                    .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp3, -1), .tmp2b, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"1:", ._, .add, .tmp0p, .i(32), ._, ._ },
-                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
-                                    .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp3), .tmp2b, ._, ._ },
+                                    .{ ._, ._, .xor, .dst0w, .src1w, ._, ._ },
                                 },
                             } },
                         }, .{
-                            .required_features = .{ .avx, null },
-                            .src_constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
+                            .src_constraints = .{ .{ .bool_vec = .dword }, .{ .bool_vec = .dword } },
                             .patterns = &.{
-                                .{ .src = .{ .to_mem, .to_mem } },
-                            },
-                            .extra_temps = .{
-                                .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u16, .kind = .{ .rc = .general_purpose } },
-                                .{ .kind = .{ .rc = .sse } },
-                                .unused,
-                                .unused,
+                                .{ .src = .{ .mut_mem, .imm32 } },
+                                .{ .src = .{ .imm32, .mut_mem }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_mut_gpr, .imm32 } },
+                                .{ .src = .{ .imm32, .to_mut_gpr }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_mem, .to_gpr } },
+                                .{ .src = .{ .to_gpr, .mut_mem }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_mut_gpr, .mem } },
+                                .{ .src = .{ .mem, .to_mut_gpr }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_mut_gpr, .to_gpr } },
                             },
-                            .dst_temps = .{.mem},
+                            .dst_temps = .{.{ .ref = .src0 }},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = switch (cc) {
                                 else => unreachable,
                                 .e => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ .@"0:", .v_dqu, .mov, .tmp3x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .vp_b, .cmpeq, .tmp3x, .tmp3x, .memia(.src1x, .tmp0, .add_size), ._ },
-                                    .{ ._, .vp_b, .movmsk, .tmp2d, .tmp3x, ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0w, .tmp1), .tmp2w, ._, ._ },
-                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 2), ._, ._ },
-                                    .{ ._, ._, .add, .tmp0p, .i(16), ._, ._ },
-                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                                    .{ ._, ._, .xor, .dst0d, .src1d, ._, ._ },
+                                    .{ ._, ._, .not, .dst0d, ._, ._, ._ },
                                 },
                                 .ne => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ .@"0:", .v_dqu, .mov, .tmp3x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .vp_b, .cmpeq, .tmp3x, .tmp3x, .memia(.src1x, .tmp0, .add_size), ._ },
-                                    .{ ._, .vp_b, .movmsk, .tmp2d, .tmp3x, ._, ._ },
-                                    .{ ._, ._, .not, .tmp2d, ._, ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0w, .tmp1), .tmp2w, ._, ._ },
-                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 2), ._, ._ },
-                                    .{ ._, ._, .add, .tmp0p, .i(16), ._, ._ },
-                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                                    .{ ._, ._, .xor, .dst0d, .src1d, ._, ._ },
                                 },
                             } },
                         }, .{
-                            .required_features = .{ .avx, null },
-                            .src_constraints = .{ .{ .int = .word }, .{ .int = .word } },
+                            .required_features = .{ .@"64bit", null, null, null },
+                            .src_constraints = .{ .{ .bool_vec = .qword }, .{ .bool_vec = .qword } },
                             .patterns = &.{
-                                .{ .src = .{ .to_mem, .to_mem } },
-                            },
-                            .extra_temps = .{
-                                .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
-                                .{ .kind = .{ .rc = .sse } },
-                                .unused,
-                                .unused,
+                                .{ .src = .{ .mut_mem, .simm32 } },
+                                .{ .src = .{ .simm32, .mut_mem }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_mut_gpr, .simm32 } },
+                                .{ .src = .{ .simm32, .to_mut_gpr }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .mut_mem, .to_gpr } },
+                                .{ .src = .{ .to_gpr, .mut_mem }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_mut_gpr, .mem } },
+                                .{ .src = .{ .mem, .to_mut_gpr }, .commute = .{ 0, 1 } },
+                                .{ .src = .{ .to_mut_gpr, .to_gpr } },
                             },
-                            .dst_temps = .{.mem},
+                            .dst_temps = .{.{ .ref = .src0 }},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = switch (cc) {
                                 else => unreachable,
                                 .e => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ .@"0:", .v_dqu, .mov, .tmp3x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .vp_w, .cmpeq, .tmp3x, .tmp3x, .memia(.src1x, .tmp0, .add_size), ._ },
-                                    .{ ._, .vp_b, .ackssw, .tmp3x, .tmp3x, .tmp3x, ._ },
-                                    .{ ._, .vp_b, .movmsk, .tmp2d, .tmp3x, ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp1), .tmp2b, ._, ._ },
-                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 1), ._, ._ },
-                                    .{ ._, ._, .add, .tmp0p, .i(16), ._, ._ },
-                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                                    .{ ._, ._, .xor, .dst0q, .src1q, ._, ._ },
+                                    .{ ._, ._, .not, .dst0q, ._, ._, ._ },
                                 },
                                 .ne => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ .@"0:", .v_dqu, .mov, .tmp3x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .vp_w, .cmpeq, .tmp3x, .tmp3x, .memia(.src1x, .tmp0, .add_size), ._ },
-                                    .{ ._, .vp_b, .ackssw, .tmp3x, .tmp3x, .tmp3x, ._ },
-                                    .{ ._, .vp_b, .movmsk, .tmp2d, .tmp3x, ._, ._ },
-                                    .{ ._, ._, .not, .tmp2b, ._, ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp1), .tmp2b, ._, ._ },
-                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 1), ._, ._ },
-                                    .{ ._, ._, .add, .tmp0p, .i(16), ._, ._ },
-                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                                    .{ ._, ._, .xor, .dst0q, .src1q, ._, ._ },
                                 },
                             } },
                         }, .{
-                            .required_features = .{ .avx, null },
-                            .src_constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
+                            .src_constraints = .{ .any_bool_vec, .any_bool_vec },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u32, .kind = .{ .reg = .rcx } },
-                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
-                                .{ .kind = .{ .rc = .sse } },
+                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                                .unused,
+                                .unused,
+                                .unused,
                                 .unused,
                             },
                             .dst_temps = .{.mem},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = switch (cc) {
                                 else => unreachable,
                                 .e => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"0:", .v_dqu, .mov, .tmp4x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .vp_d, .cmpeq, .tmp4x, .tmp4x, .memia(.src1x, .tmp0, .add_size), ._ },
-                                    .{ ._, .v_ps, .movmsk, .tmp3d, .tmp4x, ._, ._ },
-                                    .{ ._, ._l, .ro, .tmp3b, .tmp1b, ._, ._ },
-                                    .{ ._, ._, .@"or", .tmp2b, .tmp3b, ._, ._ },
-                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 4), ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
-                                    .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp3, -1), .tmp2b, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"1:", ._, .add, .tmp0p, .i(16), ._, ._ },
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                                    .{ .@"0:", ._, .mov, .tmp1p, .memia(.src0p, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, ._, .xor, .tmp1p, .memia(.src1p, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, ._, .not, .tmp1p, ._, ._, ._ },
+                                    .{ ._, ._, .mov, .memia(.dst0p, .tmp0, .add_size), .tmp1p, ._, ._ },
+                                    .{ ._, ._, .add, .tmp0p, .sa(.tmp1, .add_size), ._, ._ },
                                     .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
-                                    .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp3), .tmp2b, ._, ._ },
                                 },
                                 .ne => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"0:", .v_dqu, .mov, .tmp4x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .vp_d, .cmpeq, .tmp4x, .tmp4x, .memia(.src1x, .tmp0, .add_size), ._ },
-                                    .{ ._, .v_ps, .movmsk, .tmp3d, .tmp4x, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp3b, .i(0b1111), ._, ._ },
-                                    .{ ._, ._l, .ro, .tmp3b, .tmp1b, ._, ._ },
-                                    .{ ._, ._, .@"or", .tmp2b, .tmp3b, ._, ._ },
-                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 4), ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
-                                    .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp3, -1), .tmp2b, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"1:", ._, .add, .tmp0p, .i(16), ._, ._ },
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                                    .{ .@"0:", ._, .mov, .tmp1p, .memia(.src0p, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, ._, .xor, .tmp1p, .memia(.src1p, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, ._, .mov, .memia(.dst0p, .tmp0, .add_size), .tmp1p, ._, ._ },
+                                    .{ ._, ._, .add, .tmp0p, .sa(.tmp1, .add_size), ._, ._ },
                                     .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
-                                    .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp3), .tmp2b, ._, ._ },
                                 },
                             } },
                         }, .{
-                            .required_features = .{ .avx, null },
-                            .src_constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
+                            .required_features = .{ .avx2, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .byte }, .{ .scalar_int = .byte } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u32, .kind = .{ .reg = .rcx } },
-                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
                                 .{ .kind = .{ .rc = .sse } },
                                 .unused,
+                                .unused,
                             },
                             .dst_temps = .{.mem},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = switch (cc) {
                                 else => unreachable,
                                 .e => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                     .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"0:", .v_dqu, .mov, .tmp4x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .vp_q, .cmpeq, .tmp4x, .tmp4x, .memia(.src1x, .tmp0, .add_size), ._ },
-                                    .{ ._, .v_pd, .movmsk, .tmp3d, .tmp4x, ._, ._ },
-                                    .{ ._, ._l, .ro, .tmp3b, .tmp1b, ._, ._ },
-                                    .{ ._, ._, .@"or", .tmp2b, .tmp3b, ._, ._ },
-                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 2), ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
-                                    .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp3, -1), .tmp2b, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"1:", ._, .add, .tmp0p, .i(16), ._, ._ },
+                                    .{ .@"0:", .v_dqu, .mov, .tmp3y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .vp_b, .cmpeq, .tmp3y, .tmp3y, .memia(.src1y, .tmp0, .add_size), ._ },
+                                    .{ ._, .vp_b, .movmsk, .tmp2d, .tmp3y, ._, ._ },
+                                    .{ ._, ._, .mov, .memi(.dst0d, .tmp1), .tmp2d, ._, ._ },
+                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 4), ._, ._ },
+                                    .{ ._, ._, .add, .tmp0p, .si(32), ._, ._ },
                                     .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
-                                    .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp3), .tmp2b, ._, ._ },
                                 },
                                 .ne => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                     .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"0:", .v_dqu, .mov, .tmp4x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .vp_q, .cmpeq, .tmp4x, .tmp4x, .memia(.src1x, .tmp0, .add_size), ._ },
-                                    .{ ._, .v_pd, .movmsk, .tmp3d, .tmp4x, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp3b, .i(0b11), ._, ._ },
-                                    .{ ._, ._l, .ro, .tmp3b, .tmp1b, ._, ._ },
-                                    .{ ._, ._, .@"or", .tmp2b, .tmp3b, ._, ._ },
-                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 2), ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
-                                    .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp3, -1), .tmp2b, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"1:", ._, .add, .tmp0p, .i(16), ._, ._ },
+                                    .{ .@"0:", .v_dqu, .mov, .tmp3y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .vp_b, .cmpeq, .tmp3y, .tmp3y, .memia(.src1y, .tmp0, .add_size), ._ },
+                                    .{ ._, .vp_b, .movmsk, .tmp2d, .tmp3y, ._, ._ },
+                                    .{ ._, ._, .not, .tmp2d, ._, ._, ._ },
+                                    .{ ._, ._, .mov, .memi(.dst0d, .tmp1), .tmp2d, ._, ._ },
+                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 4), ._, ._ },
+                                    .{ ._, ._, .add, .tmp0p, .si(32), ._, ._ },
                                     .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
-                                    .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp3), .tmp2b, ._, ._ },
                                 },
                             } },
                         }, .{
-                            .required_features = .{ .sse2, null },
-                            .src_constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
+                            .required_features = .{ .avx2, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .word }, .{ .scalar_int = .word } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
@@ -3759,35 +7158,38 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{.mem},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = switch (cc) {
                                 else => unreachable,
                                 .e => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                     .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ .@"0:", ._dqu, .mov, .tmp3x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_b, .cmpeq, .tmp3x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_b, .movmsk, .tmp2d, .tmp3x, ._, ._ },
+                                    .{ .@"0:", .v_dqu, .mov, .tmp3y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .vp_w, .cmpeq, .tmp3y, .tmp3y, .memia(.src1y, .tmp0, .add_size), ._ },
+                                    .{ ._, .vp_b, .ackssw, .tmp3y, .tmp3y, .tmp3y, ._ },
+                                    .{ ._, .vp_b, .movmsk, .tmp2d, .tmp3y, ._, ._ },
                                     .{ ._, ._, .mov, .memi(.dst0w, .tmp1), .tmp2w, ._, ._ },
                                     .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 2), ._, ._ },
-                                    .{ ._, ._, .add, .tmp0p, .i(16), ._, ._ },
+                                    .{ ._, ._, .add, .tmp0p, .si(32), ._, ._ },
                                     .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                                 },
                                 .ne => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                     .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ .@"0:", ._dqu, .mov, .tmp3x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_b, .cmpeq, .tmp3x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_b, .movmsk, .tmp2d, .tmp3x, ._, ._ },
+                                    .{ .@"0:", .v_dqu, .mov, .tmp3y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .vp_w, .cmpeq, .tmp3y, .tmp3y, .memia(.src1y, .tmp0, .add_size), ._ },
+                                    .{ ._, .vp_b, .ackssw, .tmp3y, .tmp3y, .tmp3y, ._ },
+                                    .{ ._, .vp_b, .movmsk, .tmp2d, .tmp3y, ._, ._ },
                                     .{ ._, ._, .not, .tmp2d, ._, ._, ._ },
                                     .{ ._, ._, .mov, .memi(.dst0w, .tmp1), .tmp2w, ._, ._ },
                                     .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 2), ._, ._ },
-                                    .{ ._, ._, .add, .tmp0p, .i(16), ._, ._ },
+                                    .{ ._, ._, .add, .tmp0p, .si(32), ._, ._ },
                                     .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                                 },
                             } },
                         }, .{
-                            .required_features = .{ .sse2, null },
-                            .src_constraints = .{ .{ .int = .word }, .{ .int = .word } },
+                            .required_features = .{ .avx2, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .dword }, .{ .scalar_int = .dword } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
@@ -3800,37 +7202,36 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{.mem},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = switch (cc) {
                                 else => unreachable,
                                 .e => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                     .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ .@"0:", ._dqu, .mov, .tmp3x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_w, .cmpeq, .tmp3x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_b, .ackssw, .tmp3x, .tmp3x, ._, ._ },
-                                    .{ ._, .p_b, .movmsk, .tmp2d, .tmp3x, ._, ._ },
+                                    .{ .@"0:", .v_dqu, .mov, .tmp3y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .vp_d, .cmpeq, .tmp3y, .tmp3y, .memia(.src1y, .tmp0, .add_size), ._ },
+                                    .{ ._, .v_ps, .movmsk, .tmp2d, .tmp3y, ._, ._ },
                                     .{ ._, ._, .mov, .memi(.dst0b, .tmp1), .tmp2b, ._, ._ },
                                     .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 1), ._, ._ },
-                                    .{ ._, ._, .add, .tmp0p, .i(16), ._, ._ },
+                                    .{ ._, ._, .add, .tmp0p, .si(32), ._, ._ },
                                     .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                                 },
                                 .ne => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                     .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ .@"0:", ._dqu, .mov, .tmp3x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_w, .cmpeq, .tmp3x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_b, .ackssw, .tmp3x, .tmp3x, ._, ._ },
-                                    .{ ._, .p_b, .movmsk, .tmp2d, .tmp3x, ._, ._ },
+                                    .{ .@"0:", .v_dqu, .mov, .tmp3y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .vp_d, .cmpeq, .tmp3y, .tmp3y, .memia(.src1y, .tmp0, .add_size), ._ },
+                                    .{ ._, .v_ps, .movmsk, .tmp2d, .tmp3y, ._, ._ },
                                     .{ ._, ._, .not, .tmp2b, ._, ._, ._ },
                                     .{ ._, ._, .mov, .memi(.dst0b, .tmp1), .tmp2b, ._, ._ },
                                     .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 1), ._, ._ },
-                                    .{ ._, ._, .add, .tmp0p, .i(16), ._, ._ },
+                                    .{ ._, ._, .add, .tmp0p, .si(32), ._, ._ },
                                     .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                                 },
                             } },
                         }, .{
-                            .required_features = .{ .sse2, null },
-                            .src_constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
+                            .required_features = .{ .avx2, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .qword }, .{ .scalar_int = .qword } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
@@ -3843,128 +7244,104 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{.mem},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = switch (cc) {
                                 else => unreachable,
                                 .e => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                     .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
                                     .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"0:", ._dqu, .mov, .tmp4x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_d, .cmpeq, .tmp4x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, ._ps, .movmsk, .tmp3d, .tmp4x, ._, ._ },
+                                    .{ .@"0:", .v_dqu, .mov, .tmp4y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .vp_q, .cmpeq, .tmp4y, .tmp4y, .memia(.src1y, .tmp0, .add_size), ._ },
+                                    .{ ._, .v_pd, .movmsk, .tmp3d, .tmp4y, ._, ._ },
                                     .{ ._, ._l, .ro, .tmp3b, .tmp1b, ._, ._ },
                                     .{ ._, ._, .@"or", .tmp2b, .tmp3b, ._, ._ },
                                     .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 4), ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
                                     .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
                                     .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ },
                                     .{ ._, ._, .mov, .memid(.dst0b, .tmp3, -1), .tmp2b, ._, ._ },
                                     .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"1:", ._, .add, .tmp0p, .i(16), ._, ._ },
+                                    .{ .@"1:", ._, .add, .tmp0p, .si(32), ._, ._ },
                                     .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
                                     .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
                                     .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ },
                                     .{ ._, ._, .mov, .memi(.dst0b, .tmp3), .tmp2b, ._, ._ },
                                 },
                                 .ne => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                     .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
                                     .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"0:", ._dqu, .mov, .tmp4x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_d, .cmpeq, .tmp4x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, ._ps, .movmsk, .tmp3d, .tmp4x, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp3b, .i(0b1111), ._, ._ },
+                                    .{ .@"0:", .v_dqu, .mov, .tmp4y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .vp_q, .cmpeq, .tmp4y, .tmp4y, .memia(.src1y, .tmp0, .add_size), ._ },
+                                    .{ ._, .v_pd, .movmsk, .tmp3d, .tmp4y, ._, ._ },
+                                    .{ ._, ._, .xor, .tmp3b, .si(0b1111), ._, ._ },
                                     .{ ._, ._l, .ro, .tmp3b, .tmp1b, ._, ._ },
                                     .{ ._, ._, .@"or", .tmp2b, .tmp3b, ._, ._ },
                                     .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 4), ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
                                     .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
                                     .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ },
                                     .{ ._, ._, .mov, .memid(.dst0b, .tmp3, -1), .tmp2b, ._, ._ },
                                     .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"1:", ._, .add, .tmp0p, .i(16), ._, ._ },
+                                    .{ .@"1:", ._, .add, .tmp0p, .si(32), ._, ._ },
                                     .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
                                     .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
                                     .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ },
                                     .{ ._, ._, .mov, .memi(.dst0b, .tmp3), .tmp2b, ._, ._ },
                                 },
                             } },
                         }, .{
-                            .required_features = .{ .sse4_1, null },
-                            .src_constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
+                            .required_features = .{ .avx, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .byte }, .{ .scalar_int = .byte } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u32, .kind = .{ .reg = .rcx } },
-                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .u16, .kind = .{ .rc = .general_purpose } },
                                 .{ .kind = .{ .rc = .sse } },
                                 .unused,
+                                .unused,
                             },
                             .dst_temps = .{.mem},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = switch (cc) {
                                 else => unreachable,
                                 .e => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                     .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"0:", ._dqu, .mov, .tmp4x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_q, .cmpeq, .tmp4x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, ._pd, .movmsk, .tmp3d, .tmp4x, ._, ._ },
-                                    .{ ._, ._l, .ro, .tmp3b, .tmp1b, ._, ._ },
-                                    .{ ._, ._, .@"or", .tmp2b, .tmp3b, ._, ._ },
+                                    .{ .@"0:", .v_dqu, .mov, .tmp3x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .vp_b, .cmpeq, .tmp3x, .tmp3x, .memia(.src1x, .tmp0, .add_size), ._ },
+                                    .{ ._, .vp_b, .movmsk, .tmp2d, .tmp3x, ._, ._ },
+                                    .{ ._, ._, .mov, .memi(.dst0w, .tmp1), .tmp2w, ._, ._ },
                                     .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 2), ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
-                                    .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp3, -1), .tmp2b, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"1:", ._, .add, .tmp0p, .i(16), ._, ._ },
+                                    .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
                                     .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
-                                    .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp3), .tmp2b, ._, ._ },
                                 },
                                 .ne => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                     .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"0:", ._dqu, .mov, .tmp4x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_q, .cmpeq, .tmp4x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, ._pd, .movmsk, .tmp3d, .tmp4x, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp3b, .i(0b11), ._, ._ },
-                                    .{ ._, ._l, .ro, .tmp3b, .tmp1b, ._, ._ },
-                                    .{ ._, ._, .@"or", .tmp2b, .tmp3b, ._, ._ },
+                                    .{ .@"0:", .v_dqu, .mov, .tmp3x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .vp_b, .cmpeq, .tmp3x, .tmp3x, .memia(.src1x, .tmp0, .add_size), ._ },
+                                    .{ ._, .vp_b, .movmsk, .tmp2d, .tmp3x, ._, ._ },
+                                    .{ ._, ._, .not, .tmp2d, ._, ._, ._ },
+                                    .{ ._, ._, .mov, .memi(.dst0w, .tmp1), .tmp2w, ._, ._ },
                                     .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 2), ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
-                                    .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp3, -1), .tmp2b, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"1:", ._, .add, .tmp0p, .i(16), ._, ._ },
+                                    .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
                                     .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
-                                    .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp3), .tmp2b, ._, ._ },
                                 },
                             } },
                         }, .{
-                            .required_features = .{ .sse, .mmx },
-                            .src_constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
+                            .required_features = .{ .avx, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .word }, .{ .scalar_int = .word } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
@@ -3972,111 +7349,111 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
-                                .{ .kind = .{ .rc = .mmx } },
+                                .{ .kind = .{ .rc = .sse } },
                                 .unused,
                                 .unused,
                             },
                             .dst_temps = .{.mem},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = switch (cc) {
                                 else => unreachable,
                                 .e => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                     .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ .@"0:", ._q, .mov, .tmp3q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_b, .cmpeq, .tmp3q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_b, .movmsk, .tmp2d, .tmp3q, ._, ._ },
+                                    .{ .@"0:", .v_dqu, .mov, .tmp3x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .vp_w, .cmpeq, .tmp3x, .tmp3x, .memia(.src1x, .tmp0, .add_size), ._ },
+                                    .{ ._, .vp_b, .ackssw, .tmp3x, .tmp3x, .tmp3x, ._ },
+                                    .{ ._, .vp_b, .movmsk, .tmp2d, .tmp3x, ._, ._ },
                                     .{ ._, ._, .mov, .memi(.dst0b, .tmp1), .tmp2b, ._, ._ },
                                     .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 1), ._, ._ },
-                                    .{ ._, ._, .add, .tmp0p, .i(8), ._, ._ },
+                                    .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
                                     .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                                 },
                                 .ne => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                     .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                    .{ .@"0:", ._q, .mov, .tmp3q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_b, .cmpeq, .tmp3q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_b, .movmsk, .tmp2d, .tmp3q, ._, ._ },
+                                    .{ .@"0:", .v_dqu, .mov, .tmp3x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .vp_w, .cmpeq, .tmp3x, .tmp3x, .memia(.src1x, .tmp0, .add_size), ._ },
+                                    .{ ._, .vp_b, .ackssw, .tmp3x, .tmp3x, .tmp3x, ._ },
+                                    .{ ._, .vp_b, .movmsk, .tmp2d, .tmp3x, ._, ._ },
                                     .{ ._, ._, .not, .tmp2b, ._, ._, ._ },
                                     .{ ._, ._, .mov, .memi(.dst0b, .tmp1), .tmp2b, ._, ._ },
                                     .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 1), ._, ._ },
-                                    .{ ._, ._, .add, .tmp0p, .i(8), ._, ._ },
+                                    .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
                                     .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                                 },
                             } },
                         }, .{
-                            .required_features = .{ .sse, .mmx },
-                            .src_constraints = .{ .{ .int = .word }, .{ .int = .word } },
+                            .required_features = .{ .avx, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .dword }, .{ .scalar_int = .dword } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .u32, .kind = .{ .reg = .rcx } },
                                 .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
-                                .{ .kind = .{ .rc = .mmx } },
                                 .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
-                                .{ .kind = .{ .rc = .mmx } },
+                                .{ .kind = .{ .rc = .sse } },
+                                .unused,
                             },
                             .dst_temps = .{.mem},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = switch (cc) {
                                 else => unreachable,
                                 .e => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                     .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
                                     .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ ._, .p_, .xor, .tmp3q, .tmp3q, ._, ._ },
-                                    .{ .@"0:", ._q, .mov, .tmp5q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_w, .cmpeq, .tmp5q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_b, .ackssw, .tmp5q, .tmp3q, ._, ._ },
-                                    .{ ._, .p_b, .movmsk, .tmp4d, .tmp5q, ._, ._ },
-                                    .{ ._, ._l, .ro, .tmp4b, .tmp1b, ._, ._ },
-                                    .{ ._, ._, .@"or", .tmp2b, .tmp4b, ._, ._ },
+                                    .{ .@"0:", .v_dqu, .mov, .tmp4x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .vp_d, .cmpeq, .tmp4x, .tmp4x, .memia(.src1x, .tmp0, .add_size), ._ },
+                                    .{ ._, .v_ps, .movmsk, .tmp3d, .tmp4x, ._, ._ },
+                                    .{ ._, ._l, .ro, .tmp3b, .tmp1b, ._, ._ },
+                                    .{ ._, ._, .@"or", .tmp2b, .tmp3b, ._, ._ },
                                     .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 4), ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
                                     .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp4d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp4d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp4, -1), .tmp2b, ._, ._ },
+                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ },
+                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp3, -1), .tmp2b, ._, ._ },
                                     .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"1:", ._, .add, .tmp0p, .i(8), ._, ._ },
+                                    .{ .@"1:", ._, .add, .tmp0p, .si(16), ._, ._ },
                                     .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
                                     .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp4d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp4d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp4), .tmp2b, ._, ._ },
+                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ },
+                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp3), .tmp2b, ._, ._ },
                                 },
                                 .ne => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                     .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
                                     .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ ._, .p_, .xor, .tmp3q, .tmp3q, ._, ._ },
-                                    .{ .@"0:", ._q, .mov, .tmp5q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_w, .cmpeq, .tmp5q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_b, .ackssw, .tmp5q, .tmp3q, ._, ._ },
-                                    .{ ._, .p_b, .movmsk, .tmp4d, .tmp5q, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp4b, .i(0b1111), ._, ._ },
-                                    .{ ._, ._l, .ro, .tmp4b, .tmp1b, ._, ._ },
-                                    .{ ._, ._, .@"or", .tmp2b, .tmp4b, ._, ._ },
+                                    .{ .@"0:", .v_dqu, .mov, .tmp4x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .vp_d, .cmpeq, .tmp4x, .tmp4x, .memia(.src1x, .tmp0, .add_size), ._ },
+                                    .{ ._, .v_ps, .movmsk, .tmp3d, .tmp4x, ._, ._ },
+                                    .{ ._, ._, .xor, .tmp3b, .si(0b1111), ._, ._ },
+                                    .{ ._, ._l, .ro, .tmp3b, .tmp1b, ._, ._ },
+                                    .{ ._, ._, .@"or", .tmp2b, .tmp3b, ._, ._ },
                                     .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 4), ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
                                     .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp4d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp4d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp4, -1), .tmp2b, ._, ._ },
+                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ },
+                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp3, -1), .tmp2b, ._, ._ },
                                     .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"1:", ._, .add, .tmp0p, .i(8), ._, ._ },
+                                    .{ .@"1:", ._, .add, .tmp0p, .si(16), ._, ._ },
                                     .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
                                     .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp4d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp4d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp4), .tmp2b, ._, ._ },
+                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ },
+                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp3), .tmp2b, ._, ._ },
                                 },
                             } },
                         }, .{
-                            .required_features = .{ .sse, .mmx },
-                            .src_constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
+                            .required_features = .{ .avx, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .qword }, .{ .scalar_int = .qword } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
@@ -4084,578 +7461,535 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u32, .kind = .{ .reg = .rcx } },
                                 .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
-                                .{ .kind = .{ .rc = .mmx } },
                                 .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
-                                .{ .kind = .{ .rc = .mmx } },
+                                .{ .kind = .{ .rc = .sse } },
+                                .unused,
                             },
                             .dst_temps = .{.mem},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = switch (cc) {
                                 else => unreachable,
                                 .e => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                     .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
                                     .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ ._, .p_, .xor, .tmp3q, .tmp3q, ._, ._ },
-                                    .{ .@"0:", ._q, .mov, .tmp5q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_d, .cmpeq, .tmp5q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_w, .ackssd, .tmp5q, .tmp3q, ._, ._ },
-                                    .{ ._, .p_b, .ackssw, .tmp5q, .tmp3q, ._, ._ },
-                                    .{ ._, .p_b, .movmsk, .tmp4d, .tmp5q, ._, ._ },
-                                    .{ ._, ._l, .ro, .tmp4b, .tmp1b, ._, ._ },
-                                    .{ ._, ._, .@"or", .tmp2b, .tmp4b, ._, ._ },
+                                    .{ .@"0:", .v_dqu, .mov, .tmp4x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .vp_q, .cmpeq, .tmp4x, .tmp4x, .memia(.src1x, .tmp0, .add_size), ._ },
+                                    .{ ._, .v_pd, .movmsk, .tmp3d, .tmp4x, ._, ._ },
+                                    .{ ._, ._l, .ro, .tmp3b, .tmp1b, ._, ._ },
+                                    .{ ._, ._, .@"or", .tmp2b, .tmp3b, ._, ._ },
                                     .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 2), ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
                                     .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp4d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp4d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp4, -1), .tmp2b, ._, ._ },
+                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ },
+                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp3, -1), .tmp2b, ._, ._ },
                                     .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"1:", ._, .add, .tmp0p, .i(8), ._, ._ },
+                                    .{ .@"1:", ._, .add, .tmp0p, .si(16), ._, ._ },
                                     .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
                                     .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp4d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp4d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp4), .tmp2b, ._, ._ },
+                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ },
+                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp3), .tmp2b, ._, ._ },
                                 },
                                 .ne => &.{
-                                    .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                     .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
                                     .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ ._, .p_, .xor, .tmp3q, .tmp3q, ._, ._ },
-                                    .{ .@"0:", ._q, .mov, .tmp5q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_d, .cmpeq, .tmp5q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
-                                    .{ ._, .p_w, .ackssd, .tmp5q, .tmp3q, ._, ._ },
-                                    .{ ._, .p_b, .ackssw, .tmp5q, .tmp3q, ._, ._ },
-                                    .{ ._, .p_b, .movmsk, .tmp4d, .tmp5q, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp4b, .i(0b11), ._, ._ },
-                                    .{ ._, ._l, .ro, .tmp4b, .tmp1b, ._, ._ },
-                                    .{ ._, ._, .@"or", .tmp2b, .tmp4b, ._, ._ },
+                                    .{ .@"0:", .v_dqu, .mov, .tmp4x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .vp_q, .cmpeq, .tmp4x, .tmp4x, .memia(.src1x, .tmp0, .add_size), ._ },
+                                    .{ ._, .v_pd, .movmsk, .tmp3d, .tmp4x, ._, ._ },
+                                    .{ ._, ._, .xor, .tmp3b, .si(0b11), ._, ._ },
+                                    .{ ._, ._l, .ro, .tmp3b, .tmp1b, ._, ._ },
+                                    .{ ._, ._, .@"or", .tmp2b, .tmp3b, ._, ._ },
                                     .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 2), ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
                                     .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp4d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp4d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp4, -1), .tmp2b, ._, ._ },
-                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
-                                    .{ .@"1:", ._, .add, .tmp0p, .i(8), ._, ._ },
-                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                    .{ ._, ._, .@"test", .tmp1d, .i(0b111), ._, ._ },
-                                    .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
-                                    .{ ._, ._, .mov, .tmp4d, .tmp1d, ._, ._ },
-                                    .{ ._, ._r, .sh, .tmp4d, .i(3), ._, ._ },
-                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp4), .tmp2b, ._, ._ },
-                                },
-                            } },
-                        }, .{
-                            .required_features = .{ .slow_incdec, null },
-                            .dst_constraints = .{.{ .bool_vec = .byte }},
-                            .src_constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
-                            .patterns = &.{
-                                .{ .src = .{ .to_mem, .to_mem } },
-                            },
-                            .extra_temps = .{
-                                .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .reg = .cl } },
-                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
-                                .unused,
-                                .unused,
-                                .unused,
-                            },
-                            .dst_temps = .{.{ .rc = .general_purpose }},
-                            .each = .{ .once = &.{
-                                .{ ._, ._, .xor, .dst0b, .dst0b, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .mov, .tmp2b, .memia(.src0b, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp2b, .memia(.src1b, .tmp0, .add_size), ._, ._ },
-                                .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2b, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0b, .tmp2b, ._, ._ },
-                                .{ ._, ._, .add, .tmp1b, .i(1), ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(1), ._, ._ },
-                                .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                            } },
-                        }, .{
-                            .dst_constraints = .{.{ .bool_vec = .byte }},
-                            .src_constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
-                            .patterns = &.{
-                                .{ .src = .{ .to_mem, .to_mem } },
-                            },
-                            .extra_temps = .{
-                                .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .reg = .cl } },
-                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
-                                .unused,
-                                .unused,
-                                .unused,
-                            },
-                            .dst_temps = .{.{ .rc = .general_purpose }},
-                            .each = .{ .once = &.{
-                                .{ ._, ._, .xor, .dst0b, .dst0b, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .mov, .tmp2b, .memia(.src0b, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp2b, .memia(.src1b, .tmp0, .add_size), ._, ._ },
-                                .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2b, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0b, .tmp2b, ._, ._ },
-                                .{ ._, ._, .inc, .tmp1b, ._, ._, ._ },
-                                .{ ._, ._, .inc, .tmp0p, ._, ._, ._ },
-                                .{ ._, ._nz, .j, .@"0b", ._, ._, ._ },
-                            } },
-                        }, .{
-                            .required_features = .{ .slow_incdec, null },
-                            .dst_constraints = .{.{ .bool_vec = .byte }},
-                            .src_constraints = .{ .{ .int = .word }, .{ .int = .word } },
-                            .patterns = &.{
-                                .{ .src = .{ .to_mem, .to_mem } },
-                            },
-                            .extra_temps = .{
-                                .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .reg = .cl } },
-                                .{ .type = .u16, .kind = .{ .rc = .general_purpose } },
-                                .unused,
-                                .unused,
-                                .unused,
-                            },
-                            .dst_temps = .{.{ .rc = .general_purpose }},
-                            .each = .{ .once = &.{
-                                .{ ._, ._, .xor, .dst0b, .dst0b, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .mov, .tmp2w, .memia(.src0w, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp2w, .memia(.src1w, .tmp0, .add_size), ._, ._ },
-                                .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2d, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .add, .tmp1b, .i(1), ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(2), ._, ._ },
-                                .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                            } },
-                        }, .{
-                            .dst_constraints = .{.{ .bool_vec = .byte }},
-                            .src_constraints = .{ .{ .int = .word }, .{ .int = .word } },
-                            .patterns = &.{
-                                .{ .src = .{ .to_mem, .to_mem } },
-                            },
-                            .extra_temps = .{
-                                .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .reg = .cl } },
-                                .{ .type = .u16, .kind = .{ .rc = .general_purpose } },
-                                .unused,
-                                .unused,
-                                .unused,
-                            },
-                            .dst_temps = .{.{ .rc = .general_purpose }},
-                            .each = .{ .once = &.{
-                                .{ ._, ._, .xor, .dst0b, .dst0b, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .mov, .tmp2w, .memia(.src0w, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp2w, .memia(.src1w, .tmp0, .add_size), ._, ._ },
-                                .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2b, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0b, .tmp2b, ._, ._ },
-                                .{ ._, ._, .inc, .tmp1b, ._, ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(2), ._, ._ },
-                                .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ },
+                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp3, -1), .tmp2b, ._, ._ },
+                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
+                                    .{ .@"1:", ._, .add, .tmp0p, .si(16), ._, ._ },
+                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
+                                    .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
+                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ },
+                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp3), .tmp2b, ._, ._ },
+                                },
                             } },
                         }, .{
-                            .required_features = .{ .slow_incdec, null },
-                            .dst_constraints = .{.{ .bool_vec = .byte }},
-                            .src_constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
+                            .required_features = .{ .sse2, null, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .byte }, .{ .scalar_int = .byte } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .reg = .cl } },
                                 .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                                .unused,
+                                .{ .type = .u16, .kind = .{ .rc = .general_purpose } },
+                                .{ .kind = .{ .rc = .sse } },
                                 .unused,
                                 .unused,
                             },
-                            .dst_temps = .{.{ .rc = .general_purpose }},
-                            .each = .{ .once = &.{
-                                .{ ._, ._, .xor, .dst0b, .dst0b, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .mov, .tmp2d, .memia(.src0d, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp2d, .memia(.src1d, .tmp0, .add_size), ._, ._ },
-                                .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2b, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0b, .tmp2b, ._, ._ },
-                                .{ ._, ._, .add, .tmp1b, .i(1), ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(4), ._, ._ },
-                                .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                            .dst_temps = .{.mem},
+                            .clobbers = .{ .eflags = true },
+                            .each = .{ .once = switch (cc) {
+                                else => unreachable,
+                                .e => &.{
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
+                                    .{ .@"0:", ._dqu, .mov, .tmp3x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .p_b, .cmpeq, .tmp3x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .p_b, .movmsk, .tmp2d, .tmp3x, ._, ._ },
+                                    .{ ._, ._, .mov, .memi(.dst0w, .tmp1), .tmp2w, ._, ._ },
+                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 2), ._, ._ },
+                                    .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
+                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                                },
+                                .ne => &.{
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
+                                    .{ .@"0:", ._dqu, .mov, .tmp3x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .p_b, .cmpeq, .tmp3x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .p_b, .movmsk, .tmp2d, .tmp3x, ._, ._ },
+                                    .{ ._, ._, .not, .tmp2d, ._, ._, ._ },
+                                    .{ ._, ._, .mov, .memi(.dst0w, .tmp1), .tmp2w, ._, ._ },
+                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 2), ._, ._ },
+                                    .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
+                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                                },
                             } },
                         }, .{
-                            .dst_constraints = .{.{ .bool_vec = .byte }},
-                            .src_constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
+                            .required_features = .{ .sse2, null, null, null }, // Entry gated on the SSE2 feature: element-wise ==/!= of word-sized integers, 16 source bytes (8 result bits) per loop iteration.
+                            .src_constraints = .{ .{ .scalar_int = .word }, .{ .scalar_int = .word } }, // both operands constrained to word-sized scalar ints (presumably the vector element type; confirm)
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .reg = .cl } },
                                 .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                                .unused,
+                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } }, // tmp2: receives the mask bits (used as tmp2d / tmp2b below)
+                                .{ .kind = .{ .rc = .sse } }, // tmp3: SSE register holding the 16-byte chunk (used as tmp3x below)
                                 .unused,
                                 .unused,
                             },
-                            .dst_temps = .{.{ .rc = .general_purpose }},
-                            .each = .{ .once = &.{
-                                .{ ._, ._, .xor, .dst0b, .dst0b, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .mov, .tmp2d, .memia(.src0d, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp2d, .memia(.src1d, .tmp0, .add_size), ._, ._ },
-                                .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2b, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0b, .tmp2b, ._, ._ },
-                                .{ ._, ._, .inc, .tmp1b, ._, ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(4), ._, ._ },
-                                .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                            .dst_temps = .{.mem}, // the result lives in memory; it is written through dst0b below
+                            .clobbers = .{ .eflags = true }, // the sequences use add / not / jnc, so flags are modified
+                            .each = .{ .once = switch (cc) { // one instruction sequence per supported condition code
+                                else => unreachable, // only .e and .ne are given sequences in this entry
+                                .e => &.{ // equality: a set result bit marks an equal lane
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ }, // tmp0 = running byte offset; apparently starts at minus the size of src0 (assumes .sa/.sub_size yield that; confirm)
+                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ }, // tmp1 = 0: byte offset into the result
+                                    .{ .@"0:", ._dqu, .mov, .tmp3x, .memia(.src0x, .tmp0, .add_size), ._, ._ }, // loop head (label 0): movdqu loads 16 bytes of src0 at offset tmp0
+                                    .{ ._, .p_w, .cmpeq, .tmp3x, .memia(.src1x, .tmp0, .add_size), ._, ._ }, // pcmpeqw: each word lane becomes all-ones if equal to src1's lane, else zero
+                                    .{ ._, .p_b, .ackssw, .tmp3x, .tmp3x, ._, ._ }, // packsswb with itself: narrows the eight word lanes to bytes (0 and -1 survive signed saturation)
+                                    .{ ._, .p_b, .movmsk, .tmp2d, .tmp3x, ._, ._ }, // pmovmskb: gathers the byte sign bits; the low 8 bits are the 8 lane results
+                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp1), .tmp2b, ._, ._ }, // store one result byte at dst0 + tmp1
+                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 1), ._, ._ }, // tmp1 += 1
+                                    .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ }, // advance to the next 16-byte chunk; produces a carry once the offset wraps past zero
+                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // jnc: repeat from label 0 while no carry occurred
+                                },
+                                .ne => &.{ // inequality: same sequence as .e plus an inversion of the mask byte
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
+                                    .{ .@"0:", ._dqu, .mov, .tmp3x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .p_w, .cmpeq, .tmp3x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .p_b, .ackssw, .tmp3x, .tmp3x, ._, ._ },
+                                    .{ ._, .p_b, .movmsk, .tmp2d, .tmp3x, ._, ._ },
+                                    .{ ._, ._, .not, .tmp2b, ._, ._, ._ }, // invert the byte so set bits mark unequal lanes
+                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp1), .tmp2b, ._, ._ },
+                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 1), ._, ._ },
+                                    .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
+                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                                },
                             } },
                         }, .{
-                            .required_features = .{ .@"64bit", .slow_incdec },
-                            .dst_constraints = .{.{ .bool_vec = .byte }},
-                            .src_constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
+                            .required_features = .{ .sse2, null, null, null }, // Entry gated on the SSE2 feature: element-wise ==/!= of dword-sized integers; each 16-byte chunk yields 4 result bits, which are packed into bytes before being stored.
+                            .src_constraints = .{ .{ .scalar_int = .dword }, .{ .scalar_int = .dword } }, // both operands constrained to dword-sized scalar ints (presumably the vector element type; confirm)
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .reg = .cl } },
-                                .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
-                                .unused,
-                                .unused,
+                                .{ .type = .u32, .kind = .{ .reg = .rcx } }, // tmp1: bit index into the result; pinned to rcx so that tmp1b (cl) can be the rotate count below
+                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } }, // tmp2: byte accumulating result bits until it is stored
+                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } }, // tmp3: scratch for the per-chunk mask and for the store offset
+                                .{ .kind = .{ .rc = .sse } }, // tmp4: SSE register holding the 16-byte chunk (used as tmp4x below)
                                 .unused,
                             },
-                            .dst_temps = .{.{ .rc = .general_purpose }},
-                            .each = .{ .once = &.{
-                                .{ ._, ._, .xor, .dst0b, .dst0b, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .mov, .tmp2q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp2q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
-                                .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2b, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0b, .tmp2b, ._, ._ },
-                                .{ ._, ._, .add, .tmp1b, .i(1), ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(2), ._, ._ },
-                                .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                            .dst_temps = .{.mem}, // the result lives in memory; it is written through dst0b below
+                            .clobbers = .{ .eflags = true }, // the sequences use xor / or / test / add, so flags are modified
+                            .each = .{ .once = switch (cc) { // one instruction sequence per supported condition code
+                                else => unreachable, // only .e and .ne are given sequences in this entry
+                                .e => &.{ // equality: a set result bit marks an equal lane
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ }, // tmp0 = running byte offset; apparently starts at minus the size of src0 (assumes .sa/.sub_size yield that; confirm)
+                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ }, // tmp1 = 0: index of the next result bit
+                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ }, // tmp2 = 0: empty accumulator byte
+                                    .{ .@"0:", ._dqu, .mov, .tmp4x, .memia(.src0x, .tmp0, .add_size), ._, ._ }, // loop head (label 0): movdqu loads 16 bytes of src0 at offset tmp0
+                                    .{ ._, .p_d, .cmpeq, .tmp4x, .memia(.src1x, .tmp0, .add_size), ._, ._ }, // pcmpeqd: each dword lane becomes all-ones if equal to src1's lane, else zero
+                                    .{ ._, ._ps, .movmsk, .tmp3d, .tmp4x, ._, ._ }, // movmskps: tmp3 = 4 bits, one per dword lane (its sign bit)
+                                    .{ ._, ._l, .ro, .tmp3b, .tmp1b, ._, ._ }, // rol by the bit index: moves the 4 bits to their position inside the current byte
+                                    .{ ._, ._, .@"or", .tmp2b, .tmp3b, ._, ._ }, // merge them into the accumulator
+                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 4), ._, ._ }, // bit index += 4
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ }, // low 3 bits nonzero means the current byte is not complete yet
+                                    .{ ._, ._nz, .j, .@"1f", ._, ._, ._ }, // ...in which case skip the store
+                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ }, // tmp3 = bit index
+                                    .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ }, // tmp3 = bit index / 8 = number of completed bytes
+                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp3, -1), .tmp2b, ._, ._ }, // store the completed byte at dst0 + tmp3 - 1
+                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ }, // reset the accumulator
+                                    .{ .@"1:", ._, .add, .tmp0p, .si(16), ._, ._ }, // label 1: advance to the next 16-byte chunk; produces a carry once the offset wraps past zero
+                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // jnc: repeat from label 0 while no carry occurred
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ }, // after the loop: are there bits pending in a partial byte?
+                                    .{ ._, ._z, .j, .@"0f", ._, ._, ._ }, // none pending: jump to forward label 0, which is not defined within this sequence (presumably resolves to its end; confirm)
+                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ }, // tmp3 = bit index
+                                    .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ }, // tmp3 = offset of the partial byte
+                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp3), .tmp2b, ._, ._ }, // store the trailing partial byte at dst0 + tmp3
+                                },
+                                .ne => &.{ // inequality: same sequence as .e plus an inversion of the 4 mask bits
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
+                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
+                                    .{ .@"0:", ._dqu, .mov, .tmp4x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .p_d, .cmpeq, .tmp4x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, ._ps, .movmsk, .tmp3d, .tmp4x, ._, ._ },
+                                    .{ ._, ._, .xor, .tmp3b, .si(0b1111), ._, ._ }, // flip the 4 lane bits so set bits mark unequal lanes
+                                    .{ ._, ._l, .ro, .tmp3b, .tmp1b, ._, ._ },
+                                    .{ ._, ._, .@"or", .tmp2b, .tmp3b, ._, ._ },
+                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 4), ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
+                                    .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
+                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ },
+                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp3, -1), .tmp2b, ._, ._ },
+                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
+                                    .{ .@"1:", ._, .add, .tmp0p, .si(16), ._, ._ },
+                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
+                                    .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
+                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ },
+                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp3), .tmp2b, ._, ._ },
+                                },
                             } },
                         }, .{
-                            .required_features = .{ .@"64bit", null },
-                            .dst_constraints = .{.{ .bool_vec = .byte }},
-                            .src_constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
+                            .required_features = .{ .sse4_1, null, null, null }, // Entry gated on the SSE4.1 feature: element-wise ==/!= of qword-sized integers; each 16-byte chunk yields 2 result bits, which are packed into bytes before being stored.
+                            .src_constraints = .{ .{ .scalar_int = .qword }, .{ .scalar_int = .qword } }, // both operands constrained to qword-sized scalar ints (presumably the vector element type; confirm)
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .reg = .cl } },
-                                .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
-                                .unused,
-                                .unused,
-                                .unused,
-                            },
-                            .dst_temps = .{.{ .rc = .general_purpose }},
-                            .each = .{ .once = &.{
-                                .{ ._, ._, .xor, .dst0b, .dst0b, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .mov, .tmp2q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp2q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
-                                .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2b, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0b, .tmp2b, ._, ._ },
-                                .{ ._, ._, .inc, .tmp1b, ._, ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(2), ._, ._ },
-                                .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                            } },
-                        }, .{
-                            .required_features = .{ .slow_incdec, null },
-                            .dst_constraints = .{.{ .bool_vec = .byte }},
-                            .patterns = &.{
-                                .{ .src = .{ .to_mem, .to_mem } },
-                            },
-                            .extra_temps = .{
-                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .reg = .cl } },
-                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                                .unused,
-                            },
-                            .dst_temps = .{.{ .rc = .general_purpose }},
-                            .each = .{ .once = &.{
-                                .{ ._, ._, .xor, .dst0b, .dst0b, ._, ._ },
-                                .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ },
-                                .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .mov, .tmp2d, .a(.src0p, .add_elem_limbs), ._, ._ },
-                                .{ ._, ._, .xor, .tmp3d, .tmp3d, ._, ._ },
-                                .{ .@"1:", ._, .mov, .tmp4p, .memi(.src0p, .tmp0), ._, ._ },
-                                .{ ._, ._, .xor, .tmp4p, .memi(.src1p, .tmp0), ._, ._ },
-                                .{ ._, ._, .@"or", .tmp3p, .tmp4p, ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .a(.tmp4, .add_size), ._, ._ },
-                                .{ ._, ._, .sub, .tmp2d, .i(1), ._, ._ },
-                                .{ ._, ._b, .j, .@"1b", ._, ._, ._ },
-                                .{ ._, ._, .@"test", .tmp3p, .tmp3p, ._, ._ },
-                                .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2b, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0b, .tmp2b, ._, ._ },
-                                .{ ._, ._, .add, .tmp1b, .i(1), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp1b, .a(.dst0, .add_len), ._, ._ },
-                                .{ ._, ._b, .j, .@"0b", ._, ._, ._ },
-                            } },
-                        }, .{
-                            .dst_constraints = .{.{ .bool_vec = .byte }},
-                            .patterns = &.{
-                                .{ .src = .{ .to_mem, .to_mem } },
-                            },
-                            .extra_temps = .{
-                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .reg = .cl } },
-                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .u32, .kind = .{ .reg = .rcx } }, // tmp1: bit index into the result; pinned to rcx so that tmp1b (cl) can be the rotate count below
+                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } }, // tmp2: byte accumulating result bits until it is stored
+                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } }, // tmp3: scratch for the per-chunk mask and for the store offset
+                                .{ .kind = .{ .rc = .sse } }, // tmp4: SSE register holding the 16-byte chunk (used as tmp4x below)
                                 .unused,
                             },
-                            .dst_temps = .{.{ .rc = .general_purpose }},
-                            .each = .{ .once = &.{
-                                .{ ._, ._, .xor, .dst0b, .dst0b, ._, ._ },
-                                .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ },
-                                .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .mov, .tmp2d, .a(.src0p, .add_elem_limbs), ._, ._ },
-                                .{ ._, ._, .xor, .tmp3d, .tmp3d, ._, ._ },
-                                .{ .@"1:", ._, .mov, .tmp4p, .memi(.src0p, .tmp0), ._, ._ },
-                                .{ ._, ._, .xor, .tmp4p, .memi(.src1p, .tmp0), ._, ._ },
-                                .{ ._, ._, .@"or", .tmp3p, .tmp4p, ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .a(.tmp4, .add_size), ._, ._ },
-                                .{ ._, ._, .dec, .tmp2d, ._, ._, ._ },
-                                .{ ._, ._nz, .j, .@"1b", ._, ._, ._ },
-                                .{ ._, ._, .@"test", .tmp3p, .tmp3p, ._, ._ },
-                                .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2b, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0b, .tmp2b, ._, ._ },
-                                .{ ._, ._, .inc, .tmp1b, ._, ._, ._ },
-                                .{ ._, ._, .cmp, .tmp1b, .a(.dst0, .add_len), ._, ._ },
-                                .{ ._, ._b, .j, .@"0b", ._, ._, ._ },
+                            .dst_temps = .{.mem}, // the result lives in memory; it is written through dst0b below
+                            .clobbers = .{ .eflags = true }, // the sequences use xor / or / test / add, so flags are modified
+                            .each = .{ .once = switch (cc) { // one instruction sequence per supported condition code
+                                else => unreachable, // only .e and .ne are given sequences in this entry
+                                .e => &.{ // equality: a set result bit marks an equal lane
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ }, // tmp0 = running byte offset; apparently starts at minus the size of src0 (assumes .sa/.sub_size yield that; confirm)
+                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ }, // tmp1 = 0: index of the next result bit
+                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ }, // tmp2 = 0: empty accumulator byte
+                                    .{ .@"0:", ._dqu, .mov, .tmp4x, .memia(.src0x, .tmp0, .add_size), ._, ._ }, // loop head (label 0): movdqu loads 16 bytes of src0 at offset tmp0
+                                    .{ ._, .p_q, .cmpeq, .tmp4x, .memia(.src1x, .tmp0, .add_size), ._, ._ }, // pcmpeqq: each qword lane becomes all-ones if equal to src1's lane, else zero
+                                    .{ ._, ._pd, .movmsk, .tmp3d, .tmp4x, ._, ._ }, // movmskpd: tmp3 = 2 bits, one per qword lane (its sign bit)
+                                    .{ ._, ._l, .ro, .tmp3b, .tmp1b, ._, ._ }, // rol by the bit index: moves the 2 bits to their position inside the current byte
+                                    .{ ._, ._, .@"or", .tmp2b, .tmp3b, ._, ._ }, // merge them into the accumulator
+                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 2), ._, ._ }, // bit index += 2
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ }, // low 3 bits nonzero means the current byte is not complete yet
+                                    .{ ._, ._nz, .j, .@"1f", ._, ._, ._ }, // ...in which case skip the store
+                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ }, // tmp3 = bit index
+                                    .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ }, // tmp3 = bit index / 8 = number of completed bytes
+                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp3, -1), .tmp2b, ._, ._ }, // store the completed byte at dst0 + tmp3 - 1
+                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ }, // reset the accumulator
+                                    .{ .@"1:", ._, .add, .tmp0p, .si(16), ._, ._ }, // label 1: advance to the next 16-byte chunk; produces a carry once the offset wraps past zero
+                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // jnc: repeat from label 0 while no carry occurred
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ }, // after the loop: are there bits pending in a partial byte?
+                                    .{ ._, ._z, .j, .@"0f", ._, ._, ._ }, // none pending: jump to forward label 0, which is not defined within this sequence (presumably resolves to its end; confirm)
+                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ }, // tmp3 = bit index
+                                    .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ }, // tmp3 = offset of the partial byte
+                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp3), .tmp2b, ._, ._ }, // store the trailing partial byte at dst0 + tmp3
+                                },
+                                .ne => &.{ // inequality: same sequence as .e plus an inversion of the 2 mask bits
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
+                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
+                                    .{ .@"0:", ._dqu, .mov, .tmp4x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .p_q, .cmpeq, .tmp4x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, ._pd, .movmsk, .tmp3d, .tmp4x, ._, ._ },
+                                    .{ ._, ._, .xor, .tmp3b, .si(0b11), ._, ._ }, // flip the 2 lane bits so set bits mark unequal lanes
+                                    .{ ._, ._l, .ro, .tmp3b, .tmp1b, ._, ._ },
+                                    .{ ._, ._, .@"or", .tmp2b, .tmp3b, ._, ._ },
+                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 2), ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
+                                    .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
+                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ },
+                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp3, -1), .tmp2b, ._, ._ },
+                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
+                                    .{ .@"1:", ._, .add, .tmp0p, .si(16), ._, ._ },
+                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
+                                    .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
+                                    .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ },
+                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp3), .tmp2b, ._, ._ },
+                                },
                             } },
                         }, .{
-                            .required_features = .{ .slow_incdec, null },
-                            .dst_constraints = .{.{ .bool_vec = .dword }},
-                            .src_constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
+                            .required_features = .{ .sse, .mmx, null, null }, // Entry gated on both the SSE and MMX features: element-wise ==/!= of byte-sized integers using an MMX register, 8 source bytes (8 result bits) per loop iteration.
+                            .src_constraints = .{ .{ .scalar_int = .byte }, .{ .scalar_int = .byte } }, // both operands constrained to byte-sized scalar ints (presumably the vector element type; confirm)
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .reg = .cl } },
                                 .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+                                .{ .kind = .{ .rc = .mmx } }, // tmp3: MMX register holding the 8-byte chunk (used as tmp3q below)
                                 .unused,
                                 .unused,
                             },
-                            .dst_temps = .{.{ .rc = .general_purpose }},
-                            .each = .{ .once = &.{
-                                .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp3b, .memia(.src0b, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp3b, .memia(.src1b, .tmp0, .add_size), ._, ._ },
-                                .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2d, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .add, .tmp1b, .i(1), ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(1), ._, ._ },
-                                .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                            .dst_temps = .{.mem}, // the result lives in memory; it is written through dst0b below
+                            .clobbers = .{ .eflags = true }, // the sequences use xor / add / not / jnc, so flags are modified
+                            .each = .{ .once = switch (cc) { // one instruction sequence per supported condition code
+                                else => unreachable, // only .e and .ne are given sequences in this entry
+                                .e => &.{ // equality: a set result bit marks an equal lane
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ }, // tmp0 = running byte offset; apparently starts at minus the size of src0 (assumes .sa/.sub_size yield that; confirm)
+                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ }, // tmp1 = 0: byte offset into the result
+                                    .{ .@"0:", ._q, .mov, .tmp3q, .memia(.src0q, .tmp0, .add_size), ._, ._ }, // loop head (label 0): movq loads 8 bytes of src0 at offset tmp0
+                                    .{ ._, .p_b, .cmpeq, .tmp3q, .memia(.src1q, .tmp0, .add_size), ._, ._ }, // pcmpeqb: each byte lane becomes all-ones if equal to src1's lane, else zero
+                                    .{ ._, .p_b, .movmsk, .tmp2d, .tmp3q, ._, ._ }, // pmovmskb: tmp2 = 8 bits, one per byte lane (its sign bit)
+                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp1), .tmp2b, ._, ._ }, // store one result byte at dst0 + tmp1
+                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 1), ._, ._ }, // tmp1 += 1
+                                    .{ ._, ._, .add, .tmp0p, .si(8), ._, ._ }, // advance to the next 8-byte chunk; produces a carry once the offset wraps past zero
+                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ }, // jnc: repeat from label 0 while no carry occurred
+                                },
+                                .ne => &.{ // inequality: same sequence as .e plus an inversion of the mask byte
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
+                                    .{ .@"0:", ._q, .mov, .tmp3q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .p_b, .cmpeq, .tmp3q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .p_b, .movmsk, .tmp2d, .tmp3q, ._, ._ },
+                                    .{ ._, ._, .not, .tmp2b, ._, ._, ._ }, // invert the byte so set bits mark unequal lanes
+                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp1), .tmp2b, ._, ._ },
+                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 1), ._, ._ },
+                                    .{ ._, ._, .add, .tmp0p, .si(8), ._, ._ },
+                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                                },
                             } },
                         }, .{
-                            .dst_constraints = .{.{ .bool_vec = .dword }},
-                            .src_constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
+                            .required_features = .{ .sse, .mmx, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .word }, .{ .scalar_int = .word } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .reg = .cl } },
                                 .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
-                                .unused,
-                                .unused,
+                                .{ .kind = .{ .rc = .mmx } },
+                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+                                .{ .kind = .{ .rc = .mmx } },
                             },
-                            .dst_temps = .{.{ .rc = .general_purpose }},
-                            .each = .{ .once = &.{
-                                .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp3b, .memia(.src0b, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp3b, .memia(.src1b, .tmp0, .add_size), ._, ._ },
-                                .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2d, .tmp1d, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .inc, .tmp1b, ._, ._, ._ },
-                                .{ ._, ._, .inc, .tmp0p, ._, ._, ._ },
-                                .{ ._, ._nz, .j, .@"0b", ._, ._, ._ },
+                            .dst_temps = .{.mem},
+                            .clobbers = .{ .eflags = true },
+                            .each = .{ .once = switch (cc) {
+                                else => unreachable,
+                                .e => &.{
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
+                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
+                                    .{ ._, .p_, .xor, .tmp3q, .tmp3q, ._, ._ },
+                                    .{ .@"0:", ._q, .mov, .tmp5q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .p_w, .cmpeq, .tmp5q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .p_b, .ackssw, .tmp5q, .tmp3q, ._, ._ },
+                                    .{ ._, .p_b, .movmsk, .tmp4d, .tmp5q, ._, ._ },
+                                    .{ ._, ._l, .ro, .tmp4b, .tmp1b, ._, ._ },
+                                    .{ ._, ._, .@"or", .tmp2b, .tmp4b, ._, ._ },
+                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 4), ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
+                                    .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
+                                    .{ ._, ._, .mov, .tmp4d, .tmp1d, ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp4d, .si(3), ._, ._ },
+                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp4, -1), .tmp2b, ._, ._ },
+                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
+                                    .{ .@"1:", ._, .add, .tmp0p, .si(8), ._, ._ },
+                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
+                                    .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
+                                    .{ ._, ._, .mov, .tmp4d, .tmp1d, ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp4d, .si(3), ._, ._ },
+                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp4), .tmp2b, ._, ._ },
+                                },
+                                .ne => &.{
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
+                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
+                                    .{ ._, .p_, .xor, .tmp3q, .tmp3q, ._, ._ },
+                                    .{ .@"0:", ._q, .mov, .tmp5q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .p_w, .cmpeq, .tmp5q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .p_b, .ackssw, .tmp5q, .tmp3q, ._, ._ },
+                                    .{ ._, .p_b, .movmsk, .tmp4d, .tmp5q, ._, ._ },
+                                    .{ ._, ._, .xor, .tmp4b, .si(0b1111), ._, ._ },
+                                    .{ ._, ._l, .ro, .tmp4b, .tmp1b, ._, ._ },
+                                    .{ ._, ._, .@"or", .tmp2b, .tmp4b, ._, ._ },
+                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 4), ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
+                                    .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
+                                    .{ ._, ._, .mov, .tmp4d, .tmp1d, ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp4d, .si(3), ._, ._ },
+                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp4, -1), .tmp2b, ._, ._ },
+                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
+                                    .{ .@"1:", ._, .add, .tmp0p, .si(8), ._, ._ },
+                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
+                                    .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
+                                    .{ ._, ._, .mov, .tmp4d, .tmp1d, ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp4d, .si(3), ._, ._ },
+                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp4), .tmp2b, ._, ._ },
+                                },
                             } },
                         }, .{
-                            .required_features = .{ .slow_incdec, null },
-                            .dst_constraints = .{.{ .bool_vec = .dword }},
-                            .src_constraints = .{ .{ .int = .word }, .{ .int = .word } },
+                            .required_features = .{ .sse, .mmx, null, null },
+                            .src_constraints = .{ .{ .scalar_int = .dword }, .{ .scalar_int = .dword } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .reg = .cl } },
-                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u16, .kind = .{ .rc = .general_purpose } },
-                                .unused,
-                                .unused,
+                                .{ .type = .u32, .kind = .{ .reg = .rcx } },
+                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+                                .{ .kind = .{ .rc = .mmx } },
+                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+                                .{ .kind = .{ .rc = .mmx } },
                             },
-                            .dst_temps = .{.{ .rc = .general_purpose }},
-                            .each = .{ .once = &.{
-                                .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp3w, .memia(.src0w, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp3w, .memia(.src1w, .tmp0, .add_size), ._, ._ },
-                                .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2d, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .add, .tmp1b, .i(1), ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(2), ._, ._ },
-                                .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                            .dst_temps = .{.mem},
+                            .clobbers = .{ .eflags = true },
+                            .each = .{ .once = switch (cc) {
+                                else => unreachable,
+                                .e => &.{
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
+                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
+                                    .{ ._, .p_, .xor, .tmp3q, .tmp3q, ._, ._ },
+                                    .{ .@"0:", ._q, .mov, .tmp5q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .p_d, .cmpeq, .tmp5q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .p_w, .ackssd, .tmp5q, .tmp3q, ._, ._ },
+                                    .{ ._, .p_b, .ackssw, .tmp5q, .tmp3q, ._, ._ },
+                                    .{ ._, .p_b, .movmsk, .tmp4d, .tmp5q, ._, ._ },
+                                    .{ ._, ._l, .ro, .tmp4b, .tmp1b, ._, ._ },
+                                    .{ ._, ._, .@"or", .tmp2b, .tmp4b, ._, ._ },
+                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 2), ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
+                                    .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
+                                    .{ ._, ._, .mov, .tmp4d, .tmp1d, ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp4d, .si(3), ._, ._ },
+                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp4, -1), .tmp2b, ._, ._ },
+                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
+                                    .{ .@"1:", ._, .add, .tmp0p, .si(8), ._, ._ },
+                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
+                                    .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
+                                    .{ ._, ._, .mov, .tmp4d, .tmp1d, ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp4d, .si(3), ._, ._ },
+                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp4), .tmp2b, ._, ._ },
+                                },
+                                .ne => &.{
+                                    .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                                    .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
+                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
+                                    .{ ._, .p_, .xor, .tmp3q, .tmp3q, ._, ._ },
+                                    .{ .@"0:", ._q, .mov, .tmp5q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .p_d, .cmpeq, .tmp5q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
+                                    .{ ._, .p_w, .ackssd, .tmp5q, .tmp3q, ._, ._ },
+                                    .{ ._, .p_b, .ackssw, .tmp5q, .tmp3q, ._, ._ },
+                                    .{ ._, .p_b, .movmsk, .tmp4d, .tmp5q, ._, ._ },
+                                    .{ ._, ._, .xor, .tmp4b, .si(0b11), ._, ._ },
+                                    .{ ._, ._l, .ro, .tmp4b, .tmp1b, ._, ._ },
+                                    .{ ._, ._, .@"or", .tmp2b, .tmp4b, ._, ._ },
+                                    .{ ._, ._, .lea, .tmp1d, .lead(.none, .tmp1, 2), ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
+                                    .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
+                                    .{ ._, ._, .mov, .tmp4d, .tmp1d, ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp4d, .si(3), ._, ._ },
+                                    .{ ._, ._, .mov, .memid(.dst0b, .tmp4, -1), .tmp2b, ._, ._ },
+                                    .{ ._, ._, .xor, .tmp2b, .tmp2b, ._, ._ },
+                                    .{ .@"1:", ._, .add, .tmp0p, .si(8), ._, ._ },
+                                    .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                                    .{ ._, ._, .@"test", .tmp1d, .si(0b111), ._, ._ },
+                                    .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
+                                    .{ ._, ._, .mov, .tmp4d, .tmp1d, ._, ._ },
+                                    .{ ._, ._r, .sh, .tmp4d, .si(3), ._, ._ },
+                                    .{ ._, ._, .mov, .memi(.dst0b, .tmp4), .tmp2b, ._, ._ },
+                                },
                             } },
                         }, .{
-                            .dst_constraints = .{.{ .bool_vec = .dword }},
-                            .src_constraints = .{ .{ .int = .word }, .{ .int = .word } },
+                            .dst_constraints = .{.{ .bool_vec = .byte }},
+                            .src_constraints = .{ .{ .scalar_int = .byte }, .{ .scalar_int = .byte } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u8, .kind = .{ .reg = .cl } },
-                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u16, .kind = .{ .rc = .general_purpose } },
-                                .unused,
+                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
                                 .unused,
-                            },
-                            .dst_temps = .{.{ .rc = .general_purpose }},
-                            .each = .{ .once = &.{
-                                .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp3w, .memia(.src0w, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp3w, .memia(.src1w, .tmp0, .add_size), ._, ._ },
-                                .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2d, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .inc, .tmp1b, ._, ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(2), ._, ._ },
-                                .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                            } },
-                        }, .{
-                            .required_features = .{ .slow_incdec, null },
-                            .dst_constraints = .{.{ .bool_vec = .dword }},
-                            .src_constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
-                            .patterns = &.{
-                                .{ .src = .{ .to_mem, .to_mem } },
-                            },
-                            .extra_temps = .{
-                                .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .reg = .cl } },
-                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
                                 .unused,
                                 .unused,
                             },
                             .dst_temps = .{.{ .rc = .general_purpose }},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
-                                .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                .{ ._, ._, .xor, .dst0b, .dst0b, ._, ._ },
+                                .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                 .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp3d, .memia(.src0d, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp3d, .memia(.src1d, .tmp0, .add_size), ._, ._ },
+                                .{ .@"0:", ._, .mov, .tmp2b, .memia(.src0b, .tmp0, .add_size), ._, ._ },
+                                .{ ._, ._, .cmp, .tmp2b, .memia(.src1b, .tmp0, .add_size), ._, ._ },
                                 .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2d, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .add, .tmp1b, .i(1), ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(4), ._, ._ },
+                                .{ ._, ._l, .sh, .tmp2b, .tmp1b, ._, ._ },
+                                .{ ._, ._, .@"or", .dst0b, .tmp2b, ._, ._ },
+                                .{ ._, ._, .add, .tmp1b, .si(1), ._, ._ },
+                                .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ },
                                 .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                             } },
                         }, .{
-                            .dst_constraints = .{.{ .bool_vec = .dword }},
-                            .src_constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
+                            .dst_constraints = .{.{ .bool_vec = .byte }},
+                            .src_constraints = .{ .{ .scalar_int = .word }, .{ .scalar_int = .word } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u8, .kind = .{ .reg = .cl } },
-                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                                .unused,
+                                .{ .type = .u16, .kind = .{ .rc = .general_purpose } },
                                 .unused,
-                            },
-                            .dst_temps = .{.{ .rc = .general_purpose }},
-                            .each = .{ .once = &.{
-                                .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp3d, .memia(.src0d, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp3d, .memia(.src1d, .tmp0, .add_size), ._, ._ },
-                                .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2d, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .inc, .tmp1b, ._, ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(4), ._, ._ },
-                                .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                            } },
-                        }, .{
-                            .required_features = .{ .@"64bit", .slow_incdec },
-                            .dst_constraints = .{.{ .bool_vec = .dword }},
-                            .src_constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
-                            .patterns = &.{
-                                .{ .src = .{ .to_mem, .to_mem } },
-                            },
-                            .extra_temps = .{
-                                .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .reg = .cl } },
-                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
                                 .unused,
                                 .unused,
                             },
                             .dst_temps = .{.{ .rc = .general_purpose }},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
-                                .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                .{ ._, ._, .xor, .dst0b, .dst0b, ._, ._ },
+                                .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                 .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp3q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp3q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
+                                .{ .@"0:", ._, .mov, .tmp2w, .memia(.src0w, .tmp0, .add_size), ._, ._ },
+                                .{ ._, ._, .cmp, .tmp2w, .memia(.src1w, .tmp0, .add_size), ._, ._ },
                                 .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
                                 .{ ._, ._l, .sh, .tmp2d, .tmp1b, ._, ._ },
                                 .{ ._, ._, .@"or", .dst0d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .add, .tmp1b, .i(1), ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(2), ._, ._ },
+                                .{ ._, ._, .add, .tmp1b, .si(1), ._, ._ },
+                                .{ ._, ._, .add, .tmp0p, .si(2), ._, ._ },
                                 .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .@"64bit", null },
-                            .dst_constraints = .{.{ .bool_vec = .dword }},
-                            .src_constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
+                            .dst_constraints = .{.{ .bool_vec = .byte }},
+                            .src_constraints = .{ .{ .scalar_int = .dword }, .{ .scalar_int = .dword } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
@@ -4663,249 +7997,251 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u8, .kind = .{ .reg = .cl } },
                                 .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                                .unused,
                                 .unused,
                                 .unused,
                             },
                             .dst_temps = .{.{ .rc = .general_purpose }},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
-                                .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                .{ ._, ._, .xor, .dst0b, .dst0b, ._, ._ },
+                                .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                 .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp3q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp3q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
+                                .{ .@"0:", ._, .mov, .tmp2d, .memia(.src0d, .tmp0, .add_size), ._, ._ },
+                                .{ ._, ._, .cmp, .tmp2d, .memia(.src1d, .tmp0, .add_size), ._, ._ },
                                 .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2d, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .inc, .tmp1b, ._, ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(2), ._, ._ },
+                                .{ ._, ._l, .sh, .tmp2b, .tmp1b, ._, ._ },
+                                .{ ._, ._, .@"or", .dst0b, .tmp2b, ._, ._ },
+                                .{ ._, ._, .add, .tmp1b, .si(1), ._, ._ },
+                                .{ ._, ._, .add, .tmp0p, .si(4), ._, ._ },
                                 .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .slow_incdec, null },
-                            .dst_constraints = .{.{ .bool_vec = .dword }},
+                            .required_features = .{ .@"64bit", null, null, null },
+                            .dst_constraints = .{.{ .bool_vec = .byte }},
+                            .src_constraints = .{ .{ .scalar_int = .qword }, .{ .scalar_int = .qword } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
-                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u8, .kind = .{ .reg = .cl } },
-                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                                .unused,
+                                .unused,
                                 .unused,
                             },
                             .dst_temps = .{.{ .rc = .general_purpose }},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
-                                .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
-                                .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ },
+                                .{ ._, ._, .xor, .dst0b, .dst0b, ._, ._ },
+                                .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                 .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .mov, .tmp2d, .a(.src0p, .add_elem_limbs), ._, ._ },
-                                .{ ._, ._, .xor, .tmp3d, .tmp3d, ._, ._ },
-                                .{ .@"1:", ._, .mov, .tmp4p, .memi(.src0p, .tmp0), ._, ._ },
-                                .{ ._, ._, .xor, .tmp4p, .memi(.src1p, .tmp0), ._, ._ },
-                                .{ ._, ._, .@"or", .tmp3p, .tmp4p, ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .a(.tmp4, .add_size), ._, ._ },
-                                .{ ._, ._, .sub, .tmp2d, .i(1), ._, ._ },
-                                .{ ._, ._b, .j, .@"1b", ._, ._, ._ },
-                                .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .@"test", .tmp3p, .tmp3p, ._, ._ },
+                                .{ .@"0:", ._, .mov, .tmp2q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
+                                .{ ._, ._, .cmp, .tmp2q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
                                 .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2d, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .add, .tmp1b, .i(1), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp1b, .a(.dst0, .add_len), ._, ._ },
-                                .{ ._, ._b, .j, .@"0b", ._, ._, ._ },
+                                .{ ._, ._l, .sh, .tmp2b, .tmp1b, ._, ._ },
+                                .{ ._, ._, .@"or", .dst0b, .tmp2b, ._, ._ },
+                                .{ ._, ._, .add, .tmp1b, .si(1), ._, ._ },
+                                // Each iteration loads and compares one qword, so the byte
+                                // offset in tmp0 must advance by the 8-byte element size.
+                                .{ ._, ._, .add, .tmp0p, .si(8), ._, ._ },
+                                .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                             } },
                         }, .{
-                            .dst_constraints = .{.{ .bool_vec = .dword }},
+                            .dst_constraints = .{.{ .bool_vec = .byte }},
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
                                 .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u8, .kind = .{ .reg = .cl } },
-                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                                 .unused,
                             },
                             .dst_temps = .{.{ .rc = .general_purpose }},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
-                                .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
+                                .{ ._, ._, .xor, .dst0b, .dst0b, ._, ._ },
                                 .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ },
                                 .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .mov, .tmp2d, .a(.src0p, .add_elem_limbs), ._, ._ },
+                                .{ .@"0:", ._, .mov, .tmp2d, .sa(.src0p, .add_elem_limbs), ._, ._ },
                                 .{ ._, ._, .xor, .tmp3d, .tmp3d, ._, ._ },
                                 .{ .@"1:", ._, .mov, .tmp4p, .memi(.src0p, .tmp0), ._, ._ },
                                 .{ ._, ._, .xor, .tmp4p, .memi(.src1p, .tmp0), ._, ._ },
                                 .{ ._, ._, .@"or", .tmp3p, .tmp4p, ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .a(.tmp4, .add_size), ._, ._ },
-                                .{ ._, ._, .dec, .tmp2d, ._, ._, ._ },
-                                .{ ._, ._nz, .j, .@"1b", ._, ._, ._ },
-                                .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ },
+                                .{ ._, ._, .add, .tmp0p, .sa(.tmp4, .add_size), ._, ._ },
+                                .{ ._, ._, .sub, .tmp2d, .si(1), ._, ._ },
+                                .{ ._, ._b, .j, .@"1b", ._, ._, ._ },
                                 .{ ._, ._, .@"test", .tmp3p, .tmp3p, ._, ._ },
                                 .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2d, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0d, .tmp2b, ._, ._ },
-                                .{ ._, ._, .inc, .tmp1b, ._, ._, ._ },
-                                .{ ._, ._, .cmp, .tmp1b, .a(.dst0, .add_len), ._, ._ },
+                                .{ ._, ._l, .sh, .tmp2b, .tmp1b, ._, ._ },
+                                .{ ._, ._, .@"or", .dst0b, .tmp2b, ._, ._ },
+                                .{ ._, ._, .add, .tmp1b, .si(1), ._, ._ },
+                                .{ ._, ._, .cmp, .tmp1b, .sa(.dst0, .add_len), ._, ._ },
                                 .{ ._, ._b, .j, .@"0b", ._, ._, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .@"64bit", .slow_incdec },
-                            .dst_constraints = .{.{ .bool_vec = .qword }},
-                            .src_constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
+                            .dst_constraints = .{.{ .bool_vec = .dword }},
+                            .src_constraints = .{ .{ .scalar_int = .byte }, .{ .scalar_int = .byte } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u8, .kind = .{ .reg = .cl } },
-                                .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
                                 .unused,
                                 .unused,
                             },
                             .dst_temps = .{.{ .rc = .general_purpose }},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                 .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
                                 .{ .@"0:", ._, .xor, .tmp2d, .tmp2d, ._, ._ },
                                 .{ ._, ._, .mov, .tmp3b, .memia(.src0b, .tmp0, .add_size), ._, ._ },
                                 .{ ._, ._, .cmp, .tmp3b, .memia(.src1b, .tmp0, .add_size), ._, ._ },
                                 .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2q, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0q, .tmp2q, ._, ._ },
-                                .{ ._, ._, .add, .tmp1b, .i(1), ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(1), ._, ._ },
+                                .{ ._, ._l, .sh, .tmp2d, .tmp1b, ._, ._ },
+                                .{ ._, ._, .@"or", .dst0d, .tmp2d, ._, ._ },
+                                .{ ._, ._, .add, .tmp1b, .si(1), ._, ._ },
+                                .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ },
                                 .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .@"64bit", null },
-                            .dst_constraints = .{.{ .bool_vec = .qword }},
-                            .src_constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
+                            .dst_constraints = .{.{ .bool_vec = .dword }},
+                            .src_constraints = .{ .{ .scalar_int = .word }, .{ .scalar_int = .word } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u8, .kind = .{ .reg = .cl } },
-                                .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .u16, .kind = .{ .rc = .general_purpose } },
                                 .unused,
                                 .unused,
                             },
                             .dst_temps = .{.{ .rc = .general_purpose }},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                 .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
                                 .{ .@"0:", ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp3b, .memia(.src0b, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp3b, .memia(.src1b, .tmp0, .add_size), ._, ._ },
+                                .{ ._, ._, .mov, .tmp3w, .memia(.src0w, .tmp0, .add_size), ._, ._ },
+                                .{ ._, ._, .cmp, .tmp3w, .memia(.src1w, .tmp0, .add_size), ._, ._ },
                                 .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2q, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0q, .tmp2q, ._, ._ },
-                                .{ ._, ._, .inc, .tmp1b, ._, ._, ._ },
-                                .{ ._, ._, .inc, .tmp0p, ._, ._, ._ },
-                                .{ ._, ._nz, .j, .@"0b", ._, ._, ._ },
+                                .{ ._, ._l, .sh, .tmp2d, .tmp1b, ._, ._ },
+                                .{ ._, ._, .@"or", .dst0d, .tmp2d, ._, ._ },
+                                .{ ._, ._, .add, .tmp1b, .si(1), ._, ._ },
+                                .{ ._, ._, .add, .tmp0p, .si(2), ._, ._ },
+                                .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .@"64bit", .slow_incdec },
-                            .dst_constraints = .{.{ .bool_vec = .qword }},
-                            .src_constraints = .{ .{ .int = .word }, .{ .int = .word } },
+                            .dst_constraints = .{.{ .bool_vec = .dword }},
+                            .src_constraints = .{ .{ .scalar_int = .dword }, .{ .scalar_int = .dword } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u8, .kind = .{ .reg = .cl } },
-                                .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u16, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
                                 .unused,
                                 .unused,
                             },
                             .dst_temps = .{.{ .rc = .general_purpose }},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                                .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
                                 .{ .@"0:", ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp3w, .memia(.src0w, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp3w, .memia(.src1w, .tmp0, .add_size), ._, ._ },
+                                .{ ._, ._, .mov, .tmp3d, .memia(.src0d, .tmp0, .add_size), ._, ._ },
+                                .{ ._, ._, .cmp, .tmp3d, .memia(.src1d, .tmp0, .add_size), ._, ._ },
                                 .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2q, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0q, .tmp2q, ._, ._ },
-                                .{ ._, ._, .add, .tmp1b, .i(1), ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(2), ._, ._ },
+                                .{ ._, ._l, .sh, .tmp2d, .tmp1b, ._, ._ },
+                                .{ ._, ._, .@"or", .dst0d, .tmp2d, ._, ._ },
+                                .{ ._, ._, .add, .tmp1b, .si(1), ._, ._ },
+                                .{ ._, ._, .add, .tmp0p, .si(4), ._, ._ },
                                 .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .@"64bit", null },
-                            .dst_constraints = .{.{ .bool_vec = .qword }},
-                            .src_constraints = .{ .{ .int = .word }, .{ .int = .word } },
+                            .required_features = .{ .@"64bit", null, null, null },
+                            .dst_constraints = .{.{ .bool_vec = .dword }},
+                            .src_constraints = .{ .{ .scalar_int = .qword }, .{ .scalar_int = .qword } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u8, .kind = .{ .reg = .cl } },
+                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u16, .kind = .{ .rc = .general_purpose } },
                                 .unused,
                                 .unused,
                             },
                             .dst_temps = .{.{ .rc = .general_purpose }},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                 .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
                                 .{ .@"0:", ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp3w, .memia(.src0w, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp3w, .memia(.src1w, .tmp0, .add_size), ._, ._ },
+                                .{ ._, ._, .mov, .tmp3q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
+                                .{ ._, ._, .cmp, .tmp3q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
                                 .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2q, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0q, .tmp2q, ._, ._ },
-                                .{ ._, ._, .inc, .tmp1b, ._, ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(2), ._, ._ },
+                                .{ ._, ._l, .sh, .tmp2d, .tmp1b, ._, ._ },
+                                .{ ._, ._, .@"or", .dst0d, .tmp2d, ._, ._ },
+                                .{ ._, ._, .add, .tmp1b, .si(1), ._, ._ },
+                                .{ ._, ._, .add, .tmp0p, .si(2), ._, ._ },
                                 .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .@"64bit", .slow_incdec },
-                            .dst_constraints = .{.{ .bool_vec = .qword }},
-                            .src_constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
+                            .dst_constraints = .{.{ .bool_vec = .dword }},
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
-                                .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u8, .kind = .{ .reg = .cl } },
-                                .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                                .unused,
+                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                                 .unused,
                             },
                             .dst_temps = .{.{ .rc = .general_purpose }},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ },
                                 .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp3d, .memia(.src0d, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp3d, .memia(.src1d, .tmp0, .add_size), ._, ._ },
+                                .{ .@"0:", ._, .mov, .tmp2d, .sa(.src0p, .add_elem_limbs), ._, ._ },
+                                .{ ._, ._, .xor, .tmp3d, .tmp3d, ._, ._ },
+                                .{ .@"1:", ._, .mov, .tmp4p, .memi(.src0p, .tmp0), ._, ._ },
+                                .{ ._, ._, .xor, .tmp4p, .memi(.src1p, .tmp0), ._, ._ },
+                                .{ ._, ._, .@"or", .tmp3p, .tmp4p, ._, ._ },
+                                .{ ._, ._, .add, .tmp0p, .sa(.tmp4, .add_size), ._, ._ },
+                                .{ ._, ._, .sub, .tmp2d, .si(1), ._, ._ },
+                                .{ ._, ._b, .j, .@"1b", ._, ._, ._ },
+                                .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ },
+                                .{ ._, ._, .@"test", .tmp3p, .tmp3p, ._, ._ },
                                 .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp2q, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .dst0q, .tmp2q, ._, ._ },
-                                .{ ._, ._, .add, .tmp1b, .i(1), ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(4), ._, ._ },
-                                .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                                .{ ._, ._l, .sh, .tmp2d, .tmp1b, ._, ._ },
+                                .{ ._, ._, .@"or", .dst0d, .tmp2d, ._, ._ },
+                                .{ ._, ._, .add, .tmp1b, .si(1), ._, ._ },
+                                .{ ._, ._, .cmp, .tmp1b, .sa(.dst0, .add_len), ._, ._ },
+                                .{ ._, ._b, .j, .@"0b", ._, ._, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .@"64bit", null },
+                            .required_features = .{ .@"64bit", null, null, null },
                             .dst_constraints = .{.{ .bool_vec = .qword }},
-                            .src_constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
+                            .src_constraints = .{ .{ .scalar_int = .byte }, .{ .scalar_int = .byte } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
@@ -4913,29 +8249,30 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u8, .kind = .{ .reg = .cl } },
                                 .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
                                 .unused,
                                 .unused,
                             },
                             .dst_temps = .{.{ .rc = .general_purpose }},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                 .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
                                 .{ .@"0:", ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp3d, .memia(.src0d, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp3d, .memia(.src1d, .tmp0, .add_size), ._, ._ },
+                                .{ ._, ._, .mov, .tmp3b, .memia(.src0b, .tmp0, .add_size), ._, ._ },
+                                .{ ._, ._, .cmp, .tmp3b, .memia(.src1b, .tmp0, .add_size), ._, ._ },
                                 .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
                                 .{ ._, ._l, .sh, .tmp2q, .tmp1b, ._, ._ },
                                 .{ ._, ._, .@"or", .dst0q, .tmp2q, ._, ._ },
-                                .{ ._, ._, .inc, .tmp1b, ._, ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(4), ._, ._ },
+                                .{ ._, ._, .add, .tmp1b, .si(1), ._, ._ },
+                                .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ },
                                 .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .@"64bit", .slow_incdec },
+                            .required_features = .{ .@"64bit", null, null, null },
                             .dst_constraints = .{.{ .bool_vec = .qword }},
-                            .src_constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
+                            .src_constraints = .{ .{ .scalar_int = .word }, .{ .scalar_int = .word } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
@@ -4943,29 +8280,29 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u8, .kind = .{ .reg = .cl } },
                                 .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .u16, .kind = .{ .rc = .general_purpose } },
                                 .unused,
                                 .unused,
                             },
                             .dst_temps = .{.{ .rc = .general_purpose }},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
+                                .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                 .{ .@"0:", ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp2q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp2q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
+                                .{ ._, ._, .mov, .tmp3w, .memia(.src0w, .tmp0, .add_size), ._, ._ },
+                                .{ ._, ._, .cmp, .tmp3w, .memia(.src1w, .tmp0, .add_size), ._, ._ },
                                 .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
                                 .{ ._, ._l, .sh, .tmp2q, .tmp1b, ._, ._ },
                                 .{ ._, ._, .@"or", .dst0q, .tmp2q, ._, ._ },
-                                .{ ._, ._, .add, .tmp1b, .i(1), ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(8), ._, ._ },
+                                .{ ._, ._, .add, .tmp1b, .si(1), ._, ._ },
+                                .{ ._, ._, .add, .tmp0p, .si(2), ._, ._ },
                                 .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .@"64bit", null },
+                            .required_features = .{ .@"64bit", null, null, null },
                             .dst_constraints = .{.{ .bool_vec = .qword }},
-                            .src_constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
+                            .src_constraints = .{ .{ .scalar_int = .dword }, .{ .scalar_int = .dword } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
@@ -4973,63 +8310,59 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u8, .kind = .{ .reg = .cl } },
                                 .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
                                 .unused,
                                 .unused,
                             },
                             .dst_temps = .{.{ .rc = .general_purpose }},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                 .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
                                 .{ .@"0:", ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp2q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp2q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
+                                .{ ._, ._, .mov, .tmp3d, .memia(.src0d, .tmp0, .add_size), ._, ._ },
+                                .{ ._, ._, .cmp, .tmp3d, .memia(.src1d, .tmp0, .add_size), ._, ._ },
                                 .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
                                 .{ ._, ._l, .sh, .tmp2q, .tmp1b, ._, ._ },
                                 .{ ._, ._, .@"or", .dst0q, .tmp2q, ._, ._ },
-                                .{ ._, ._, .inc, .tmp1b, ._, ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .i(8), ._, ._ },
+                                .{ ._, ._, .add, .tmp1b, .si(1), ._, ._ },
+                                .{ ._, ._, .add, .tmp0p, .si(4), ._, ._ },
                                 .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .@"64bit", .slow_incdec },
+                            .required_features = .{ .@"64bit", null, null, null },
                             .dst_constraints = .{.{ .bool_vec = .qword }},
+                            .src_constraints = .{ .{ .scalar_int = .qword }, .{ .scalar_int = .qword } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
                             .extra_temps = .{
-                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
                                 .{ .type = .u8, .kind = .{ .reg = .cl } },
-                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                                .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                                .unused,
                                 .unused,
                             },
                             .dst_temps = .{.{ .rc = .general_purpose }},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
-                                .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ },
+                                .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                 .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .mov, .tmp2d, .a(.src0p, .add_elem_limbs), ._, ._ },
-                                .{ ._, ._, .xor, .tmp3d, .tmp3d, ._, ._ },
-                                .{ .@"1:", ._, .mov, .tmp4p, .memi(.src0p, .tmp0), ._, ._ },
-                                .{ ._, ._, .xor, .tmp4p, .memi(.src1p, .tmp0), ._, ._ },
-                                .{ ._, ._, .@"or", .tmp3p, .tmp4p, ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .a(.tmp4, .add_size), ._, ._ },
-                                .{ ._, ._, .sub, .tmp2d, .i(1), ._, ._ },
-                                .{ ._, ._b, .j, .@"1b", ._, ._, ._ },
-                                .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                                .{ ._, ._, .@"test", .tmp3p, .tmp3p, ._, ._ },
+                                .{ .@"0:", ._, .xor, .tmp2d, .tmp2d, ._, ._ },
+                                .{ ._, ._, .mov, .tmp2q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
+                                .{ ._, ._, .cmp, .tmp2q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
                                 .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
                                 .{ ._, ._l, .sh, .tmp2q, .tmp1b, ._, ._ },
                                 .{ ._, ._, .@"or", .dst0q, .tmp2q, ._, ._ },
-                                .{ ._, ._, .add, .tmp1b, .i(1), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp1b, .a(.dst0, .add_len), ._, ._ },
-                                .{ ._, ._b, .j, .@"0b", ._, ._, ._ },
+                                .{ ._, ._, .add, .tmp1b, .si(1), ._, ._ },
+                                .{ ._, ._, .add, .tmp0p, .si(8), ._, ._ },
+                                .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .@"64bit", null },
+                            .required_features = .{ .@"64bit", null, null, null },
                             .dst_constraints = .{.{ .bool_vec = .qword }},
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
@@ -5043,30 +8376,30 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{.{ .rc = .general_purpose }},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
                                 .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ },
                                 .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                                .{ .@"0:", ._, .mov, .tmp2d, .a(.src0p, .add_elem_limbs), ._, ._ },
+                                .{ .@"0:", ._, .mov, .tmp2d, .sa(.src0p, .add_elem_limbs), ._, ._ },
                                 .{ ._, ._, .xor, .tmp3d, .tmp3d, ._, ._ },
                                 .{ .@"1:", ._, .mov, .tmp4p, .memi(.src0p, .tmp0), ._, ._ },
                                 .{ ._, ._, .xor, .tmp4p, .memi(.src1p, .tmp0), ._, ._ },
                                 .{ ._, ._, .@"or", .tmp3p, .tmp4p, ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .a(.tmp4, .add_size), ._, ._ },
-                                .{ ._, ._, .dec, .tmp2d, ._, ._, ._ },
-                                .{ ._, ._nz, .j, .@"1b", ._, ._, ._ },
+                                .{ ._, ._, .add, .tmp0p, .sa(.tmp4, .add_size), ._, ._ },
+                                .{ ._, ._, .sub, .tmp2d, .si(1), ._, ._ },
+                                .{ ._, ._b, .j, .@"1b", ._, ._, ._ },
                                 .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ },
                                 .{ ._, ._, .@"test", .tmp3p, .tmp3p, ._, ._ },
                                 .{ ._, .fromCondition(cc), .set, .tmp2b, ._, ._, ._ },
                                 .{ ._, ._l, .sh, .tmp2q, .tmp1b, ._, ._ },
                                 .{ ._, ._, .@"or", .dst0q, .tmp2q, ._, ._ },
-                                .{ ._, ._, .inc, .tmp1b, ._, ._, ._ },
-                                .{ ._, ._, .cmp, .tmp1b, .a(.dst0, .add_len), ._, ._ },
+                                .{ ._, ._, .add, .tmp1b, .si(1), ._, ._ },
+                                .{ ._, ._, .cmp, .tmp1b, .sa(.dst0, .add_len), ._, ._ },
                                 .{ ._, ._b, .j, .@"0b", ._, ._, ._ },
                             } },
                         }, .{
-                            .required_features = .{ .slow_incdec, null },
-                            .src_constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
+                            .src_constraints = .{ .{ .scalar_int = .byte }, .{ .scalar_int = .byte } },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .to_mem } },
                             },
@@ -5079,8 +8412,9 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{.mem},
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                                .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                                 .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
                                 .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ },
                                 .{ .@"0:", ._, .xor, .tmp3d, .tmp3d, ._, ._ },
@@ -5089,62 +8423,23 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, .fromCondition(cc), .set, .tmp3b, ._, ._, ._ },
                                 .{ ._, ._l, .sh, .tmp3p, .tmp1b, ._, ._ },
                                 .{ ._, ._, .@"or", .tmp2p, .tmp3p, ._, ._ },
-                                .{ ._, ._, .add, .tmp1d, .i(1), ._, ._ },
-                                .{ ._, ._, .@"test", .tmp1d, .ia(-1, .none, .add_ptr_bit_size), ._, ._ },
+                                .{ ._, ._, .add, .tmp1d, .si(1), ._, ._ },
+                                .{ ._, ._, .@"test", .tmp1d, .sia(-1, .none, .add_ptr_bit_size), ._, ._ },
                                 .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
                                 .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
+                                .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ },
                                 .{ ._, ._, .mov, .memia(.dst0p, .tmp3, .sub_ptr_size), .tmp2p, ._, ._ },
                                 .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                                .{ .@"1:", ._, .add, .tmp0p, .i(1), ._, ._ },
+                                .{ .@"1:", ._, .add, .tmp0p, .si(1), ._, ._ },
                                 .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
-                                .{ ._, ._, .@"test", .tmp1d, .ia(-1, .none, .add_ptr_bit_size), ._, ._ },
-                                .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
-                                .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
-                                .{ ._, ._, .mov, .memi(.dst0p, .tmp3), .tmp2p, ._, ._ },
-                            } },
-                        }, .{
-                            .src_constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
-                            .patterns = &.{
-                                .{ .src = .{ .to_mem, .to_mem } },
-                            },
-                            .extra_temps = .{
-                                .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u32, .kind = .{ .reg = .ecx } },
-                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                                .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
-                                .unused,
-                            },
-                            .dst_temps = .{.mem},
-                            .each = .{ .once = &.{
-                                .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
-                                .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
-                                .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                                .{ .@"0:", ._, .xor, .tmp3d, .tmp3d, ._, ._ },
-                                .{ ._, ._, .mov, .tmp4b, .memia(.src0b, .tmp0, .add_size), ._, ._ },
-                                .{ ._, ._, .cmp, .tmp4b, .memia(.src1b, .tmp0, .add_size), ._, ._ },
-                                .{ ._, .fromCondition(cc), .set, .tmp3b, ._, ._, ._ },
-                                .{ ._, ._l, .sh, .tmp3p, .tmp1b, ._, ._ },
-                                .{ ._, ._, .@"or", .tmp2p, .tmp3p, ._, ._ },
-                                .{ ._, ._, .inc, .tmp1d, ._, ._, ._ },
-                                .{ ._, ._, .@"test", .tmp1d, .ia(-1, .none, .add_ptr_bit_size), ._, ._ },
-                                .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
-                                .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
-                                .{ ._, ._, .mov, .memia(.dst0p, .tmp3, .sub_ptr_size), .tmp2p, ._, ._ },
-                                .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                                .{ .@"1:", ._, .inc, .tmp0p, ._, ._, ._ },
-                                .{ ._, ._nz, .j, .@"0b", ._, ._, ._ },
-                                .{ ._, ._, .@"test", .tmp1d, .ia(-1, .none, .add_ptr_bit_size), ._, ._ },
+                                .{ ._, ._, .@"test", .tmp1d, .sia(-1, .none, .add_ptr_bit_size), ._, ._ },
                                 .{ ._, ._z, .j, .@"0f", ._, ._, ._ },
                                 .{ ._, ._, .mov, .tmp3d, .tmp1d, ._, ._ },
-                                .{ ._, ._r, .sh, .tmp3d, .i(3), ._, ._ },
+                                .{ ._, ._r, .sh, .tmp3d, .si(3), ._, ._ },
                                 .{ ._, ._, .mov, .memi(.dst0p, .tmp3), .tmp2p, ._, ._ },
                             } },
                         } },
-                    }) catch |err2| switch (err2) {
+                    }) catch |err| switch (err) {
                         error.SelectFailed => return cg.fail("failed to select {s} {} {} {}", .{
                             @tagName(air_tag),
                             cg.typeOf(extra.lhs).fmt(pt),
@@ -5175,9 +8470,16 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                 .cmp_lte, .cmp_lte_optimized => .lte,
                 .cmp_gte, .cmp_gte_optimized => .gte,
                 .cmp_gt, .cmp_gt_optimized => .gt,
-            }) else {
+            }) else fallback: {
                 const bin_op = air_datas[@intFromEnum(inst)].bin_op;
                 const scalar_ty = cg.typeOf(bin_op.lhs).scalarType(zcu);
+                if (scalar_ty.isRuntimeFloat()) break :fallback try cg.airCmp(inst, switch (air_tag) {
+                    else => unreachable,
+                    .cmp_lt, .cmp_lt_optimized => .lt,
+                    .cmp_lte, .cmp_lte_optimized => .lte,
+                    .cmp_gte, .cmp_gte_optimized => .gte,
+                    .cmp_gt, .cmp_gt_optimized => .gt,
+                });
                 const signedness = if (scalar_ty.isAbiInt(zcu))
                     scalar_ty.intInfo(zcu).signedness
                 else
@@ -5205,11 +8507,11 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .src_constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
                         .patterns = &.{
                             .{ .src = .{ .imm8, .mem }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .imm8, .gpr }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mem, .gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .imm8, .to_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mem, .to_gpr }, .commute = .{ 0, 1 } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .dst_temps = .{.{ .cc = cc.commute() }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .cmp, .src0b, .src1b, ._, ._ },
                         } },
@@ -5217,12 +8519,12 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .src_constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
                         .patterns = &.{
                             .{ .src = .{ .mem, .imm8 } },
-                            .{ .src = .{ .gpr, .imm8 } },
-                            .{ .src = .{ .gpr, .mem } },
-                            .{ .src = .{ .gpr, .gpr } },
+                            .{ .src = .{ .to_gpr, .imm8 } },
+                            .{ .src = .{ .to_gpr, .mem } },
+                            .{ .src = .{ .to_gpr, .to_gpr } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .cmp, .src0b, .src1b, ._, ._ },
                         } },
@@ -5230,11 +8532,11 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .src_constraints = .{ .{ .int = .word }, .{ .int = .word } },
                         .patterns = &.{
                             .{ .src = .{ .imm16, .mem }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .imm16, .gpr }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mem, .gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .imm16, .to_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mem, .to_gpr }, .commute = .{ 0, 1 } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .dst_temps = .{.{ .cc = cc.commute() }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .cmp, .src0w, .src1w, ._, ._ },
                         } },
@@ -5242,12 +8544,12 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .src_constraints = .{ .{ .int = .word }, .{ .int = .word } },
                         .patterns = &.{
                             .{ .src = .{ .mem, .imm16 } },
-                            .{ .src = .{ .gpr, .imm16 } },
-                            .{ .src = .{ .gpr, .mem } },
-                            .{ .src = .{ .gpr, .gpr } },
+                            .{ .src = .{ .to_gpr, .imm16 } },
+                            .{ .src = .{ .to_gpr, .mem } },
+                            .{ .src = .{ .to_gpr, .to_gpr } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .cmp, .src0w, .src1w, ._, ._ },
                         } },
@@ -5255,11 +8557,11 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .src_constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
                         .patterns = &.{
                             .{ .src = .{ .imm32, .mem }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .imm32, .gpr }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mem, .gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .imm32, .to_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mem, .to_gpr }, .commute = .{ 0, 1 } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .dst_temps = .{.{ .cc = cc.commute() }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .cmp, .src0d, .src1d, ._, ._ },
                         } },
@@ -5267,45 +8569,50 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .src_constraints = .{ .{ .int = .dword }, .{ .int = .dword } },
                         .patterns = &.{
                             .{ .src = .{ .mem, .imm32 } },
-                            .{ .src = .{ .gpr, .imm32 } },
-                            .{ .src = .{ .gpr, .mem } },
-                            .{ .src = .{ .gpr, .gpr } },
+                            .{ .src = .{ .to_gpr, .imm32 } },
+                            .{ .src = .{ .to_gpr, .mem } },
+                            .{ .src = .{ .to_gpr, .to_gpr } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .cmp, .src0d, .src1d, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .@"64bit", null },
+                        .required_features = .{ .@"64bit", null, null, null },
                         .src_constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
                         .patterns = &.{
                             .{ .src = .{ .simm32, .mem }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .simm32, .gpr }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mem, .gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .simm32, .to_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .mem, .to_gpr }, .commute = .{ 0, 1 } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .dst_temps = .{.{ .cc = cc.commute() }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .cmp, .src0q, .src1q, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .@"64bit", null },
+                        .required_features = .{ .@"64bit", null, null, null },
                         .src_constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
                         .patterns = &.{
                             .{ .src = .{ .mem, .simm32 } },
-                            .{ .src = .{ .gpr, .simm32 } },
-                            .{ .src = .{ .gpr, .mem } },
-                            .{ .src = .{ .gpr, .gpr } },
+                            .{ .src = .{ .to_gpr, .simm32 } },
+                            .{ .src = .{ .to_gpr, .mem } },
+                            .{ .src = .{ .to_gpr, .to_gpr } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .cmp, .src0q, .src1q, ._, ._ },
                         } },
                     }, .{
+                        .src_constraints = .{ .any_int, .any_int },
                         .patterns = &.{
-                            .{ .src = .{ .to_mem, .to_mem } },
+                            .{ .src = .{ .to_mem, .to_mem }, .commute = switch (cc) {
+                                else => unreachable,
+                                .l, .ge, .b, .ae => .{ 0, 0 },
+                                .le, .g, .be, .a => .{ 0, 1 },
+                            } },
                         },
                         .extra_temps = .{
                             .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
@@ -5315,17 +8622,21 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                             .unused,
                         },
-                        .clobbers = .{ .eflags = true },
                         .dst_temps = .{.{ .rc = .general_purpose }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
-                            .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                             .{ ._, ._, .xor, .tmp1b, .tmp1b, ._, ._ },
-                            .{ .@"0:", ._r, .sh, .tmp1b, .i(1), ._, ._ },
+                            .{ .@"0:", ._r, .sh, .tmp1b, .si(1), ._, ._ },
                             .{ ._, ._, .mov, .tmp1p, .memia(.src0p, .tmp0, .add_size), ._, ._ },
                             .{ ._, ._, .sbb, .tmp1p, .memia(.src1p, .tmp0, .add_size), ._, ._ },
                             .{ ._, ._c, .set, .tmp1b, ._, ._, ._ },
-                            .{ ._, .fromCondition(cc), .set, .dst0b, ._, ._, ._ },
-                            .{ ._, ._, .add, .tmp0p, .a(.tmp1, .add_size), ._, ._ },
+                            .{ ._, .fromCondition(switch (cc) {
+                                else => unreachable,
+                                .l, .ge, .b, .ae => cc,
+                                .le, .g, .be, .a => cc.commute(),
+                            }), .set, .dst0b, ._, ._, ._ },
+                            .{ ._, ._, .add, .tmp0p, .sa(.tmp1, .add_size), ._, ._ },
                             .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                         } },
                     } },
@@ -5342,13 +8653,18 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                 if (ops[1].index != res[0].index) try ops[1].die(cg);
                 try res[0].moveTo(inst, cg);
             },
-            .cmp_eq, .cmp_eq_optimized, .cmp_neq, .cmp_neq_optimized => |air_tag| if (use_old) try cg.airCmp(inst, switch (air_tag) {
+            .cmp_eq,
+            .cmp_eq_optimized,
+            .cmp_neq,
+            .cmp_neq_optimized,
+            => |air_tag| if (use_old) try cg.airCmp(inst, switch (air_tag) {
                 else => unreachable,
                 .cmp_eq, .cmp_eq_optimized => .eq,
                 .cmp_neq, .cmp_neq_optimized => .neq,
             }) else fallback: {
                 const bin_op = air_datas[@intFromEnum(inst)].bin_op;
-                if (ip.isOptionalType(cg.typeOf(bin_op.lhs).toIntern())) break :fallback try cg.airCmp(inst, switch (air_tag) {
+                const scalar_ty = cg.typeOf(bin_op.lhs).scalarType(zcu);
+                if (scalar_ty.isRuntimeFloat() or ip.isOptionalType(scalar_ty.toIntern())) break :fallback try cg.airCmp(inst, switch (air_tag) {
                     else => unreachable,
                     .cmp_eq, .cmp_eq_optimized => .eq,
                     .cmp_neq, .cmp_neq_optimized => .neq,
@@ -5362,14 +8678,13 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                 })) {
                     else => unreachable,
                     inline .e, .ne => |cc| comptime &.{ .{
-                        .required_features = .{ .avx2, null },
-                        .src_constraints = .{ .any_int, .any_int },
+                        .required_features = .{ .avx2, null, null, null },
+                        .src_constraints = .{ .{ .int = .yword }, .{ .int = .yword } },
                         .patterns = &.{
-                            .{ .src = .{ .ymm, .mem } },
-                            .{ .src = .{ .mem, .ymm }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .ymm, .ymm } },
+                            .{ .src = .{ .to_ymm, .mem } },
+                            .{ .src = .{ .mem, .to_ymm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_ymm, .to_ymm } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .extra_temps = .{
                             .{ .kind = .{ .rc = .sse } },
                             .unused,
@@ -5379,19 +8694,19 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, .vp_, .xor, .tmp0y, .src0y, .src1y, ._ },
                             .{ ._, .vp_, .@"test", .tmp0y, .tmp0y, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .avx, null },
-                        .src_constraints = .{ .any_int, .any_int },
+                        .required_features = .{ .avx, null, null, null },
+                        .src_constraints = .{ .{ .int = .yword }, .{ .int = .yword } },
                         .patterns = &.{
-                            .{ .src = .{ .ymm, .mem } },
-                            .{ .src = .{ .mem, .ymm }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .ymm, .ymm } },
+                            .{ .src = .{ .to_ymm, .mem } },
+                            .{ .src = .{ .mem, .to_ymm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_ymm, .to_ymm } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .extra_temps = .{
                             .{ .kind = .{ .rc = .sse } },
                             .unused,
@@ -5401,19 +8716,19 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, .v_pd, .xor, .tmp0y, .src0y, .src1y, ._ },
                             .{ ._, .vp_, .@"test", .tmp0y, .tmp0y, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .avx, null },
-                        .src_constraints = .{ .any_int, .any_int },
+                        .required_features = .{ .avx, null, null, null },
+                        .src_constraints = .{ .{ .int = .xword }, .{ .int = .xword } },
                         .patterns = &.{
-                            .{ .src = .{ .xmm, .mem } },
-                            .{ .src = .{ .mem, .xmm }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .xmm, .xmm } },
+                            .{ .src = .{ .to_xmm, .mem } },
+                            .{ .src = .{ .mem, .to_xmm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_xmm, .to_xmm } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .extra_temps = .{
                             .{ .kind = .{ .rc = .sse } },
                             .unused,
@@ -5423,33 +8738,33 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, .vp_, .xor, .tmp0x, .src0x, .src1x, ._ },
                             .{ ._, .vp_, .@"test", .tmp0x, .tmp0x, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .sse4_1, null },
-                        .src_constraints = .{ .any_int, .any_int },
+                        .required_features = .{ .sse4_1, null, null, null },
+                        .src_constraints = .{ .{ .int = .xword }, .{ .int = .xword } },
                         .patterns = &.{
-                            .{ .src = .{ .mut_xmm, .mem } },
-                            .{ .src = .{ .mem, .mut_xmm }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_xmm, .xmm } },
+                            .{ .src = .{ .to_mut_xmm, .mem } },
+                            .{ .src = .{ .mem, .to_mut_xmm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_mut_xmm, .to_xmm } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, .p_, .xor, .src0x, .src1x, ._, ._ },
                             .{ ._, .p_, .@"test", .src0x, .src0x, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .sse2, null },
-                        .src_constraints = .{ .any_int, .any_int },
+                        .required_features = .{ .sse2, null, null, null },
+                        .src_constraints = .{ .{ .int = .xword }, .{ .int = .xword } },
                         .patterns = &.{
-                            .{ .src = .{ .mut_xmm, .mem } },
-                            .{ .src = .{ .mem, .mut_xmm }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_xmm, .xmm } },
+                            .{ .src = .{ .to_mut_xmm, .mem } },
+                            .{ .src = .{ .mem, .to_mut_xmm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_mut_xmm, .to_xmm } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .extra_temps = .{
                             .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
                             .{ .kind = .{ .rc = .sse } },
@@ -5459,22 +8774,22 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, .p_, .xor, .tmp1x, .tmp1x, ._, ._ },
                             .{ ._, .p_, .xor, .src0x, .src1x, ._, ._ },
                             .{ ._, .p_b, .cmpeq, .tmp1x, .src0x, ._, ._ },
                             .{ ._, .p_b, .movmsk, .tmp0d, .tmp1x, ._, ._ },
-                            .{ ._, ._, .xor, .tmp0d, .i(0xffff), ._, ._ },
+                            .{ ._, ._, .xor, .tmp0d, .si(0xffff), ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .sse2, .mmx },
-                        .src_constraints = .{ .any_int, .any_int },
+                        .required_features = .{ .sse, .mmx, null, null },
+                        .src_constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
                         .patterns = &.{
-                            .{ .src = .{ .mut_mm, .mem } },
-                            .{ .src = .{ .mem, .mut_mm }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .mut_mm, .mm } },
+                            .{ .src = .{ .to_mut_mm, .mem } },
+                            .{ .src = .{ .mem, .to_mut_mm }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_mut_mm, .to_mm } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .extra_temps = .{
                             .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
                             .{ .kind = .{ .rc = .mmx } },
@@ -5484,26 +8799,27 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, .p_, .xor, .tmp1q, .tmp1q, ._, ._ },
                             .{ ._, .p_, .xor, .src0q, .src1q, ._, ._ },
                             .{ ._, .p_b, .cmpeq, .tmp1q, .src0q, ._, ._ },
                             .{ ._, .p_b, .movmsk, .tmp0d, .tmp1q, ._, ._ },
-                            .{ ._, ._, .xor, .tmp0d, .i(0xff), ._, ._ },
+                            .{ ._, ._, .xor, .tmp0d, .si(0xff), ._, ._ },
                         } },
                     }, .{
                         .src_constraints = .{ .{ .int = .byte }, .{ .int = .byte } },
                         .patterns = &.{
                             .{ .src = .{ .mem, .imm8 } },
                             .{ .src = .{ .imm8, .mem }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .gpr, .imm8 } },
-                            .{ .src = .{ .imm8, .gpr }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .gpr, .mem } },
-                            .{ .src = .{ .mem, .gpr }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .gpr, .gpr } },
+                            .{ .src = .{ .to_gpr, .imm8 } },
+                            .{ .src = .{ .imm8, .to_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_gpr, .mem } },
+                            .{ .src = .{ .mem, .to_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_gpr, .to_gpr } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .cmp, .src0b, .src1b, ._, ._ },
                         } },
@@ -5512,14 +8828,14 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .patterns = &.{
                             .{ .src = .{ .mem, .imm16 } },
                             .{ .src = .{ .imm16, .mem }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .gpr, .imm16 } },
-                            .{ .src = .{ .imm16, .gpr }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .gpr, .mem } },
-                            .{ .src = .{ .mem, .gpr }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .gpr, .gpr } },
+                            .{ .src = .{ .to_gpr, .imm16 } },
+                            .{ .src = .{ .imm16, .to_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_gpr, .mem } },
+                            .{ .src = .{ .mem, .to_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_gpr, .to_gpr } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .cmp, .src0w, .src1w, ._, ._ },
                         } },
@@ -5528,36 +8844,68 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .patterns = &.{
                             .{ .src = .{ .mem, .imm32 } },
                             .{ .src = .{ .imm32, .mem }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .gpr, .imm32 } },
-                            .{ .src = .{ .imm32, .gpr }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .gpr, .mem } },
-                            .{ .src = .{ .mem, .gpr }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .gpr, .gpr } },
+                            .{ .src = .{ .to_gpr, .imm32 } },
+                            .{ .src = .{ .imm32, .to_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_gpr, .mem } },
+                            .{ .src = .{ .mem, .to_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_gpr, .to_gpr } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .cmp, .src0d, .src1d, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .@"64bit", null },
+                        .required_features = .{ .@"64bit", null, null, null },
                         .src_constraints = .{ .{ .int = .qword }, .{ .int = .qword } },
                         .patterns = &.{
                             .{ .src = .{ .mem, .simm32 } },
                             .{ .src = .{ .simm32, .mem }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .gpr, .simm32 } },
-                            .{ .src = .{ .simm32, .gpr }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .gpr, .mem } },
-                            .{ .src = .{ .mem, .gpr }, .commute = .{ 0, 1 } },
-                            .{ .src = .{ .gpr, .gpr } },
+                            .{ .src = .{ .to_gpr, .simm32 } },
+                            .{ .src = .{ .simm32, .to_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_gpr, .mem } },
+                            .{ .src = .{ .mem, .to_gpr }, .commute = .{ 0, 1 } },
+                            .{ .src = .{ .to_gpr, .to_gpr } },
                         },
-                        .clobbers = .{ .eflags = true },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .cmp, .src0q, .src1q, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .avx2, null },
+                        .required_features = .{ .avx2, null, null, null },
+                        .src_constraints = .{
+                            .{ .remainder_int = .{ .of = .yword, .is = .xword } },
+                            .{ .remainder_int = .{ .of = .yword, .is = .xword } },
+                        },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                            .{ .kind = .{ .rc = .sse } },
+                            .{ .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0p, .sia(16, .src0, .sub_size), ._, ._ },
+                            .{ ._, .vp_, .xor, .tmp1y, .tmp1y, .tmp1y, ._ },
+                            .{ .@"0:", .v_dqu, .mov, .tmp2y, .memiad(.src0y, .tmp0, .add_size, -16), ._, ._ },
+                            .{ ._, .vp_, .xor, .tmp2y, .tmp2y, .memiad(.src1y, .tmp0, .add_size, -16), ._ },
+                            .{ ._, .vp_, .@"or", .tmp1y, .tmp1y, .tmp2y, ._ },
+                            .{ ._, ._, .add, .tmp0p, .si(32), ._, ._ },
+                            .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .v_dqa, .mov, .tmp2x, .memad(.src0x, .add_size, -16), ._, ._ },
+                            .{ ._, .vp_, .xor, .tmp2x, .tmp2x, .memad(.src1x, .add_size, -16), ._ },
+                            .{ ._, .vp_, .@"or", .tmp1y, .tmp1y, .tmp2y, ._ },
+                            .{ ._, .vp_, .@"test", .tmp1y, .tmp1y, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx2, null, null, null },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .to_mem } },
                         },
@@ -5570,18 +8918,51 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
-                            .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                             .{ ._, .vp_, .xor, .tmp1y, .tmp1y, .tmp1y, ._ },
                             .{ .@"0:", .v_dqu, .mov, .tmp2y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
                             .{ ._, .vp_, .xor, .tmp2y, .tmp2y, .memia(.src1y, .tmp0, .add_size), ._ },
                             .{ ._, .vp_, .@"or", .tmp1y, .tmp1y, .tmp2y, ._ },
-                            .{ ._, ._, .add, .tmp0p, .i(32), ._, ._ },
+                            .{ ._, ._, .add, .tmp0p, .si(32), ._, ._ },
+                            .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .vp_, .@"test", .tmp1y, .tmp1y, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null },
+                        .src_constraints = .{
+                            .{ .remainder_int = .{ .of = .yword, .is = .xword } },
+                            .{ .remainder_int = .{ .of = .yword, .is = .xword } },
+                        },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .to_mem } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                            .{ .kind = .{ .rc = .sse } },
+                            .{ .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0p, .sia(16, .src0, .sub_size), ._, ._ },
+                            .{ ._, .v_pd, .xor, .tmp1y, .tmp1y, .tmp1y, ._ },
+                            .{ .@"0:", .v_pd, .movu, .tmp2y, .memiad(.src0y, .tmp0, .add_size, -16), ._, ._ },
+                            .{ ._, .v_pd, .xor, .tmp2y, .tmp2y, .memiad(.src1y, .tmp0, .add_size, -16), ._ },
+                            .{ ._, .v_pd, .@"or", .tmp1y, .tmp1y, .tmp2y, ._ },
+                            .{ ._, ._, .add, .tmp0p, .si(32), ._, ._ },
                             .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .v_pd, .mova, .tmp2x, .memad(.src0x, .add_size, -16), ._, ._ },
+                            .{ ._, .v_pd, .xor, .tmp2x, .tmp2x, .memad(.src1x, .add_size, -16), ._ },
+                            .{ ._, .v_pd, .@"or", .tmp1y, .tmp1y, .tmp2y, ._ },
                             .{ ._, .vp_, .@"test", .tmp1y, .tmp1y, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .avx, null },
+                        .required_features = .{ .avx, null, null, null },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .to_mem } },
                         },
@@ -5594,18 +8975,19 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
-                            .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                             .{ ._, .v_pd, .xor, .tmp1y, .tmp1y, .tmp1y, ._ },
                             .{ .@"0:", .v_pd, .movu, .tmp2y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
                             .{ ._, .v_pd, .xor, .tmp2y, .tmp2y, .memia(.src1y, .tmp0, .add_size), ._ },
                             .{ ._, .v_pd, .@"or", .tmp1y, .tmp1y, .tmp2y, ._ },
-                            .{ ._, ._, .add, .tmp0p, .i(32), ._, ._ },
+                            .{ ._, ._, .add, .tmp0p, .si(32), ._, ._ },
                             .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                             .{ ._, .vp_, .@"test", .tmp1y, .tmp1y, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .avx, null },
+                        .required_features = .{ .avx, null, null, null },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .to_mem } },
                         },
@@ -5618,18 +9000,19 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
-                            .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                             .{ ._, .vp_, .xor, .tmp1x, .tmp1x, .tmp1x, ._ },
                             .{ .@"0:", .v_dqu, .mov, .tmp2x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
                             .{ ._, .vp_, .xor, .tmp2x, .tmp2x, .memia(.src1x, .tmp0, .add_size), ._ },
                             .{ ._, .vp_, .@"or", .tmp1x, .tmp1x, .tmp2x, ._ },
-                            .{ ._, ._, .add, .tmp0p, .i(16), ._, ._ },
+                            .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
                             .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                             .{ ._, .vp_, .@"test", .tmp1x, .tmp1x, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .sse4_1, null },
+                        .required_features = .{ .sse4_1, null, null, null },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .to_mem } },
                         },
@@ -5642,18 +9025,19 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
-                            .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                             .{ ._, .p_, .xor, .tmp1x, .tmp1x, ._, ._ },
                             .{ .@"0:", ._dqu, .mov, .tmp2x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
                             .{ ._, .p_, .xor, .tmp2x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
                             .{ ._, .p_, .@"or", .tmp1x, .tmp2x, ._, ._ },
-                            .{ ._, ._, .add, .tmp0p, .i(16), ._, ._ },
+                            .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
                             .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                             .{ ._, .p_, .@"test", .tmp1x, .tmp1x, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .sse2, null },
+                        .required_features = .{ .sse2, null, null, null },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .to_mem } },
                         },
@@ -5666,21 +9050,22 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
-                            .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                             .{ ._, .p_, .xor, .tmp1x, .tmp1x, ._, ._ },
                             .{ .@"0:", ._dqu, .mov, .tmp2x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
                             .{ ._, .p_, .xor, .tmp2x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
                             .{ ._, .p_, .@"or", .tmp1x, .tmp2x, ._, ._ },
-                            .{ ._, ._, .add, .tmp0p, .i(16), ._, ._ },
+                            .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
                             .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                             .{ ._, .p_, .xor, .tmp2x, .tmp2x, ._, ._ },
                             .{ ._, .p_b, .cmpeq, .tmp1x, .tmp2x, ._, ._ },
                             .{ ._, .p_b, .movmsk, .tmp0d, .tmp1x, ._, ._ },
-                            .{ ._, ._, .cmp, .tmp0d, .i(0xffff), ._, ._ },
+                            .{ ._, ._, .cmp, .tmp0d, .si(0xffff), ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .sse, .mmx },
+                        .required_features = .{ .sse, .mmx, null, null },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .to_mem } },
                         },
@@ -5693,18 +9078,19 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
-                            .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                             .{ ._, .p_, .xor, .tmp1q, .tmp1q, ._, ._ },
                             .{ .@"0:", ._q, .mov, .tmp2q, .memia(.src0q, .tmp0, .add_size), ._, ._ },
                             .{ ._, .p_, .xor, .tmp2q, .memia(.src1q, .tmp0, .add_size), ._, ._ },
                             .{ ._, .p_, .@"or", .tmp1q, .tmp2q, ._, ._ },
-                            .{ ._, ._, .add, .tmp0p, .i(8), ._, ._ },
+                            .{ ._, ._, .add, .tmp0p, .si(8), ._, ._ },
                             .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                             .{ ._, .p_, .xor, .tmp2q, .tmp2q, ._, ._ },
                             .{ ._, .p_b, .cmpeq, .tmp1q, .tmp2q, ._, ._ },
                             .{ ._, .p_b, .movmsk, .tmp0d, .tmp1q, ._, ._ },
-                            .{ ._, ._, .cmp, .tmp0d, .i(0xff), ._, ._ },
+                            .{ ._, ._, .cmp, .tmp0d, .si(0xff), ._, ._ },
                         } },
                     }, .{
                         .patterns = &.{
@@ -5719,13 +9105,14 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{.{ .cc = cc }},
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
-                            .{ ._, ._, .mov, .tmp0p, .a(.src0, .sub_size), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
                             .{ ._, ._, .xor, .tmp1p, .tmp1p, ._, ._ },
                             .{ .@"0:", ._, .mov, .tmp2p, .memia(.src0p, .tmp0, .add_size), ._, ._ },
                             .{ ._, ._, .xor, .tmp2p, .memia(.src1p, .tmp0, .add_size), ._, ._ },
                             .{ ._, ._, .@"or", .tmp1p, .tmp2p, ._, ._ },
-                            .{ ._, ._, .add, .tmp0p, .a(.tmp2, .add_size), ._, ._ },
+                            .{ ._, ._, .add, .tmp0p, .sa(.tmp2, .add_size), ._, ._ },
                             .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                             .{ ._, ._, .@"test", .tmp1p, .tmp1p, ._, ._ },
                         } },
@@ -6453,11 +9840,8 @@ fn regClassForType(self: *CodeGen, ty: Type) Register.Class {
             else => .sse,
         },
         .vector => switch (ty.childType(zcu).toIntern()) {
-            .bool_type, .u1_type => .general_purpose,
-            else => if (ty.isAbiInt(zcu) and ty.intInfo(zcu).bits == 1)
-                .general_purpose
-            else
-                .sse,
+            .bool_type => .general_purpose,
+            else => .sse,
         },
         else => .general_purpose,
     };
@@ -8383,32 +11767,54 @@ fn airMulWithOverflow(self: *CodeGen, inst: Air.Inst.Index) !void {
 
                     const lhs_mcv = try self.resolveInst(bin_op.lhs);
                     const rhs_mcv = try self.resolveInst(bin_op.rhs);
-                    const mat_lhs_mcv = switch (lhs_mcv) {
-                        .load_symbol => mat_lhs_mcv: {
+                    const mat_lhs_mcv = mat_lhs_mcv: switch (lhs_mcv) {
+                        .register => |lhs_reg| switch (lhs_reg.class()) {
+                            else => lhs_mcv,
+                            .sse => {
+                                const mat_lhs_mcv: MCValue = .{
+                                    .register_pair = try self.register_manager.allocRegs(2, @splat(null), abi.RegisterClass.gp),
+                                };
+                                try self.genCopy(dst_ty, mat_lhs_mcv, lhs_mcv, .{});
+                                break :mat_lhs_mcv mat_lhs_mcv;
+                            },
+                        },
+                        .load_symbol => {
                             // TODO clean this up!
                             const addr_reg = try self.copyToTmpRegister(.usize, lhs_mcv.address());
                             break :mat_lhs_mcv MCValue{ .indirect = .{ .reg = addr_reg } };
                         },
                         else => lhs_mcv,
                     };
-                    const mat_lhs_lock = switch (mat_lhs_mcv) {
-                        .indirect => |reg_off| self.register_manager.lockReg(reg_off.reg),
-                        else => null,
+                    const mat_lhs_locks: [2]?RegisterLock = switch (mat_lhs_mcv) {
+                        .register_pair => |mat_lhs_regs| self.register_manager.lockRegs(2, mat_lhs_regs),
+                        .indirect => |reg_off| .{ self.register_manager.lockReg(reg_off.reg), null },
+                        else => @splat(null),
                     };
-                    defer if (mat_lhs_lock) |lock| self.register_manager.unlockReg(lock);
-                    const mat_rhs_mcv = switch (rhs_mcv) {
-                        .load_symbol => mat_rhs_mcv: {
+                    defer for (mat_lhs_locks) |mat_lhs_lock| if (mat_lhs_lock) |lock| self.register_manager.unlockReg(lock);
+                    const mat_rhs_mcv = mat_rhs_mcv: switch (rhs_mcv) {
+                        .register => |rhs_reg| switch (rhs_reg.class()) {
+                            else => rhs_mcv,
+                            .sse => {
+                                const mat_rhs_mcv: MCValue = .{
+                                    .register_pair = try self.register_manager.allocRegs(2, @splat(null), abi.RegisterClass.gp),
+                                };
+                                try self.genCopy(dst_ty, mat_rhs_mcv, rhs_mcv, .{});
+                                break :mat_rhs_mcv mat_rhs_mcv;
+                            },
+                        },
+                        .load_symbol => {
                             // TODO clean this up!
                             const addr_reg = try self.copyToTmpRegister(.usize, rhs_mcv.address());
                             break :mat_rhs_mcv MCValue{ .indirect = .{ .reg = addr_reg } };
                         },
                         else => rhs_mcv,
                     };
-                    const mat_rhs_lock = switch (mat_rhs_mcv) {
-                        .indirect => |reg_off| self.register_manager.lockReg(reg_off.reg),
-                        else => null,
+                    const mat_rhs_locks: [2]?RegisterLock = switch (mat_rhs_mcv) {
+                        .register_pair => |mat_rhs_regs| self.register_manager.lockRegs(2, mat_rhs_regs),
+                        .indirect => |reg_off| .{ self.register_manager.lockReg(reg_off.reg), null },
+                        else => @splat(null),
                     };
-                    defer if (mat_rhs_lock) |lock| self.register_manager.unlockReg(lock);
+                    defer for (mat_rhs_locks) |mat_rhs_lock| if (mat_rhs_lock) |lock| self.register_manager.unlockReg(lock);
 
                     if (mat_lhs_mcv.isBase()) try self.asmRegisterMemory(
                         .{ ._, .mov },
@@ -10003,7 +13409,7 @@ fn airClz(self: *CodeGen, inst: Air.Inst.Index) !void {
                 } },
             }, .u(0));
             _ = try self.asmJccReloc(.e, loop);
-            try self.asmRegisterMemory(.{ ._, .bsr }, dst_reg.to64(), .{
+            try self.asmRegisterMemory(.{ ._r, .bs }, dst_reg.to64(), .{
                 .base = .{ .frame = src_frame_addr.index },
                 .mod = .{ .rm = .{
                     .size = .qword,
@@ -10080,8 +13486,8 @@ fn airClz(self: *CodeGen, inst: Air.Inst.Index) !void {
                 defer self.register_manager.unlockReg(wide_lock);
 
                 try self.truncateRegister(src_ty, wide_reg);
-                try self.genBinOpMir(.{ ._, .bsr }, .u16, dst_mcv, .{ .register = wide_reg });
-            } else try self.genBinOpMir(.{ ._, .bsr }, src_ty, dst_mcv, mat_src_mcv);
+                try self.genBinOpMir(.{ ._r, .bs }, .u16, dst_mcv, .{ .register = wide_reg });
+            } else try self.genBinOpMir(.{ ._r, .bs }, src_ty, dst_mcv, mat_src_mcv);
 
             try self.asmCmovccRegisterRegister(
                 .z,
@@ -10103,7 +13509,7 @@ fn airClz(self: *CodeGen, inst: Air.Inst.Index) !void {
 
             try self.truncateRegister(src_ty, wide_reg);
             try self.genBinOpMir(
-                .{ ._, .bsr },
+                .{ ._r, .bs },
                 if (src_bits <= 8) .u16 else src_ty,
                 dst_mcv,
                 .{ .register = wide_reg },
@@ -10200,7 +13606,7 @@ fn airCtz(self: *CodeGen, inst: Air.Inst.Index) !void {
                 } },
             }, .u(0));
             _ = try self.asmJccReloc(.e, loop);
-            try self.asmRegisterMemory(.{ ._, .bsf }, dst_reg.to64(), .{
+            try self.asmRegisterMemory(.{ ._f, .bs }, dst_reg.to64(), .{
                 .base = .{ .frame = src_frame_addr.index },
                 .mod = .{ .rm = .{
                     .size = .qword,
@@ -10280,8 +13686,8 @@ fn airCtz(self: *CodeGen, inst: Air.Inst.Index) !void {
             defer self.register_manager.unlockReg(wide_lock);
 
             try self.truncateRegister(src_ty, wide_reg);
-            try self.genBinOpMir(.{ ._, .bsf }, wide_ty, dst_mcv, .{ .register = wide_reg });
-        } else try self.genBinOpMir(.{ ._, .bsf }, src_ty, dst_mcv, mat_src_mcv);
+            try self.genBinOpMir(.{ ._f, .bs }, wide_ty, dst_mcv, .{ .register = wide_reg });
+        } else try self.genBinOpMir(.{ ._f, .bs }, src_ty, dst_mcv, mat_src_mcv);
 
         const cmov_abi_size = @max(@as(u32, @intCast(dst_ty.abiSize(zcu))), 2);
         try self.asmCmovccRegisterRegister(
@@ -12975,7 +16381,18 @@ fn genShiftBinOp(
     const rcx_lock = self.register_manager.lockReg(.rcx);
     defer if (rcx_lock) |lock| self.register_manager.unlockReg(lock);
 
-    const lhs_lock = switch (lhs_mcv) {
+    const mat_lhs_mcv: MCValue, const can_reuse_lhs = switch (lhs_mcv) {
+        .register => |lhs_reg| switch (lhs_reg.class()) {
+            .general_purpose => .{ lhs_mcv, true },
+            else => lhs: {
+                const mat_lhs_mcv = try self.allocTempRegOrMem(lhs_ty, true);
+                try self.genCopy(lhs_ty, mat_lhs_mcv, lhs_mcv, .{});
+                break :lhs .{ mat_lhs_mcv, false };
+            },
+        },
+        else => .{ lhs_mcv, true },
+    };
+    const lhs_lock = switch (mat_lhs_mcv) {
         .register => |reg| self.register_manager.lockReg(reg),
         else => null,
     };
@@ -12988,12 +16405,12 @@ fn genShiftBinOp(
     defer if (rhs_lock) |lock| self.register_manager.unlockReg(lock);
 
     const dst_mcv: MCValue = dst: {
-        if (maybe_inst) |inst| {
+        if (can_reuse_lhs) if (maybe_inst) |inst| {
             const bin_op = self.air.instructions.items(.data)[@intFromEnum(inst)].bin_op;
-            if (self.reuseOperand(inst, bin_op.lhs, 0, lhs_mcv)) break :dst lhs_mcv;
-        }
+            if (self.reuseOperand(inst, bin_op.lhs, 0, mat_lhs_mcv)) break :dst mat_lhs_mcv;
+        };
         const dst_mcv = try self.allocRegOrMemAdvanced(lhs_ty, maybe_inst, true);
-        try self.genCopy(lhs_ty, dst_mcv, lhs_mcv, .{});
+        try self.genCopy(lhs_ty, dst_mcv, mat_lhs_mcv, .{});
         break :dst dst_mcv;
     };
 
@@ -18337,12 +21754,28 @@ const MoveStrategy = union(enum) {
                 try self.asmRegister(.{ .f_, .ld }, src_reg);
                 try self.asmMemory(.{ .f_p, .st }, dst_mem);
             },
-            .insert_extract, .vex_insert_extract => |ie| try self.asmMemoryRegisterImmediate(
-                ie.extract,
-                dst_mem,
-                src_reg,
-                .u(0),
-            ),
+            .insert_extract, .vex_insert_extract => |ie| if (ie.extract[0] != .p_w or self.hasFeature(.sse4_1))
+                try self.asmMemoryRegisterImmediate(ie.extract, dst_mem, src_reg, .u(0))
+            else if (self.hasFeature(.sse2)) {
+                const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
+                try self.asmRegisterRegisterImmediate(ie.extract, tmp_reg.to32(), src_reg.to128(), .u(0));
+                try self.asmMemoryRegister(.{ ._, .mov }, dst_mem, tmp_reg.to16());
+            } else {
+                const tmp_frame_index = try self.allocFrameIndex(.init(.{
+                    .size = 16,
+                    .alignment = .@"16",
+                }));
+                try self.asmMemoryRegister(.{ ._ps, .mova }, .{
+                    .base = .{ .frame = tmp_frame_index },
+                    .mod = .{ .rm = .{ .size = .xword } },
+                }, src_reg.to128());
+                const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
+                try self.asmRegisterMemory(.{ ._, .mov }, tmp_reg.to16(), .{
+                    .base = .{ .frame = tmp_frame_index },
+                    .mod = .{ .rm = .{ .size = .word } },
+                });
+                try self.asmMemoryRegister(.{ ._, .mov }, dst_mem, tmp_reg.to16());
+            },
         }
     }
 };
@@ -18400,8 +21833,10 @@ fn moveStrategy(self: *CodeGen, ty: Type, class: Register.Class, aligned: bool)
                         .{ ._ss, .mov } },
                     5...8 => return .{ .move = if (self.hasFeature(.avx))
                         .{ .v_sd, .mov }
+                    else if (self.hasFeature(.sse2))
+                        .{ ._sd, .mov }
                     else
-                        .{ ._sd, .mov } },
+                        .{ ._ps, .movl } },
                     9...16 => return .{ .move = if (self.hasFeature(.avx))
                         .{ .v_pd, if (aligned) .mova else .movu }
                     else if (self.hasFeature(.sse2))
@@ -18427,8 +21862,10 @@ fn moveStrategy(self: *CodeGen, ty: Type, class: Register.Class, aligned: bool)
                     .{ ._ss, .mov } },
                 64 => return .{ .move = if (self.hasFeature(.avx))
                     .{ .v_sd, .mov }
+                else if (self.hasFeature(.sse2))
+                    .{ ._sd, .mov }
                 else
-                    .{ ._sd, .mov } },
+                    .{ ._ps, .movl } },
                 128 => return .{ .move = if (self.hasFeature(.avx))
                     .{ if (aligned) .v_dqa else .v_dqu, .mov }
                 else if (self.hasFeature(.sse2))
@@ -18623,6 +22060,30 @@ fn genCopy(self: *CodeGen, ty: Type, dst_mcv: MCValue, src_mcv: MCValue, opts: C
         }, opts),
         inline .register_pair, .register_triple, .register_quadruple => |dst_regs| {
             const src_info: ?struct { addr_reg: Register, addr_lock: RegisterLock } = switch (src_mcv) {
+                .register => |src_reg| switch (dst_regs[0].class()) { // one register split across several destination registers
+                    .general_purpose => switch (src_reg.class()) {
+                        else => unreachable,
+                        .sse => if (ty.abiSize(pt.zcu) <= 16) { // low qword -> dst_regs[0], high qword -> dst_regs[1]
+                            if (self.hasFeature(.avx)) {
+                                try self.asmRegisterRegister(.{ .v_q, .mov }, dst_regs[0].to64(), src_reg.to128());
+                                try self.asmRegisterRegisterImmediate(.{ .vp_q, .extr }, dst_regs[1].to64(), src_reg.to128(), .u(1));
+                            } else if (self.hasFeature(.sse4_1)) {
+                                try self.asmRegisterRegister(.{ ._q, .mov }, dst_regs[0].to64(), src_reg.to128());
+                                try self.asmRegisterRegisterImmediate(.{ .p_q, .extr }, dst_regs[1].to64(), src_reg.to128(), .u(1));
+                            } else {
+                                const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.sse);
+                                const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+                                defer self.register_manager.unlockReg(tmp_lock);
+
+                                try self.asmRegisterRegister(.{ ._q, .mov }, dst_regs[0].to64(), src_reg.to128());
+                                try self.asmRegisterRegister(.{ ._ps, .movhl }, tmp_reg.to128(), src_reg.to128()); // tmp low qword = src high qword
+                                try self.asmRegisterRegister(.{ ._q, .mov }, dst_regs[1].to64(), tmp_reg.to128()); // read tmp, not src: src's low qword is already in dst_regs[0]
+                            }
+                            return;
+                        } else unreachable,
+                    },
+                    else => unreachable,
+                },
                 .register_pair, .memory, .indirect, .load_frame => null,
                 .load_symbol, .load_direct, .load_got, .load_tlv => src: {
                     const src_addr_reg =
@@ -18863,7 +22324,39 @@ fn genSetReg(
         inline .register_pair,
         .register_triple,
         .register_quadruple,
-        => |src_regs| try self.genSetReg(dst_reg, ty, .{ .register = src_regs[0] }, opts),
+        => |src_regs| switch (dst_reg.class()) {
+            .general_purpose => switch (src_regs[0].class()) {
+                .general_purpose => try self.genSetReg(dst_reg, ty, .{ .register = src_regs[0] }, opts),
+                else => unreachable,
+            },
+            .sse => switch (src_regs[0].class()) {
+                .general_purpose => if (abi_size <= 16) {
+                    if (self.hasFeature(.avx)) {
+                        try self.asmRegisterRegister(.{ .v_q, .mov }, dst_reg.to128(), src_regs[0].to64());
+                        try self.asmRegisterRegisterRegisterImmediate(
+                            .{ .vp_q, .insr },
+                            dst_reg.to128(),
+                            dst_reg.to128(),
+                            src_regs[1].to64(),
+                            .u(1),
+                        );
+                    } else if (self.hasFeature(.sse4_1)) {
+                        try self.asmRegisterRegister(.{ ._q, .mov }, dst_reg.to128(), src_regs[0].to64());
+                        try self.asmRegisterRegisterImmediate(.{ .p_q, .insr }, dst_reg.to128(), src_regs[1].to64(), .u(1));
+                    } else {
+                        const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.sse);
+                        const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+                        defer self.register_manager.unlockReg(tmp_lock);
+
+                        try self.asmRegisterRegister(.{ ._q, .mov }, dst_reg.to128(), src_regs[0].to64());
+                        try self.asmRegisterRegister(.{ ._q, .mov }, tmp_reg.to128(), src_regs[1].to64());
+                        try self.asmRegisterRegister(.{ ._ps, .movlh }, dst_reg.to128(), tmp_reg.to128());
+                    }
+                } else unreachable,
+                else => unreachable,
+            },
+            else => unreachable,
+        },
         .register_offset,
         .indirect,
         .load_frame,
@@ -23517,8 +27010,6 @@ fn promoteVarArg(self: *CodeGen, ty: Type) Type {
     }
 }
 
-// ====================================== rewrite starts here ======================================
-
 const Temp = struct {
     index: Air.Inst.Index,
 
@@ -24311,13 +27802,13 @@ const Select = struct {
     }
 
     const Case = struct {
-        required_features: [2]?std.Target.x86.Feature = @splat(null),
+        required_features: [4]?std.Target.x86.Feature = @splat(null),
         dst_constraints: [@intFromEnum(Select.Operand.Ref.src0) - @intFromEnum(Select.Operand.Ref.dst0)]Constraint = @splat(.any),
         src_constraints: [@intFromEnum(Select.Operand.Ref.none) - @intFromEnum(Select.Operand.Ref.src0)]Constraint = @splat(.any),
         patterns: []const Select.Pattern,
-        clobbers: struct { eflags: bool = false } = .{},
         extra_temps: [@intFromEnum(Select.Operand.Ref.dst0) - @intFromEnum(Select.Operand.Ref.tmp0)]TempSpec = @splat(.unused),
         dst_temps: [@intFromEnum(Select.Operand.Ref.src0) - @intFromEnum(Select.Operand.Ref.dst0)]TempSpec.Kind = @splat(.unused),
+        clobbers: struct { eflags: bool = false } = .{},
         each: union(enum) {
             once: []const Instruction,
         },
@@ -24327,9 +27818,32 @@ const Select = struct {
         any,
         any_bool_vec,
         any_int,
+        any_signed_int,
         any_float,
         bool_vec: Memory.Size,
+        vec: Memory.Size,
+        signed_int_vec: Memory.Size,
+        signed_int_or_full_vec: Memory.Size,
+        unsigned_int_vec: Memory.Size,
+        int_or_vec: Memory.Size,
+        exact_remainder_int_or_vec: struct { of: Memory.Size, is: Memory.Size },
         int: Memory.Size,
+        scalar_int: Memory.Size,
+        scalar_signed_int: Memory.Size,
+        scalar_unsigned_int: Memory.Size,
+        scalar_remainder_int: struct { of: Memory.Size, is: Memory.Size },
+        exact_int: u16,
+        exact_signed_int: u16,
+        exact_unsigned_int: u16,
+        signed_or_exact_int: Memory.Size,
+        unsigned_or_exact_int: Memory.Size,
+        po2_int: Memory.Size,
+        signed_po2_int: Memory.Size,
+        unsigned_po2_or_exact_int: Memory.Size,
+        remainder_int: struct { of: Memory.Size, is: Memory.Size },
+        exact_remainder_int: struct { of: Memory.Size, is: Memory.Size },
+        signed_or_exact_remainder_int: struct { of: Memory.Size, is: Memory.Size },
+        unsigned_or_exact_remainder_int: struct { of: Memory.Size, is: Memory.Size },
         signed_int: Memory.Size,
         unsigned_int: Memory.Size,
 
@@ -24338,30 +27852,183 @@ const Select = struct {
             switch (constraint) {
                 .any => return true,
                 .any_bool_vec => return ty.isVector(zcu) and ty.scalarType(zcu).toIntern() == .bool_type,
-                .any_int => {
+                .any_int => return ty.toIntern() == .bool_type or ty.isPtrAtRuntime(zcu) or ty.isAbiInt(zcu),
+                .any_signed_int => return ty.isAbiInt(zcu) and ty.intInfo(zcu).signedness == .signed,
+                .any_float => return ty.scalarType(zcu).isRuntimeFloat(),
+                .bool_vec => |size| return ty.isVector(zcu) and ty.scalarType(zcu).toIntern() == .bool_type and
+                    size.bitSize(cg.target) >= ty.vectorLen(zcu),
+                // Non-bool vector whose ABI size, converted from bytes to bits, fits within `size`.
+                .vec => |size| return ty.isVector(zcu) and ty.scalarType(zcu).toIntern() != .bool_type and
+                    size.bitSize(cg.target) >= 8 * ty.abiSize(zcu),
+                .signed_int_vec => |size| {
+                    if (!ty.isVector(zcu) or size.bitSize(cg.target) < 8 * ty.abiSize(zcu)) return false;
                     const scalar_ty = ty.scalarType(zcu);
-                    return scalar_ty.isAbiInt(zcu) or scalar_ty.isPtrAtRuntime(zcu);
+                    return scalar_ty.isAbiInt(zcu) and scalar_ty.intInfo(zcu).signedness == .signed;
+                },
+                .signed_int_or_full_vec => |size| {
+                    if (!ty.isVector(zcu) or size.bitSize(cg.target) < 8 * ty.abiSize(zcu)) return false;
+                    const scalar_ty = ty.scalarType(zcu);
+                    if (scalar_ty.isPtrAtRuntime(zcu)) return true;
+                    if (!scalar_ty.isAbiInt(zcu)) return false;
+                    const scalar_int_info = scalar_ty.intInfo(zcu);
+                    return switch (scalar_int_info.signedness) {
+                        .signed => true,
+                        .unsigned => scalar_int_info.bits >= 8 and std.math.isPowerOfTwo(scalar_int_info.bits),
+                    };
+                },
+                .unsigned_int_vec => |size| {
+                    if (!ty.isVector(zcu) or size.bitSize(cg.target) < ty.bitSize(zcu)) return false;
+                    const scalar_ty = ty.scalarType(zcu);
+                    if (scalar_ty.isPtrAtRuntime(zcu)) return true;
+                    return scalar_ty.isAbiInt(zcu) and scalar_ty.intInfo(zcu).signedness == .unsigned;
+                },
+                .int_or_vec => |size| {
+                    if (ty.isVector(zcu)) return ty.scalarType(zcu).toIntern() != .bool_type and
+                        size.bitSize(cg.target) >= 8 * ty.abiSize(zcu);
+                    if (ty.toIntern() == .bool_type) return true;
+                    if (ty.isPtrAtRuntime(zcu)) return size.bitSize(cg.target) >= cg.target.ptrBitWidth();
+                    return ty.isAbiInt(zcu) and size.bitSize(cg.target) >= ty.intInfo(zcu).bits;
+                },
+                .exact_remainder_int_or_vec => |of_is| {
+                    if (ty.isVector(zcu)) return ty.scalarType(zcu).toIntern() != .bool_type and
+                        of_is.is.bitSize(cg.target) == (8 * ty.abiSize(zcu) - 1) % of_is.of.bitSize(cg.target) + 1;
+                    if (ty.isPtrAtRuntime(zcu))
+                        return of_is.is.bitSize(cg.target) == (cg.target.ptrBitWidth() - 1) % of_is.of.bitSize(cg.target) + 1;
+                    if (!ty.isAbiInt(zcu)) return false;
+                    return of_is.is.bitSize(cg.target) == (ty.intInfo(zcu).bits - 1) % of_is.of.bitSize(cg.target) + 1;
                 },
-                .any_float => return ty.scalarType(zcu).isRuntimeFloat(),
-                .bool_vec => |size| return ty.isVector(zcu) and
-                    ty.scalarType(zcu).toIntern() == .bool_type and ty.vectorLen(zcu) <= size.bitSize(cg.target),
                 .int => |size| {
+                    if (ty.toIntern() == .bool_type) return true;
+                    if (ty.isPtrAtRuntime(zcu)) return size.bitSize(cg.target) >= cg.target.ptrBitWidth();
+                    return ty.isAbiInt(zcu) and size.bitSize(cg.target) >= ty.intInfo(zcu).bits;
+                },
+                .scalar_int => |size| {
                     const scalar_ty = ty.scalarType(zcu);
-                    if (scalar_ty.isPtrAtRuntime(zcu)) return cg.target.ptrBitWidth() <= size.bitSize(cg.target);
-                    return scalar_ty.isAbiInt(zcu) and scalar_ty.intInfo(zcu).bits <= size.bitSize(cg.target);
+                    if (scalar_ty.isPtrAtRuntime(zcu)) return size.bitSize(cg.target) >= cg.target.ptrBitWidth();
+                    return scalar_ty.isAbiInt(zcu) and size.bitSize(cg.target) >= scalar_ty.intInfo(zcu).bits;
                 },
-                .signed_int => |size| {
+                .scalar_signed_int => |size| {
                     const scalar_ty = ty.scalarType(zcu);
                     if (!scalar_ty.isAbiInt(zcu)) return false;
-                    const info = scalar_ty.intInfo(zcu);
-                    return info.signedness == .signed and info.bits <= size.bitSize(cg.target);
+                    const scalar_int_info = scalar_ty.intInfo(zcu);
+                    return scalar_int_info.signedness == .signed and size.bitSize(cg.target) >= scalar_int_info.bits;
                 },
-                .unsigned_int => |size| {
+                .scalar_unsigned_int => |size| {
+                    const scalar_ty = ty.scalarType(zcu);
+                    if (scalar_ty.isPtrAtRuntime(zcu)) return size.bitSize(cg.target) >= cg.target.ptrBitWidth();
+                    if (!scalar_ty.isAbiInt(zcu)) return false;
+                    const scalar_int_info = scalar_ty.intInfo(zcu);
+                    return scalar_int_info.signedness == .unsigned and size.bitSize(cg.target) >= scalar_int_info.bits;
+                },
+                .scalar_remainder_int => |of_is| {
                     const scalar_ty = ty.scalarType(zcu);
-                    if (scalar_ty.isPtrAtRuntime(zcu)) return cg.target.ptrBitWidth() <= size.bitSize(cg.target);
+                    if (scalar_ty.isPtrAtRuntime(zcu))
+                        return of_is.is.bitSize(cg.target) >= (cg.target.ptrBitWidth() - 1) % of_is.of.bitSize(cg.target) + 1;
                     if (!scalar_ty.isAbiInt(zcu)) return false;
-                    const info = scalar_ty.intInfo(zcu);
-                    return info.signedness == .unsigned and info.bits <= size.bitSize(cg.target);
+                    return of_is.is.bitSize(cg.target) >= (scalar_ty.intInfo(zcu).bits - 1) % of_is.of.bitSize(cg.target) + 1;
+                },
+                .exact_int => |bit_size| {
+                    if (ty.toIntern() == .bool_type) return bit_size == 1;
+                    if (ty.isPtrAtRuntime(zcu)) return bit_size == cg.target.ptrBitWidth();
+                    return ty.isAbiInt(zcu) and bit_size == ty.intInfo(zcu).bits;
+                },
+                .exact_signed_int => |bit_size| {
+                    if (!ty.isAbiInt(zcu)) return false;
+                    const int_info = ty.intInfo(zcu);
+                    return int_info.signedness == .signed and bit_size == int_info.bits;
+                },
+                .exact_unsigned_int => |bit_size| {
+                    if (ty.toIntern() == .bool_type) return bit_size == 1;
+                    if (ty.isPtrAtRuntime(zcu)) return bit_size == cg.target.ptrBitWidth();
+                    if (!ty.isAbiInt(zcu)) return false;
+                    const int_info = ty.intInfo(zcu);
+                    return int_info.signedness == .unsigned and bit_size == int_info.bits;
+                },
+                .signed_or_exact_int => |size| {
+                    if (ty.isPtrAtRuntime(zcu)) return size.bitSize(cg.target) == cg.target.ptrBitWidth();
+                    if (!ty.isAbiInt(zcu)) return false;
+                    const int_info = ty.intInfo(zcu);
+                    return switch (int_info.signedness) {
+                        .signed => size.bitSize(cg.target) >= int_info.bits,
+                        .unsigned => size.bitSize(cg.target) == int_info.bits,
+                    };
+                },
+                // Unsigned integers (including bool and runtime pointers) may be narrower than `size`;
+                // signed integers must match `size` exactly.
+                .unsigned_or_exact_int => |size| {
+                    if (ty.toIntern() == .bool_type) return true;
+                    // Pointers are size-checked like the other unsigned constraints instead of always matching.
+                    if (ty.isPtrAtRuntime(zcu)) return size.bitSize(cg.target) >= cg.target.ptrBitWidth();
+                    if (!ty.isAbiInt(zcu)) return false;
+                    const int_info = ty.intInfo(zcu);
+                    return switch (int_info.signedness) {
+                        .signed => size.bitSize(cg.target) == int_info.bits,
+                        .unsigned => size.bitSize(cg.target) >= int_info.bits,
+                    };
+                },
+                .po2_int => |size| {
+                    if (ty.toIntern() == .bool_type) return true;
+                    if (ty.isPtrAtRuntime(zcu)) return size.bitSize(cg.target) >= cg.target.ptrBitWidth();
+                    if (!ty.isAbiInt(zcu)) return false;
+                    const bit_size = ty.intInfo(zcu).bits;
+                    return std.math.isPowerOfTwo(bit_size) and size.bitSize(cg.target) >= bit_size;
+                },
+                .signed_po2_int => |size| {
+                    if (!ty.isAbiInt(zcu)) return false;
+                    const int_info = ty.intInfo(zcu);
+                    return int_info.signedness == .signed and std.math.isPowerOfTwo(int_info.bits) and
+                        size.bitSize(cg.target) >= int_info.bits;
+                },
+                .unsigned_po2_or_exact_int => |size| {
+                    if (ty.toIntern() == .bool_type) return true;
+                    if (ty.isPtrAtRuntime(zcu)) return size.bitSize(cg.target) >= cg.target.ptrBitWidth();
+                    if (!ty.isAbiInt(zcu)) return false;
+                    const int_info = ty.intInfo(zcu);
+                    return switch (int_info.signedness) {
+                        .signed => size.bitSize(cg.target) == int_info.bits,
+                        .unsigned => std.math.isPowerOfTwo(int_info.bits) and size.bitSize(cg.target) >= int_info.bits,
+                    };
+                },
+                .remainder_int => |of_is| {
+                    if (ty.toIntern() == .bool_type) return true;
+                    if (ty.isPtrAtRuntime(zcu))
+                        return of_is.is.bitSize(cg.target) >= (cg.target.ptrBitWidth() - 1) % of_is.of.bitSize(cg.target) + 1;
+                    if (!ty.isAbiInt(zcu)) return false;
+                    return of_is.is.bitSize(cg.target) >= (ty.intInfo(zcu).bits - 1) % of_is.of.bitSize(cg.target) + 1;
+                },
+                .exact_remainder_int => |of_is| {
+                    if (ty.isPtrAtRuntime(zcu))
+                        return of_is.is.bitSize(cg.target) == (cg.target.ptrBitWidth() - 1) % of_is.of.bitSize(cg.target) + 1;
+                    if (!ty.isAbiInt(zcu)) return false;
+                    return of_is.is.bitSize(cg.target) == (ty.intInfo(zcu).bits - 1) % of_is.of.bitSize(cg.target) + 1;
+                },
+                .signed_or_exact_remainder_int => |of_is| {
+                    if (ty.isPtrAtRuntime(zcu))
+                        return of_is.is.bitSize(cg.target) == (cg.target.ptrBitWidth() - 1) % of_is.of.bitSize(cg.target) + 1;
+                    if (!ty.isAbiInt(zcu)) return false;
+                    const int_info = ty.intInfo(zcu);
+                    return switch (int_info.signedness) {
+                        .signed => of_is.is.bitSize(cg.target) >= (int_info.bits - 1) % of_is.of.bitSize(cg.target) + 1,
+                        .unsigned => of_is.is.bitSize(cg.target) == (int_info.bits - 1) % of_is.of.bitSize(cg.target) + 1,
+                    };
+                },
+                .unsigned_or_exact_remainder_int => |of_is| {
+                    if (ty.toIntern() == .bool_type) return true;
+                    if (ty.isPtrAtRuntime(zcu))
+                        return of_is.is.bitSize(cg.target) >= (cg.target.ptrBitWidth() - 1) % of_is.of.bitSize(cg.target) + 1;
+                    if (!ty.isAbiInt(zcu)) return false;
+                    const int_info = ty.intInfo(zcu);
+                    return switch (int_info.signedness) {
+                        .signed => of_is.is.bitSize(cg.target) == (int_info.bits - 1) % of_is.of.bitSize(cg.target) + 1,
+                        .unsigned => of_is.is.bitSize(cg.target) >= (int_info.bits - 1) % of_is.of.bitSize(cg.target) + 1,
+                    };
+                },
+                .signed_int => |size| {
+                    if (!ty.isAbiInt(zcu)) return false;
+                    const int_info = ty.intInfo(zcu);
+                    return int_info.signedness == .signed and size.bitSize(cg.target) >= int_info.bits;
+                },
+                .unsigned_int => |size| {
+                    if (ty.toIntern() == .bool_type) return true;
+                    if (ty.isPtrAtRuntime(zcu)) return size.bitSize(cg.target) >= cg.target.ptrBitWidth();
+                    if (!ty.isAbiInt(zcu)) return false;
+                    const int_info = ty.intInfo(zcu);
+                    return int_info.signedness == .unsigned and size.bitSize(cg.target) >= int_info.bits;
                 },
             }
         }
@@ -24379,97 +28046,107 @@ const Select = struct {
             imm32,
             simm32,
             mem,
-            mut_mem,
             to_mem,
+            mut_mem,
+            to_mut_mem,
             gpr,
+            to_gpr,
             mut_gpr,
+            to_mut_gpr,
             mm,
+            to_mm,
             mut_mm,
+            to_mut_mm,
             xmm,
+            to_xmm,
             mut_xmm,
+            to_mut_xmm,
             ymm,
+            to_ymm,
             mut_ymm,
+            to_mut_ymm,
 
             fn matches(src: Src, temp: Temp, cg: *CodeGen) bool {
-                switch (src) {
+                return switch (src) {
                     .none => unreachable,
-                    .any => return true,
-                    .imm8 => return switch (temp.tracking(cg).short) {
+                    .any => true,
+                    .imm8 => switch (temp.tracking(cg).short) {
                         .immediate => |imm| std.math.cast(u8, imm) != null,
                         else => false,
                     },
-                    .imm16 => return switch (temp.tracking(cg).short) {
+                    .imm16 => switch (temp.tracking(cg).short) {
                         .immediate => |imm| std.math.cast(u16, imm) != null,
                         else => false,
                     },
-                    .imm32 => return switch (temp.tracking(cg).short) {
+                    .imm32 => switch (temp.tracking(cg).short) {
                         .immediate => |imm| std.math.cast(u32, imm) != null,
                         else => false,
                     },
-                    .simm32 => return switch (temp.tracking(cg).short) {
+                    .simm32 => switch (temp.tracking(cg).short) {
                         .immediate => |imm| std.math.cast(i32, @as(i64, @bitCast(imm))) != null,
                         else => false,
                     },
-                    .mem => return temp.tracking(cg).short.isMemory(),
-                    .mut_mem => return temp.isMut(cg) and temp.tracking(cg).short.isMemory(),
-                    .to_mem => return true,
-                    .gpr, .mut_gpr => {
-                        const mcv = temp.tracking(cg).short;
-                        const abi_size = temp.typeOf(cg).abiSize(cg.pt.zcu);
-                        return abi_size <= 8 and switch (mcv) {
-                            .register => |reg| reg.class() == .general_purpose,
-                            .register_offset => |reg_off| reg_off.reg.class() == .general_purpose and
-                                reg_off.off == 0,
-                            .register_pair, .register_triple, .register_quadruple => false,
-                            else => true,
-                        };
+                    .mem => temp.tracking(cg).short.isMemory(),
+                    .to_mem, .to_mut_mem => true,
+                    .mut_mem => temp.isMut(cg) and temp.tracking(cg).short.isMemory(),
+                    .gpr => temp.typeOf(cg).abiSize(cg.pt.zcu) <= 8 and switch (temp.tracking(cg).short) {
+                        .register => |reg| reg.class() == .general_purpose,
+                        .register_offset => |reg_off| reg_off.reg.class() == .general_purpose and reg_off.off == 0,
+                        else => false,
                     },
-                    .mm, .mut_mm => {
-                        const mcv = temp.tracking(cg).short;
-                        const abi_size = temp.typeOf(cg).abiSize(cg.pt.zcu);
-                        return abi_size <= 8 and switch (mcv) {
-                            .register => |reg| reg.class() == .mmx,
-                            .register_offset => |reg_off| reg_off.reg.class() == .mmx and
-                                reg_off.off == 0,
-                            else => false,
-                        };
+                    .mut_gpr => temp.isMut(cg) and temp.typeOf(cg).abiSize(cg.pt.zcu) <= 8 and switch (temp.tracking(cg).short) {
+                        .register => |reg| reg.class() == .general_purpose,
+                        .register_offset => |reg_off| reg_off.reg.class() == .general_purpose and reg_off.off == 0,
+                        else => false,
                     },
-                    .xmm, .mut_xmm => {
-                        const mcv = temp.tracking(cg).short;
-                        const abi_size = temp.typeOf(cg).abiSize(cg.pt.zcu);
-                        return abi_size > 8 and abi_size <= 16 and switch (mcv) {
-                            .register => |reg| reg.class() == .sse,
-                            .register_offset => |reg_off| reg_off.reg.class() == .sse and
-                                reg_off.off == 0,
-                            .register_pair, .register_triple, .register_quadruple => false,
-                            else => true,
-                        };
+                    .to_gpr, .to_mut_gpr => temp.typeOf(cg).abiSize(cg.pt.zcu) <= 8,
+                    .mm => temp.typeOf(cg).abiSize(cg.pt.zcu) == 8 and switch (temp.tracking(cg).short) {
+                        .register => |reg| reg.class() == .mmx,
+                        .register_offset => |reg_off| reg_off.reg.class() == .mmx and reg_off.off == 0,
+                        else => false,
                     },
-                    .ymm, .mut_ymm => {
-                        const mcv = temp.tracking(cg).short;
-                        const abi_size = temp.typeOf(cg).abiSize(cg.pt.zcu);
-                        return abi_size > 16 and abi_size <= 32 and switch (mcv) {
-                            .register => |reg| reg.class() == .sse,
-                            .register_offset => |reg_off| reg_off.reg.class() == .sse and
-                                reg_off.off == 0,
-                            .register_pair, .register_triple, .register_quadruple => false,
-                            else => true,
-                        };
+                    .mut_mm => temp.isMut(cg) and temp.typeOf(cg).abiSize(cg.pt.zcu) == 8 and switch (temp.tracking(cg).short) {
+                        .register => |reg| reg.class() == .mmx,
+                        .register_offset => |reg_off| reg_off.reg.class() == .mmx and reg_off.off == 0,
+                        else => false,
                     },
-                }
+                    .to_mm, .to_mut_mm => temp.typeOf(cg).abiSize(cg.pt.zcu) == 8,
+                    .xmm => temp.typeOf(cg).abiSize(cg.pt.zcu) == 16 and switch (temp.tracking(cg).short) {
+                        .register => |reg| reg.class() == .sse,
+                        .register_offset => |reg_off| reg_off.reg.class() == .sse and reg_off.off == 0,
+                        else => false,
+                    },
+                    .mut_xmm => temp.isMut(cg) and temp.typeOf(cg).abiSize(cg.pt.zcu) == 16 and switch (temp.tracking(cg).short) {
+                        .register => |reg| reg.class() == .sse,
+                        .register_offset => |reg_off| reg_off.reg.class() == .sse and reg_off.off == 0,
+                        else => false,
+                    },
+                    .to_xmm, .to_mut_xmm => temp.typeOf(cg).abiSize(cg.pt.zcu) == 16,
+                    .ymm => temp.typeOf(cg).abiSize(cg.pt.zcu) == 32 and switch (temp.tracking(cg).short) {
+                        .register => |reg| reg.class() == .sse,
+                        .register_offset => |reg_off| reg_off.reg.class() == .sse and reg_off.off == 0,
+                        else => false,
+                    },
+                    .mut_ymm => temp.isMut(cg) and temp.typeOf(cg).abiSize(cg.pt.zcu) == 32 and switch (temp.tracking(cg).short) {
+                        .register => |reg| reg.class() == .sse,
+                        .register_offset => |reg_off| reg_off.reg.class() == .sse and reg_off.off == 0,
+                        else => false,
+                    },
+                    .to_ymm, .to_mut_ymm => temp.typeOf(cg).abiSize(cg.pt.zcu) == 32,
+                };
             }
 
             fn convert(src: Src, temp: *Temp, cg: *CodeGen) !bool {
                 return switch (src) {
                     .none => unreachable,
                     .any, .imm8, .imm16, .imm32, .simm32 => false,
-                    .mem, .mut_mem, .to_mem => try temp.toBase(cg),
-                    .gpr => try temp.toRegClass(false, .general_purpose, cg),
-                    .mut_gpr => try temp.toRegClass(true, .general_purpose, cg),
-                    .mm => try temp.toRegClass(false, .mmx, cg),
-                    .mut_mm => try temp.toRegClass(true, .mmx, cg),
-                    .xmm, .ymm => try temp.toRegClass(false, .sse, cg),
-                    .mut_xmm, .mut_ymm => try temp.toRegClass(true, .sse, cg),
+                    .mem, .to_mem, .mut_mem, .to_mut_mem => try temp.toBase(cg),
+                    .gpr, .to_gpr => try temp.toRegClass(false, .general_purpose, cg),
+                    .mut_gpr, .to_mut_gpr => try temp.toRegClass(true, .general_purpose, cg),
+                    .mm, .to_mm => try temp.toRegClass(false, .mmx, cg),
+                    .mut_mm, .to_mut_mm => try temp.toRegClass(true, .mmx, cg),
+                    .xmm, .to_xmm, .ymm, .to_ymm => try temp.toRegClass(false, .sse, cg),
+                    .mut_xmm, .to_mut_xmm, .mut_ymm, .to_mut_ymm => try temp.toRegClass(true, .sse, cg),
                 };
             }
         };
@@ -24489,6 +28166,10 @@ const Select = struct {
             rc: Register.Class,
             rc_mask: struct { rc: Register.Class, info: MaskInfo },
             mem,
+            smin_mem: Select.Operand.Ref,
+            smax_mem: Select.Operand.Ref,
+            umin_mem: Select.Operand.Ref,
+            umax_mem: Select.Operand.Ref,
             ref: Select.Operand.Ref,
             ref_mask: struct { ref: Select.Operand.Ref, info: MaskInfo },
 
@@ -24501,14 +28182,81 @@ const Select = struct {
         };
 
         fn create(spec: TempSpec, s: *Select) !?Temp {
+            const cg = s.cg;
             return switch (spec.kind) {
                 .unused => null,
-                .any => try s.cg.tempAlloc(spec.type),
-                .cc => |cc| try s.cg.tempFromValue(spec.type, .{ .eflags = cc }),
-                .reg => |reg| try s.cg.tempFromValue(spec.type, .{ .register = reg }),
-                .rc => |rc| try s.cg.tempAllocReg(spec.type, regSetForRegClass(rc)),
-                .rc_mask => |rc_mask| try s.cg.tempAllocReg(spec.type, regSetForRegClass(rc_mask.rc)),
-                .mem => try s.cg.tempAllocMem(spec.type),
+                .any => try cg.tempAlloc(spec.type),
+                .cc => |cc| try cg.tempFromValue(spec.type, .{ .eflags = cc }),
+                .reg => |reg| try cg.tempFromValue(spec.type, .{ .register = reg }),
+                .rc => |rc| try cg.tempAllocReg(spec.type, regSetForRegClass(rc)),
+                .rc_mask => |rc_mask| try cg.tempAllocReg(spec.type, regSetForRegClass(rc_mask.rc)),
+                .mem => try cg.tempAllocMem(spec.type),
+                .smin_mem, .smax_mem, .umin_mem, .umax_mem => |ty_ref| {
+                    const pt = cg.pt;
+                    const zcu = pt.zcu;
+                    const ip = &zcu.intern_pool;
+                    const ty = ty_ref.deref(s).typeOf(s.cg);
+                    const vector_len, const scalar_ty: Type = switch (ip.indexToKey(ty.toIntern())) {
+                        else => .{ null, ty },
+                        .vector_type => |vector_type| .{ vector_type.len, .fromInterned(vector_type.child) },
+                    };
+                    const res_scalar_ty, const res_scalar_val: Value = res_scalar: switch (scalar_ty.toIntern()) {
+                        .bool_type => .{
+                            scalar_ty,
+                            .fromInterned(switch (spec.kind) {
+                                else => unreachable,
+                                .smin_mem, .umax_mem => .bool_true,
+                                .smax_mem, .umin_mem => .bool_false,
+                            }),
+                        },
+                        else => {
+                            const scalar_info: InternPool.Key.IntType = if (scalar_ty.isAbiInt(zcu))
+                                scalar_ty.intInfo(zcu)
+                            else
+                                .{ .signedness = .unsigned, .bits = @intCast(scalar_ty.bitSize(zcu)) };
+                            const scalar_int_ty = try pt.intType(scalar_info.signedness, scalar_info.bits);
+                            if (scalar_info.bits <= 64) {
+                                const int_val: i64 = switch (spec.kind) {
+                                    else => unreachable,
+                                    .smin_mem => std.math.minInt(i64),
+                                    .smax_mem => std.math.maxInt(i64),
+                                    .umin_mem => 0,
+                                    .umax_mem => -1,
+                                };
+                                const shift: u6 = @intCast(64 - scalar_info.bits);
+                                break :res_scalar .{ scalar_int_ty, switch (scalar_info.signedness) {
+                                    .signed => try pt.intValue_i64(scalar_int_ty, int_val >> shift),
+                                    .unsigned => try pt.intValue_u64(scalar_int_ty, @as(u64, @bitCast(int_val)) >> shift),
+                                } };
+                            }
+                            var big_int: std.math.big.int.Managed = try .init(cg.gpa);
+                            defer big_int.deinit();
+                            try big_int.setTwosCompIntLimit(switch (spec.kind) {
+                                else => unreachable,
+                                .smin_mem, .umin_mem => .min,
+                                .smax_mem, .umax_mem => .max,
+                            }, switch (spec.kind) {
+                                else => unreachable,
+                                .smin_mem, .smax_mem => .signed,
+                                .umin_mem, .umax_mem => .unsigned,
+                            }, scalar_info.bits);
+                            try big_int.truncate(&big_int, scalar_info.signedness, scalar_info.bits);
+                            break :res_scalar .{ scalar_int_ty, try pt.intValue_big(scalar_int_ty, big_int.toConst()) };
+                        },
+                    };
+                    const res_ty, const res_val: Value = if (vector_len) |len| res: {
+                        const vector_ty = try pt.vectorType(.{
+                            .len = len,
+                            .child = res_scalar_ty.toIntern(),
+                        });
+                        const vector_val = try pt.intern(.{ .aggregate = .{
+                            .ty = vector_ty.toIntern(),
+                            .storage = .{ .repeated_elem = res_scalar_val.toIntern() },
+                        } });
+                        break :res .{ vector_ty, .fromInterned(vector_val) };
+                    } else .{ res_scalar_ty, res_scalar_val };
+                    return try cg.tempFromValue(res_ty, try cg.genTypedValue(res_val));
+                },
                 .ref => |ref| ref.deref(s),
                 .ref_mask => |ref_mask| ref_mask.ref.deref(s),
             };
@@ -24541,21 +28289,51 @@ const Select = struct {
             forward_label,
             ref,
             simm,
+            uimm,
             lea,
             mem,
         };
-        const Adjust = enum {
-            none,
-            add_ptr_size,
-            sub_ptr_size,
-            add_ptr_bit_size,
-            sub_ptr_bit_size,
-            add_size,
-            sub_size,
-            add_len,
-            sub_len,
-            add_elem_limbs,
-            sub_elem_limbs,
+        /// Describes a term that is added to an operand's immediate when the operand is lowered:
+        /// `adjustedImm` evaluates `imm + factor * scale.toFactor() * amount`.
+        /// The three fields are packed into a single byte.
+        const Adjust = packed struct(u8) {
+            /// Signed multiplier for the term; the presets declared below use -1, 0 and 1.
+            factor: i2,
+            /// Second multiplier for the term, applied through `scale.toFactor()`.
+            scale: Memory.Scale,
+            /// Selects the quantity that is multiplied and added; resolved in `adjustedImm`.
+            amount: enum(u4) {
+                /// Contributes zero.
+                none,
+                /// Target pointer width in bytes.
+                ptr_size,
+                /// Target pointer width in bits.
+                ptr_bit_size,
+                /// ABI size of the type of the operand's base ref.
+                size,
+                /// ABI size of the type of `src0`.
+                src0_size,
+                /// Bit size of the scalar type of the operand's base ref.
+                bit_size,
+                /// Bit size of the scalar type of `src0`.
+                src0_bit_size,
+                /// Vector length of the type of the operand's base ref.
+                len,
+                /// Scalar ABI size of the base ref's type divided by the byte width of `base.size`.
+                elem_limbs,
+                /// ABI size of the scalar type of `src0`.
+                src0_elem_size,
+                /// Minimum of the immediate type shifted right by the negated scalar bit size.
+                /// NOTE(review): presumably the smallest signed value of that bit width - confirm.
+                smin,
+                /// Maximum of the immediate type shifted right by the negated scalar bit size.
+                /// NOTE(review): presumably the largest signed value of that bit width - confirm.
+                smax,
+                /// All-ones unsigned value shifted right by the negated scalar bit size.
+                /// NOTE(review): presumably the largest unsigned value of that bit width - confirm.
+                umax,
+            },
+
+            // Named presets: `add_*` use factor 1, `sub_*` use factor -1, and a number in the
+            // name (e.g. `add_2_bit_size`) is the `scale` applied to the amount.
+            const none: Adjust = .{ .factor = 0, .scale = .@"1", .amount = .none };
+            const sub_ptr_size: Adjust = .{ .factor = -1, .scale = .@"1", .amount = .ptr_size };
+            const add_ptr_bit_size: Adjust = .{ .factor = 1, .scale = .@"1", .amount = .ptr_bit_size };
+            const add_size: Adjust = .{ .factor = 1, .scale = .@"1", .amount = .size };
+            const sub_size: Adjust = .{ .factor = -1, .scale = .@"1", .amount = .size };
+            const add_src0_size: Adjust = .{ .factor = 1, .scale = .@"1", .amount = .src0_size };
+            const sub_src0_size: Adjust = .{ .factor = -1, .scale = .@"1", .amount = .src0_size };
+            const add_2_bit_size: Adjust = .{ .factor = 1, .scale = .@"2", .amount = .bit_size };
+            const add_bit_size: Adjust = .{ .factor = 1, .scale = .@"1", .amount = .bit_size };
+            const sub_bit_size: Adjust = .{ .factor = -1, .scale = .@"1", .amount = .bit_size };
+            const add_src0_bit_size: Adjust = .{ .factor = 1, .scale = .@"1", .amount = .src0_bit_size };
+            const sub_src0_bit_size: Adjust = .{ .factor = -1, .scale = .@"1", .amount = .src0_bit_size };
+            const add_8_len: Adjust = .{ .factor = 1, .scale = .@"8", .amount = .len };
+            const add_4_len: Adjust = .{ .factor = 1, .scale = .@"4", .amount = .len };
+            const add_3_len: Adjust = .{ .factor = 1, .scale = .@"3", .amount = .len };
+            const add_2_len: Adjust = .{ .factor = 1, .scale = .@"2", .amount = .len };
+            const add_len: Adjust = .{ .factor = 1, .scale = .@"1", .amount = .len };
+            const sub_len: Adjust = .{ .factor = -1, .scale = .@"1", .amount = .len };
+            const add_src0_elem_size: Adjust = .{ .factor = 1, .scale = .@"1", .amount = .src0_elem_size };
+            const sub_src0_elem_size: Adjust = .{ .factor = -1, .scale = .@"1", .amount = .src0_elem_size };
+            const add_elem_limbs: Adjust = .{ .factor = 1, .scale = .@"1", .amount = .elem_limbs };
+            const add_umax: Adjust = .{ .factor = 1, .scale = .@"1", .amount = .umax };
         };
         const Ref = enum(u4) {
             tmp0,
@@ -24741,15 +28519,24 @@ const Select = struct {
         const src1x: Select.Operand = .{ .tag = .ref, .base = .src1x };
         const src1y: Select.Operand = .{ .tag = .ref, .base = .src1y };
 
-        fn i(imm: i32) Select.Operand {
+        /// Builds an operand tagged `.simm` whose immediate is the constant `imm`.
+        /// (Renamed from `i`; the unsigned counterpart is `ui`.)
+        fn si(imm: i32) Select.Operand {
             return .{ .tag = .simm, .imm = imm };
         }
-        fn a(base: Ref.Sized, adjust: Adjust) Select.Operand {
+        /// Builds an operand tagged `.simm` whose value comes from `adjust`, evaluated against `base`.
+        /// The `imm` field is left at its default. (Renamed from `a`.)
+        fn sa(base: Ref.Sized, adjust: Adjust) Select.Operand {
             return .{ .tag = .simm, .base = base, .adjust = adjust };
         }
-        fn ia(imm: i32, base: Ref.Sized, adjust: Adjust) Select.Operand {
+        /// Builds an operand tagged `.simm` combining the constant `imm` with `adjust`,
+        /// which is evaluated against `base`. (Renamed from `ia`.)
+        fn sia(imm: i32, base: Ref.Sized, adjust: Adjust) Select.Operand {
             return .{ .tag = .simm, .base = base, .adjust = adjust, .imm = imm };
         }
+        /// Builds an operand tagged `.uimm` whose immediate is the constant `imm`.
+        /// `lower` evaluates `.uimm` operands as `i64` and bit-casts the result to unsigned.
+        fn ui(imm: i32) Select.Operand {
+            return .{ .tag = .uimm, .imm = imm };
+        }
+        /// Builds an operand tagged `.uimm` whose value comes from `adjust`, evaluated against `base`.
+        /// The `imm` field is left at its default.
+        fn ua(base: Ref.Sized, adjust: Adjust) Select.Operand {
+            return .{ .tag = .uimm, .base = base, .adjust = adjust };
+        }
+        /// Builds an operand tagged `.uimm` combining the constant `imm` with `adjust`,
+        /// which is evaluated against `base`.
+        fn uia(imm: i32, base: Ref.Sized, adjust: Adjust) Select.Operand {
+            return .{ .tag = .uimm, .base = base, .adjust = adjust, .imm = imm };
+        }
 
         fn lea(size: Memory.Size, base: Ref) Select.Operand {
             return .{
@@ -24757,6 +28544,13 @@ const Select = struct {
                 .base = .{ .ref = base, .size = size },
             };
         }
+        /// Builds a `.lea` operand of size `size` based on `base`, with no index.
+        /// Its displacement is supplied by `adjust` (resolved through `adjustedImm` in `lower`).
+        fn leaa(size: Memory.Size, base: Ref, adjust: Adjust) Select.Operand {
+            return .{
+                .tag = .lea,
+                .base = .{ .ref = base, .size = size },
+                .adjust = adjust,
+            };
+        }
         fn lead(size: Memory.Size, base: Ref, disp: i32) Select.Operand {
             return .{
                 .tag = .lea,
@@ -24768,14 +28562,22 @@ const Select = struct {
             return .{
                 .tag = .lea,
                 .base = .{ .ref = base, .size = size },
-                .index_ = .{ .ref = index, .scale = .@"1" },
+                .index = .{ .ref = index, .scale = .@"1" },
+            };
+        }
+        /// Builds a `.lea` operand of size `size` based on `base`, indexed by `index` with scale 1.
+        /// Its displacement is supplied by `adjust` (resolved through `adjustedImm` in `lower`).
+        fn leaia(size: Memory.Size, base: Ref, index: Ref, adjust: Adjust) Select.Operand {
+            return .{
+                .tag = .lea,
+                .base = .{ .ref = base, .size = size },
+                .index = .{ .ref = index, .scale = .@"1" },
+                .adjust = adjust,
             };
         }
+        /// Builds a `.lea` operand of size `size` based on `base`, indexed by `index` with scale 1
+        /// and using the constant displacement `disp`.
         fn leaid(size: Memory.Size, base: Ref, index: Ref, disp: i32) Select.Operand {
             return .{
                 .tag = .lea,
                 .base = .{ .ref = base, .size = size },
-                .index_ = .{ .ref = index, .scale = .@"1" },
+                .index = .{ .ref = index, .scale = .@"1" },
                 .imm = disp,
             };
         }
@@ -24783,22 +28585,22 @@ const Select = struct {
             return .{
                 .tag = .lea,
                 .base = .{ .ref = base, .size = size },
-                .index_ = .{ .ref = index, .scale = scale },
+                .index = .{ .ref = index, .scale = scale },
             };
         }
+        /// Builds a `.lea` operand of size `size` based on `base`, indexed by `index` multiplied
+        /// by `scale` and using the constant displacement `disp`.
         fn leasid(size: Memory.Size, base: Ref, scale: Memory.Scale, index: Ref, disp: i32) Select.Operand {
             return .{
                 .tag = .lea,
                 .base = .{ .ref = base, .size = size },
-                .index_ = .{ .ref = index, .scale = scale },
+                .index = .{ .ref = index, .scale = scale },
                 .imm = disp,
             };
         }
-        fn leasida(size: Memory.Size, base: Ref, scale: Memory.Scale, index: Ref, disp: i32, adjust: Adjust) Select.Operand {
+        fn leasiad(size: Memory.Size, base: Ref, scale: Memory.Scale, index: Ref, adjust: Adjust, disp: i32) Select.Operand {
             return .{
                 .tag = .lea,
                 .base = .{ .ref = base, .size = size },
-                .index_ = .{ .ref = index, .scale = scale },
+                .index = .{ .ref = index, .scale = scale },
                 .adjust = adjust,
                 .imm = disp,
             };
@@ -24817,6 +28619,21 @@ const Select = struct {
                 .imm = disp,
             };
         }
+        /// Builds a `.mem` operand on `base` with no index.
+        /// Its displacement is supplied by `adjust` (resolved through `adjustedImm` in `lower`).
+        fn mema(base: Ref.Sized, adjust: Adjust) Select.Operand {
+            return .{
+                .tag = .mem,
+                .base = base,
+                .adjust = adjust,
+            };
+        }
+        /// Builds a `.mem` operand on `base` with no index.
+        /// Its displacement is the constant `disp` plus the term described by `adjust`.
+        fn memad(base: Ref.Sized, adjust: Adjust, disp: i32) Select.Operand {
+            return .{
+                .tag = .mem,
+                .base = base,
+                .adjust = adjust,
+                .imm = disp,
+            };
+        }
         fn memi(base: Ref.Sized, index: Ref) Select.Operand {
             return .{
                 .tag = .mem,
@@ -24832,6 +28649,15 @@ const Select = struct {
                 .adjust = adjust,
             };
         }
+        /// Builds a `.mem` operand on `base`, indexed by `index` with scale 1.
+        /// Its displacement is the constant `disp` plus the term described by `adjust`.
+        fn memiad(base: Ref.Sized, index: Ref, adjust: Adjust, disp: i32) Select.Operand {
+            return .{
+                .tag = .mem,
+                .base = base,
+                .index = .{ .ref = index, .scale = .@"1" },
+                .adjust = adjust,
+                .imm = disp,
+            };
+        }
         fn memid(base: Ref.Sized, index: Ref, disp: i32) Select.Operand {
             return .{
                 .tag = .mem,
@@ -24847,6 +28673,14 @@ const Select = struct {
                 .index = .{ .ref = index, .scale = scale },
             };
         }
+        /// Builds a `.mem` operand on `base`, indexed by `index` multiplied by `scale`.
+        /// Its displacement is supplied by `adjust` (resolved through `adjustedImm` in `lower`).
+        fn memsia(base: Ref.Sized, scale: Memory.Scale, index: Ref, adjust: Adjust) Select.Operand {
+            return .{
+                .tag = .mem,
+                .base = base,
+                .index = .{ .ref = index, .scale = scale },
+                .adjust = adjust,
+            };
+        }
         fn memsid(base: Ref.Sized, scale: Memory.Scale, index: Ref, disp: i32) Select.Operand {
             return .{
                 .tag = .mem,
@@ -24855,7 +28689,7 @@ const Select = struct {
                 .imm = disp,
             };
         }
-        fn memsida(base: Ref.Sized, scale: Memory.Scale, index: Ref, disp: i32, adjust: Adjust) Select.Operand {
+        fn memsiad(base: Ref.Sized, scale: Memory.Scale, index: Ref, adjust: Adjust, disp: i32) Select.Operand {
             return .{
                 .tag = .mem,
                 .base = base,
@@ -24865,26 +28699,34 @@ const Select = struct {
             };
         }
 
-        fn adjustedImm(op: Select.Operand, s: *const Select) i32 {
-            return switch (op.adjust) {
-                .none => op.imm,
-                .add_ptr_size => op.imm + @divExact(s.cg.target.ptrBitWidth(), 8),
-                .sub_ptr_size => op.imm - @divExact(s.cg.target.ptrBitWidth(), 8),
-                .add_ptr_bit_size => op.imm + s.cg.target.ptrBitWidth(),
-                .sub_ptr_bit_size => op.imm - s.cg.target.ptrBitWidth(),
-                .add_size => op.imm + @as(i32, @intCast(op.base.ref.deref(s).typeOf(s.cg).abiSize(s.cg.pt.zcu))),
-                .sub_size => op.imm - @as(i32, @intCast(op.base.ref.deref(s).typeOf(s.cg).abiSize(s.cg.pt.zcu))),
-                .add_len => op.imm + @as(i32, @intCast(op.base.ref.deref(s).typeOf(s.cg).vectorLen(s.cg.pt.zcu))),
-                .sub_len => op.imm - @as(i32, @intCast(op.base.ref.deref(s).typeOf(s.cg).vectorLen(s.cg.pt.zcu))),
-                .add_elem_limbs => op.imm + @as(i32, @intCast(@divExact(
-                    op.base.ref.deref(s).typeOf(s.cg).scalarType(s.cg.pt.zcu).abiSize(s.cg.pt.zcu),
-                    @divExact(op.base.size.bitSize(s.cg.target), 8),
-                ))),
-                .sub_elem_limbs => op.imm - @as(i32, @intCast(@divExact(
+        /// Returns the operand's effective immediate, computed in `SignedImm`:
+        /// `op.imm + factor * scale.toFactor() * amount`, where the amount is looked up from the
+        /// target, from the type of the operand's base ref, or from the type of `src0`.
+        /// `SignedImm` must be an integer type; the callers in `lower` pass `i32` and `i64`.
+        fn adjustedImm(op: Select.Operand, comptime SignedImm: type, s: *const Select) SignedImm {
+            // Unsigned integer of the same width as `SignedImm`; only needed for `.umax`.
+            const UnsignedImm = @Type(.{
+                .int = .{ .signedness = .unsigned, .bits = @typeInfo(SignedImm).int.bits },
+            });
+            return op.imm + @as(i5, op.adjust.factor) * op.adjust.scale.toFactor() * @as(SignedImm, switch (op.adjust.amount) {
+                .none => 0,
+                .ptr_size => @divExact(s.cg.target.ptrBitWidth(), 8),
+                .ptr_bit_size => s.cg.target.ptrBitWidth(),
+                .size => @intCast(op.base.ref.deref(s).typeOf(s.cg).abiSize(s.cg.pt.zcu)),
+                .src0_size => @intCast(Select.Operand.Ref.src0.deref(s).typeOf(s.cg).abiSize(s.cg.pt.zcu)),
+                .bit_size => @intCast(op.base.ref.deref(s).typeOf(s.cg).scalarType(s.cg.pt.zcu).bitSize(s.cg.pt.zcu)),
+                .src0_bit_size => @intCast(Select.Operand.Ref.src0.deref(s).typeOf(s.cg).scalarType(s.cg.pt.zcu).bitSize(s.cg.pt.zcu)),
+                .len => @intCast(op.base.ref.deref(s).typeOf(s.cg).vectorLen(s.cg.pt.zcu)),
+                // Number of `base.size`-wide limbs in one element: scalar ABI size / limb byte size.
+                .elem_limbs => @intCast(@divExact(
                     op.base.ref.deref(s).typeOf(s.cg).scalarType(s.cg.pt.zcu).abiSize(s.cg.pt.zcu),
                     @divExact(op.base.size.bitSize(s.cg.target), 8),
-                ))),
-            };
+                )),
+                .src0_elem_size => @intCast(Select.Operand.Ref.src0.deref(s).typeOf(s.cg).scalarType(s.cg.pt.zcu).abiSize(s.cg.pt.zcu)),
+                // The three limits below shift the extreme value of the immediate type right by
+                // the wrapping-negated scalar bit size, truncated to the shift-amount type.
+                // NOTE(review): for bit sizes no wider than the immediate type this looks like
+                // the min/max value of an integer of that bit size - confirm.
+                .smin => @as(SignedImm, std.math.minInt(SignedImm)) >> @truncate(
+                    -%op.base.ref.deref(s).typeOf(s.cg).scalarType(s.cg.pt.zcu).bitSize(s.cg.pt.zcu),
+                ),
+                .smax => @as(SignedImm, std.math.maxInt(SignedImm)) >> @truncate(
+                    -%op.base.ref.deref(s).typeOf(s.cg).scalarType(s.cg.pt.zcu).bitSize(s.cg.pt.zcu),
+                ),
+                .umax => @bitCast(@as(UnsignedImm, std.math.maxInt(UnsignedImm)) >> @truncate(
+                    -%op.base.ref.deref(s).typeOf(s.cg).scalarType(s.cg.pt.zcu).bitSize(s.cg.pt.zcu),
+                )),
+            });
         }
 
         fn lower(op: Select.Operand, s: *Select) !CodeGen.Operand {
@@ -24907,7 +28749,8 @@ const Select = struct {
                     else => |mcv| .{ .mem = try mcv.mem(s.cg, .{ .size = op.base.size }) },
                     .register => |reg| .{ .reg = registerAlias(reg, @intCast(@divExact(op.base.size.bitSize(s.cg.target), 8))) },
                 },
-                .simm => .{ .imm = .s(op.adjustedImm(s)) },
+                .simm => .{ .imm = .s(op.adjustedImm(i32, s)) },
+                .uimm => .{ .imm = .u(@bitCast(op.adjustedImm(i64, s))) },
                 .lea => .{ .mem = .{
                     .base = .{ .reg = registerAlias(op.base.ref.deref(s).tracking(s.cg).short.register, @divExact(s.cg.target.ptrBitWidth(), 8)) },
                     .mod = .{ .rm = .{
@@ -24917,7 +28760,7 @@ const Select = struct {
                             .none => .none,
                         },
                         .scale = op.index.scale,
-                        .disp = op.adjustedImm(s),
+                        .disp = op.adjustedImm(i32, s),
                     } },
                 } },
                 .mem => .{ .mem = try op.base.ref.deref(s).tracking(s.cg).short.mem(s.cg, .{
@@ -24927,7 +28770,7 @@ const Select = struct {
                         .none => .none,
                     },
                     .scale = op.index.scale,
-                    .disp = op.adjustedImm(s),
+                    .disp = op.adjustedImm(i32, s),
                 }) },
             };
         }
@@ -24942,14 +28785,23 @@ fn select(
 ) !void {
     cases: for (cases) |case| {
         for (case.required_features) |required_feature| if (required_feature) |feature| if (!switch (feature) {
-            .@"64bit" => cg.target.ptrBitWidth() == 64,
+            .@"64bit" => switch (cg.target.cpu.arch) {
+                else => unreachable,
+                .x86 => false,
+                .x86_64 => true,
+            },
             .mmx => false,
             else => cg.hasFeature(feature),
         }) continue :cases;
         for (case.dst_constraints[0..dst_temps.len], dst_tys) |dst_constraint, dst_ty| if (!dst_constraint.accepts(dst_ty, cg)) continue :cases;
         for (case.src_constraints[0..src_temps.len], src_temps) |src_constraint, src_temp| if (!src_constraint.accepts(src_temp.typeOf(cg), cg)) continue :cases;
+        if (std.debug.runtime_safety) {
+            for (case.dst_constraints[dst_temps.len..]) |dst_constraint| assert(dst_constraint == .any);
+            for (case.src_constraints[src_temps.len..]) |src_constraint| assert(src_constraint == .any);
+        }
         patterns: for (case.patterns) |pattern| {
-            for (pattern.src, src_temps) |src_pattern, src_temp| if (!src_pattern.matches(src_temp, cg)) continue :patterns;
+            for (pattern.src[0..src_temps.len], src_temps) |src_pattern, src_temp| if (!src_pattern.matches(src_temp, cg)) continue :patterns;
+            if (std.debug.runtime_safety) for (pattern.src[src_temps.len..]) |src_pattern| assert(src_pattern == .none);
 
             var s: Select = .{
                 .cg = cg,
@@ -24960,9 +28812,11 @@ fn select(
             const dst_slots = s.temps[@intFromEnum(Select.Operand.Ref.dst0)..@intFromEnum(Select.Operand.Ref.src0)];
             const src_slots = s.temps[@intFromEnum(Select.Operand.Ref.src0)..@intFromEnum(Select.Operand.Ref.none)];
 
+            @memcpy(src_slots[0..src_temps.len], src_temps);
+            std.mem.swap(Temp, &src_slots[pattern.commute[0]], &src_slots[pattern.commute[1]]);
             for (tmp_slots, case.extra_temps) |*slot, spec| slot.* = try spec.create(&s) orelse continue;
 
-            while (true) for (pattern.src, src_temps) |src_pattern, *src_temp| {
+            while (true) for (pattern.src[0..src_temps.len], src_temps) |src_pattern, *src_temp| {
                 if (try src_pattern.convert(src_temp, cg)) break;
             } else break;
             @memcpy(src_slots[0..src_temps.len], src_temps);
src/arch/x86_64/Encoding.zig
@@ -64,7 +64,7 @@ pub fn findByMnemonic(
                 comptime var feature_it = std.mem.splitScalar(u8, @tagName(tag), ' ');
                 comptime var features: []const std.Target.x86.Feature = &.{};
                 inline while (comptime feature_it.next()) |feature| features = features ++ .{@field(std.Target.x86.Feature, feature)};
-                break :has_features std.Target.x86.featureSetHasAll(target.cpu.features, features[0..features.len].*);
+                break :has_features std.Target.x86.featureSetHasAll(target.cpu.features, features[0..].*);
             },
         }) continue;
 
@@ -250,7 +250,8 @@ pub const Mnemonic = enum {
     // General-purpose
     adc, add, @"and",
     bsf, bsr, bswap, bt, btc, btr, bts,
-    call, cbw, cdq, cdqe, clflush,
+    call, cbw, cdq, cdqe,
+    clac, clc, cld, clflush, cli, clts, clui,
     cmova, cmovae, cmovb, cmovbe, cmovc, cmove, cmovg, cmovge, cmovl, cmovle, cmovna,
     cmovnae, cmovnb, cmovnbe, cmovnc, cmovne, cmovng, cmovnge, cmovnl, cmovnle, cmovno,
     cmovnp, cmovns, cmovnz, cmovo, cmovp, cmovpe, cmovpo, cmovs, cmovz,
@@ -274,7 +275,9 @@ pub const Mnemonic = enum {
     rcl, rcr, ret, rol, ror, rorx,
     sal, sar, sarx, sbb,
     scas, scasb, scasd, scasq, scasw,
-    shl, shld, shlx, shr, shrd, shrx, sub, syscall,
+    shl, shld, shlx, shr, shrd, shrx,
+    stac, stc, std, sti, stui,
+    sub, syscall,
     seta, setae, setb, setbe, setc, sete, setg, setge, setl, setle, setna, setnae,
     setnb, setnbe, setnc, setne, setng, setnge, setnl, setnle, setno, setnp, setns,
     setnz, seto, setp, setpe, setpo, sets, setz,
@@ -307,7 +310,7 @@ pub const Mnemonic = enum {
     ldmxcsr,
     maxps, maxss,
     minps, minss,
-    movaps, movhlps, movlhps,
+    movaps, movhlps, movhps, movlhps, movlps,
     movmskps,
     movss, movups,
     mulps, mulss,
@@ -333,6 +336,7 @@ pub const Mnemonic = enum {
     minpd, minsd,
     movapd,
     movdqa, movdqu,
+    movhpd, movlpd,
     movmskpd,
     //movsd,
     movupd,
@@ -395,7 +399,7 @@ pub const Mnemonic = enum {
     vmovd,
     vmovddup,
     vmovdqa, vmovdqu,
-    vmovhlps, vmovlhps,
+    vmovhlps, vmovhpd, vmovhps, vmovlhps, vmovlpd, vmovlps,
     vmovmskpd, vmovmskps,
     vmovq,
     vmovsd,
@@ -823,6 +827,7 @@ pub const Feature = enum {
     avx2,
     bmi,
     bmi2,
+    cmov,
     f16c,
     fma,
     lzcnt,
@@ -830,6 +835,7 @@ pub const Feature = enum {
     pclmul,
     @"pclmul avx",
     popcnt,
+    smap,
     sse,
     sse2,
     sse3,
@@ -837,6 +843,7 @@ pub const Feature = enum {
     sse4_2,
     ssse3,
     sha,
+    uintr,
     vaes,
     vpclmulqdq,
     x87,
src/arch/x86_64/encodings.zig
@@ -132,98 +132,110 @@ pub const table = [_]Entry{
     .{ .cdq, .zo, &.{ .o32 }, &.{ 0x99 }, 0, .none,  .none },
     .{ .cqo, .zo, &.{ .o64 }, &.{ 0x99 }, 0, .long,  .none },
 
+    .{ .clac, .zo, &.{}, &.{ 0x0f, 0x01, 0xca }, 0, .none, .smap },
+
+    .{ .clc, .zo, &.{}, &.{ 0xf8 }, 0, .none, .none },
+
+    .{ .cld, .zo, &.{}, &.{ 0xfc }, 0, .none, .none },
+
     .{ .clflush, .m, &.{ .m8 }, &.{ 0x0f, 0xae }, 7, .none, .none },
 
-    .{ .cmova,   .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x47 }, 0, .short, .none },
-    .{ .cmova,   .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x47 }, 0, .none,  .none },
-    .{ .cmova,   .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x47 }, 0, .long,  .none },
-    .{ .cmovae,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x43 }, 0, .short, .none },
-    .{ .cmovae,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x43 }, 0, .none,  .none },
-    .{ .cmovae,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x43 }, 0, .long,  .none },
-    .{ .cmovb,   .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x42 }, 0, .short, .none },
-    .{ .cmovb,   .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x42 }, 0, .none,  .none },
-    .{ .cmovb,   .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x42 }, 0, .long,  .none },
-    .{ .cmovbe,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x46 }, 0, .short, .none },
-    .{ .cmovbe,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x46 }, 0, .none,  .none },
-    .{ .cmovbe,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x46 }, 0, .long,  .none },
-    .{ .cmovc,   .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x42 }, 0, .short, .none },
-    .{ .cmovc,   .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x42 }, 0, .none,  .none },
-    .{ .cmovc,   .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x42 }, 0, .long,  .none },
-    .{ .cmove,   .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x44 }, 0, .short, .none },
-    .{ .cmove,   .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x44 }, 0, .none,  .none },
-    .{ .cmove,   .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x44 }, 0, .long,  .none },
-    .{ .cmovg,   .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4f }, 0, .short, .none },
-    .{ .cmovg,   .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4f }, 0, .none,  .none },
-    .{ .cmovg,   .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4f }, 0, .long,  .none },
-    .{ .cmovge,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4d }, 0, .short, .none },
-    .{ .cmovge,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4d }, 0, .none,  .none },
-    .{ .cmovge,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4d }, 0, .long,  .none },
-    .{ .cmovl,   .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4c }, 0, .short, .none },
-    .{ .cmovl,   .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4c }, 0, .none,  .none },
-    .{ .cmovl,   .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4c }, 0, .long,  .none },
-    .{ .cmovle,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4e }, 0, .short, .none },
-    .{ .cmovle,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4e }, 0, .none,  .none },
-    .{ .cmovle,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4e }, 0, .long,  .none },
-    .{ .cmovna,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x46 }, 0, .short, .none },
-    .{ .cmovna,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x46 }, 0, .none,  .none },
-    .{ .cmovna,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x46 }, 0, .long,  .none },
-    .{ .cmovnae, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x42 }, 0, .short, .none },
-    .{ .cmovnae, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x42 }, 0, .none,  .none },
-    .{ .cmovnae, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x42 }, 0, .long,  .none },
-    .{ .cmovnb,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x43 }, 0, .short, .none },
-    .{ .cmovnb,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x43 }, 0, .none,  .none },
-    .{ .cmovnb,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x43 }, 0, .long,  .none },
-    .{ .cmovnbe, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x47 }, 0, .short, .none },
-    .{ .cmovnbe, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x47 }, 0, .none,  .none },
-    .{ .cmovnbe, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x47 }, 0, .long,  .none },
-    .{ .cmovnc,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x43 }, 0, .short, .none },
-    .{ .cmovnc,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x43 }, 0, .none,  .none },
-    .{ .cmovnc,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x43 }, 0, .long,  .none },
-    .{ .cmovne,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x45 }, 0, .short, .none },
-    .{ .cmovne,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x45 }, 0, .none,  .none },
-    .{ .cmovne,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x45 }, 0, .long,  .none },
-    .{ .cmovng,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4e }, 0, .short, .none },
-    .{ .cmovng,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4e }, 0, .none,  .none },
-    .{ .cmovng,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4e }, 0, .long,  .none },
-    .{ .cmovnge, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4c }, 0, .short, .none },
-    .{ .cmovnge, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4c }, 0, .none,  .none },
-    .{ .cmovnge, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4c }, 0, .long,  .none },
-    .{ .cmovnl,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4d }, 0, .short, .none },
-    .{ .cmovnl,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4d }, 0, .none,  .none },
-    .{ .cmovnl,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4d }, 0, .long,  .none },
-    .{ .cmovnle, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4f }, 0, .short, .none },
-    .{ .cmovnle, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4f }, 0, .none,  .none },
-    .{ .cmovnle, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4f }, 0, .long,  .none },
-    .{ .cmovno,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x41 }, 0, .short, .none },
-    .{ .cmovno,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x41 }, 0, .none,  .none },
-    .{ .cmovno,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x41 }, 0, .long,  .none },
-    .{ .cmovnp,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4b }, 0, .short, .none },
-    .{ .cmovnp,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4b }, 0, .none,  .none },
-    .{ .cmovnp,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4b }, 0, .long,  .none },
-    .{ .cmovns,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x49 }, 0, .short, .none },
-    .{ .cmovns,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x49 }, 0, .none,  .none },
-    .{ .cmovns,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x49 }, 0, .long,  .none },
-    .{ .cmovnz,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x45 }, 0, .short, .none },
-    .{ .cmovnz,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x45 }, 0, .none,  .none },
-    .{ .cmovnz,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x45 }, 0, .long,  .none },
-    .{ .cmovo,   .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x40 }, 0, .short, .none },
-    .{ .cmovo,   .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x40 }, 0, .none,  .none },
-    .{ .cmovo,   .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x40 }, 0, .long,  .none },
-    .{ .cmovp,   .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4a }, 0, .short, .none },
-    .{ .cmovp,   .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4a }, 0, .none,  .none },
-    .{ .cmovp,   .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4a }, 0, .long,  .none },
-    .{ .cmovpe,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4a }, 0, .short, .none },
-    .{ .cmovpe,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4a }, 0, .none,  .none },
-    .{ .cmovpe,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4a }, 0, .long,  .none },
-    .{ .cmovpo,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4b }, 0, .short, .none },
-    .{ .cmovpo,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4b }, 0, .none,  .none },
-    .{ .cmovpo,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4b }, 0, .long,  .none },
-    .{ .cmovs,   .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x48 }, 0, .short, .none },
-    .{ .cmovs,   .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x48 }, 0, .none,  .none },
-    .{ .cmovs,   .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x48 }, 0, .long,  .none },
-    .{ .cmovz,   .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x44 }, 0, .short, .none },
-    .{ .cmovz,   .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x44 }, 0, .none,  .none },
-    .{ .cmovz,   .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x44 }, 0, .long,  .none },
+    .{ .cli, .zo, &.{}, &.{ 0xfa }, 0, .none, .none },
+
+    .{ .clts, .zo, &.{}, &.{ 0x0f, 0x06 }, 0, .none, .none },
+
+    .{ .clui, .zo, &.{}, &.{ 0xf3, 0x0f, 0x01, 0xee }, 0, .none, .uintr },
+
+    .{ .cmova,   .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x47 }, 0, .short, .cmov },
+    .{ .cmova,   .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x47 }, 0, .none,  .cmov },
+    .{ .cmova,   .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x47 }, 0, .long,  .cmov },
+    .{ .cmovae,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x43 }, 0, .short, .cmov },
+    .{ .cmovae,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x43 }, 0, .none,  .cmov },
+    .{ .cmovae,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x43 }, 0, .long,  .cmov },
+    .{ .cmovb,   .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x42 }, 0, .short, .cmov },
+    .{ .cmovb,   .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x42 }, 0, .none,  .cmov },
+    .{ .cmovb,   .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x42 }, 0, .long,  .cmov },
+    .{ .cmovbe,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x46 }, 0, .short, .cmov },
+    .{ .cmovbe,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x46 }, 0, .none,  .cmov },
+    .{ .cmovbe,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x46 }, 0, .long,  .cmov },
+    .{ .cmovc,   .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x42 }, 0, .short, .cmov },
+    .{ .cmovc,   .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x42 }, 0, .none,  .cmov },
+    .{ .cmovc,   .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x42 }, 0, .long,  .cmov },
+    .{ .cmove,   .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x44 }, 0, .short, .cmov },
+    .{ .cmove,   .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x44 }, 0, .none,  .cmov },
+    .{ .cmove,   .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x44 }, 0, .long,  .cmov },
+    .{ .cmovg,   .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4f }, 0, .short, .cmov },
+    .{ .cmovg,   .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4f }, 0, .none,  .cmov },
+    .{ .cmovg,   .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4f }, 0, .long,  .cmov },
+    .{ .cmovge,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4d }, 0, .short, .cmov },
+    .{ .cmovge,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4d }, 0, .none,  .cmov },
+    .{ .cmovge,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4d }, 0, .long,  .cmov },
+    .{ .cmovl,   .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4c }, 0, .short, .cmov },
+    .{ .cmovl,   .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4c }, 0, .none,  .cmov },
+    .{ .cmovl,   .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4c }, 0, .long,  .cmov },
+    .{ .cmovle,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4e }, 0, .short, .cmov },
+    .{ .cmovle,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4e }, 0, .none,  .cmov },
+    .{ .cmovle,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4e }, 0, .long,  .cmov },
+    .{ .cmovna,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x46 }, 0, .short, .cmov },
+    .{ .cmovna,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x46 }, 0, .none,  .cmov },
+    .{ .cmovna,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x46 }, 0, .long,  .cmov },
+    .{ .cmovnae, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x42 }, 0, .short, .cmov },
+    .{ .cmovnae, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x42 }, 0, .none,  .cmov },
+    .{ .cmovnae, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x42 }, 0, .long,  .cmov },
+    .{ .cmovnb,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x43 }, 0, .short, .cmov },
+    .{ .cmovnb,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x43 }, 0, .none,  .cmov },
+    .{ .cmovnb,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x43 }, 0, .long,  .cmov },
+    .{ .cmovnbe, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x47 }, 0, .short, .cmov },
+    .{ .cmovnbe, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x47 }, 0, .none,  .cmov },
+    .{ .cmovnbe, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x47 }, 0, .long,  .cmov },
+    .{ .cmovnc,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x43 }, 0, .short, .cmov },
+    .{ .cmovnc,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x43 }, 0, .none,  .cmov },
+    .{ .cmovnc,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x43 }, 0, .long,  .cmov },
+    .{ .cmovne,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x45 }, 0, .short, .cmov },
+    .{ .cmovne,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x45 }, 0, .none,  .cmov },
+    .{ .cmovne,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x45 }, 0, .long,  .cmov },
+    .{ .cmovng,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4e }, 0, .short, .cmov },
+    .{ .cmovng,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4e }, 0, .none,  .cmov },
+    .{ .cmovng,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4e }, 0, .long,  .cmov },
+    .{ .cmovnge, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4c }, 0, .short, .cmov },
+    .{ .cmovnge, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4c }, 0, .none,  .cmov },
+    .{ .cmovnge, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4c }, 0, .long,  .cmov },
+    .{ .cmovnl,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4d }, 0, .short, .cmov },
+    .{ .cmovnl,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4d }, 0, .none,  .cmov },
+    .{ .cmovnl,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4d }, 0, .long,  .cmov },
+    .{ .cmovnle, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4f }, 0, .short, .cmov },
+    .{ .cmovnle, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4f }, 0, .none,  .cmov },
+    .{ .cmovnle, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4f }, 0, .long,  .cmov },
+    .{ .cmovno,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x41 }, 0, .short, .cmov },
+    .{ .cmovno,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x41 }, 0, .none,  .cmov },
+    .{ .cmovno,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x41 }, 0, .long,  .cmov },
+    .{ .cmovnp,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4b }, 0, .short, .cmov },
+    .{ .cmovnp,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4b }, 0, .none,  .cmov },
+    .{ .cmovnp,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4b }, 0, .long,  .cmov },
+    .{ .cmovns,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x49 }, 0, .short, .cmov },
+    .{ .cmovns,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x49 }, 0, .none,  .cmov },
+    .{ .cmovns,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x49 }, 0, .long,  .cmov },
+    .{ .cmovnz,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x45 }, 0, .short, .cmov },
+    .{ .cmovnz,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x45 }, 0, .none,  .cmov },
+    .{ .cmovnz,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x45 }, 0, .long,  .cmov },
+    .{ .cmovo,   .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x40 }, 0, .short, .cmov },
+    .{ .cmovo,   .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x40 }, 0, .none,  .cmov },
+    .{ .cmovo,   .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x40 }, 0, .long,  .cmov },
+    .{ .cmovp,   .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4a }, 0, .short, .cmov },
+    .{ .cmovp,   .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4a }, 0, .none,  .cmov },
+    .{ .cmovp,   .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4a }, 0, .long,  .cmov },
+    .{ .cmovpe,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4a }, 0, .short, .cmov },
+    .{ .cmovpe,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4a }, 0, .none,  .cmov },
+    .{ .cmovpe,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4a }, 0, .long,  .cmov },
+    .{ .cmovpo,  .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4b }, 0, .short, .cmov },
+    .{ .cmovpo,  .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4b }, 0, .none,  .cmov },
+    .{ .cmovpo,  .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4b }, 0, .long,  .cmov },
+    .{ .cmovs,   .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x48 }, 0, .short, .cmov },
+    .{ .cmovs,   .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x48 }, 0, .none,  .cmov },
+    .{ .cmovs,   .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x48 }, 0, .long,  .cmov },
+    .{ .cmovz,   .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x44 }, 0, .short, .cmov },
+    .{ .cmovz,   .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x44 }, 0, .none,  .cmov },
+    .{ .cmovz,   .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x44 }, 0, .long,  .cmov },
 
     .{ .cmp, .zi, &.{ .al,   .imm8   }, &.{ 0x3c }, 0, .none,  .none },
     .{ .cmp, .zi, &.{ .ax,   .imm16  }, &.{ 0x3d }, 0, .short, .none },
@@ -747,6 +759,16 @@ pub const table = [_]Entry{
     .{ .shrd, .mrc, &.{ .rm32, .r32, .cl   }, &.{ 0x0f, 0xad }, 0, .none,  .none },
     .{ .shrd, .mrc, &.{ .rm64, .r64, .cl   }, &.{ 0x0f, 0xad }, 0, .long,  .none },
 
+    .{ .stac, .zo, &.{}, &.{ 0x0f, 0x01, 0xcb }, 0, .none, .smap },
+
+    .{ .stc, .zo, &.{}, &.{ 0xf9 }, 0, .none, .none },
+
+    .{ .std, .zo, &.{}, &.{ 0xfd }, 0, .none, .none },
+
+    .{ .sti, .zo, &.{}, &.{ 0xfb }, 0, .none, .none },
+
+    .{ .stui, .zo, &.{}, &.{ 0xf3, 0x0f, 0x01, 0xef }, 0, .none, .uintr },
+
     .{ .stos,  .zo, &.{ .m8  }, &.{ 0xaa }, 0, .none,  .none },
     .{ .stos,  .zo, &.{ .m16 }, &.{ 0xab }, 0, .short, .none },
     .{ .stos,  .zo, &.{ .m32 }, &.{ 0xab }, 0, .none,  .none },
@@ -927,8 +949,14 @@ pub const table = [_]Entry{
 
     .{ .movhlps, .rm, &.{ .xmm, .xmm }, &.{ 0x0f, 0x12 }, 0, .none, .sse },
 
+    .{ .movhps, .rm, &.{ .xmm, .m64 }, &.{ 0x0f, 0x16 }, 0, .none, .sse },
+    .{ .movhps, .mr, &.{ .m64, .xmm }, &.{ 0x0f, 0x17 }, 0, .none, .sse },
+
     .{ .movlhps, .rm, &.{ .xmm, .xmm }, &.{ 0x0f, 0x16 }, 0, .none, .sse },
 
+    .{ .movlps, .rm, &.{ .xmm, .m64 }, &.{ 0x0f, 0x12 }, 0, .none, .sse },
+    .{ .movlps, .mr, &.{ .m64, .xmm }, &.{ 0x0f, 0x13 }, 0, .none, .sse },
+
     .{ .movmskps, .rm, &.{ .r32, .xmm }, &.{ 0x0f, 0x50 }, 0, .none, .sse },
     .{ .movmskps, .rm, &.{ .r64, .xmm }, &.{ 0x0f, 0x50 }, 0, .none, .sse },
 
@@ -1037,6 +1065,12 @@ pub const table = [_]Entry{
     .{ .movdqu, .rm, &.{ .xmm,      .xmm_m128 }, &.{ 0xf3, 0x0f, 0x6f }, 0, .none, .sse2 },
     .{ .movdqu, .mr, &.{ .xmm_m128, .xmm      }, &.{ 0xf3, 0x0f, 0x7f }, 0, .none, .sse2 },
 
+    .{ .movhpd, .rm, &.{ .xmm, .m64 }, &.{ 0x66, 0x0f, 0x16 }, 0, .none, .sse2 },
+    .{ .movhpd, .mr, &.{ .m64, .xmm }, &.{ 0x66, 0x0f, 0x17 }, 0, .none, .sse2 },
+
+    .{ .movlpd, .rm, &.{ .xmm, .m64 }, &.{ 0x66, 0x0f, 0x12 }, 0, .none, .sse2 },
+    .{ .movlpd, .mr, &.{ .m64, .xmm }, &.{ 0x66, 0x0f, 0x13 }, 0, .none, .sse2 },
+
     .{ .movmskpd, .rm, &.{ .r32, .xmm }, &.{ 0x66, 0x0f, 0x50 }, 0, .none, .sse2 },
     .{ .movmskpd, .rm, &.{ .r64, .xmm }, &.{ 0x66, 0x0f, 0x50 }, 0, .none, .sse2 },
 
@@ -1486,8 +1520,20 @@ pub const table = [_]Entry{
 
     .{ .vmovhlps, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0x0f, 0x12 }, 0, .vex_128_wig, .avx },
 
+    .{ .vmovhpd, .rvm, &.{ .xmm, .xmm, .m64 }, &.{ 0x66, 0x0f, 0x16 }, 0, .vex_128_wig, .avx },
+    .{ .vmovhpd, .mr,  &.{ .m64, .xmm       }, &.{ 0x66, 0x0f, 0x17 }, 0, .vex_128_wig, .avx },
+
+    .{ .vmovhps, .rvm, &.{ .xmm, .xmm, .m64 }, &.{ 0x0f, 0x16 }, 0, .vex_128_wig, .avx },
+    .{ .vmovhps, .mr,  &.{ .m64, .xmm       }, &.{ 0x0f, 0x17 }, 0, .vex_128_wig, .avx },
+
     .{ .vmovlhps, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0x0f, 0x16 }, 0, .vex_128_wig, .avx },
 
+    .{ .vmovlpd, .rvm, &.{ .xmm, .xmm, .m64 }, &.{ 0x66, 0x0f, 0x12 }, 0, .vex_128_wig, .avx },
+    .{ .vmovlpd, .mr,  &.{ .m64, .xmm       }, &.{ 0x66, 0x0f, 0x13 }, 0, .vex_128_wig, .avx },
+
+    .{ .vmovlps, .rvm, &.{ .xmm, .xmm, .m64 }, &.{ 0x0f, 0x12 }, 0, .vex_128_wig, .avx },
+    .{ .vmovlps, .mr,  &.{ .m64, .xmm       }, &.{ 0x0f, 0x13 }, 0, .vex_128_wig, .avx },
+
     .{ .vmovq, .rm, &.{ .xmm,     .xmm_m64 }, &.{ 0xf3, 0x0f, 0x7e }, 0, .vex_128_wig, .avx },
     .{ .vmovq, .mr, &.{ .xmm_m64, .xmm     }, &.{ 0x66, 0x0f, 0xd6 }, 0, .vex_128_wig, .avx },
 
@@ -1583,14 +1629,14 @@ pub const table = [_]Entry{
     .{ .vpextrd, .mri, &.{ .rm32,   .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .vex_128_w0, .avx },
     .{ .vpextrq, .mri, &.{ .rm64,   .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .vex_128_w1, .avx },
 
-    .{ .vpextrw, .rmi, &.{ .r32,     .xmm, .imm8 }, &.{ 0x66, 0x0f,       0x15 }, 0, .vex_128_wig, .avx },
-    .{ .vpextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .vex_128_wig, .avx },
+    .{ .vpextrw, .rmi, &.{ .r32,     .xmm, .imm8 }, &.{ 0x66, 0x0f,       0xc5 }, 0, .vex_128_w0, .avx },
+    .{ .vpextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .vex_128_w0, .avx },
 
-    .{ .vpinsrb, .rmi, &.{ .xmm, .r32_m8, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x20 }, 0, .vex_128_w0, .avx },
-    .{ .vpinsrd, .rmi, &.{ .xmm, .rm32,   .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .vex_128_w0, .avx },
-    .{ .vpinsrq, .rmi, &.{ .xmm, .rm64,   .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .vex_128_w1, .avx },
+    .{ .vpinsrb, .rvmi, &.{ .xmm, .xmm, .r32_m8, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x20 }, 0, .vex_128_w0, .avx },
+    .{ .vpinsrd, .rvmi, &.{ .xmm, .xmm, .rm32,   .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .vex_128_w0, .avx },
+    .{ .vpinsrq, .rvmi, &.{ .xmm, .xmm, .rm64,   .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .vex_128_w1, .avx },
 
-    .{ .vpinsrw, .rvmi, &.{ .xmm, .xmm, .r32_m16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .vex_128_wig, .avx },
+    .{ .vpinsrw, .rvmi, &.{ .xmm, .xmm, .r32_m16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .vex_128_w0, .avx },
 
     .{ .vpmaxsb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3c }, 0, .vex_128_wig, .avx },
     .{ .vpmaxsw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f,       0xee }, 0, .vex_128_wig, .avx },
src/arch/x86_64/Lower.zig
@@ -418,8 +418,7 @@ fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand)
                                 // Here, we currently assume local dynamic TLS vars, and so
                                 // we emit LD model.
                                 _ = lower.reloc(.{ .linker_tlsld = sym_index }, 0);
-                                lower.result_insts[lower.result_insts_len] =
-                                    try Instruction.new(.none, .lea, &[_]Operand{
+                                lower.result_insts[lower.result_insts_len] = try .new(.none, .lea, &.{
                                     .{ .reg = .rdi },
                                     .{ .mem = Memory.initRip(mem_op.sib.ptr_size, 0) },
                                 }, lower.target);
@@ -427,8 +426,7 @@ fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand)
                                 _ = lower.reloc(.{
                                     .linker_extern_fn = try elf_file.getGlobalSymbol("__tls_get_addr", null),
                                 }, 0);
-                                lower.result_insts[lower.result_insts_len] =
-                                    try Instruction.new(.none, .call, &[_]Operand{
+                                lower.result_insts[lower.result_insts_len] = try .new(.none, .call, &.{
                                     .{ .imm = .s(0) },
                                 }, lower.target);
                                 lower.result_insts_len += 1;
@@ -440,8 +438,7 @@ fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand)
                                 }) };
                             } else {
                                 // Since we are linking statically, we emit LE model directly.
-                                lower.result_insts[lower.result_insts_len] =
-                                    try Instruction.new(.none, .mov, &[_]Operand{
+                                lower.result_insts[lower.result_insts_len] = try .new(.none, .mov, &.{
                                     .{ .reg = .rax },
                                     .{ .mem = Memory.initSib(.qword, .{ .base = .{ .reg = .fs } }) },
                                 }, lower.target);
@@ -464,8 +461,7 @@ fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand)
                             .mov => {
                                 if (elf_sym.flags.is_extern_ptr) {
                                     const reg = ops[0].reg;
-                                    lower.result_insts[lower.result_insts_len] =
-                                        try Instruction.new(.none, .mov, &[_]Operand{
+                                    lower.result_insts[lower.result_insts_len] = try .new(.none, .mov, &.{
                                         .{ .reg = reg.to64() },
                                         .{ .mem = Memory.initRip(.qword, 0) },
                                     }, lower.target);
@@ -496,16 +492,14 @@ fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand)
 
                         if (macho_sym.flags.tlv) {
                             _ = lower.reloc(.{ .linker_reloc = sym_index }, 0);
-                            lower.result_insts[lower.result_insts_len] =
-                                try Instruction.new(.none, .mov, &[_]Operand{
+                            lower.result_insts[lower.result_insts_len] = try .new(.none, .mov, &.{
                                 .{ .reg = .rdi },
                                 .{ .mem = Memory.initRip(mem_op.sib.ptr_size, 0) },
-                            });
+                            }, lower.target);
                             lower.result_insts_len += 1;
-                            lower.result_insts[lower.result_insts_len] =
-                                try Instruction.new(.none, .call, &[_]Operand{
+                            lower.result_insts[lower.result_insts_len] = try .new(.none, .call, &.{
                                 .{ .mem = Memory.initSib(.qword, .{ .base = .{ .reg = .rdi } }) },
-                            });
+                            }, lower.target);
                             lower.result_insts_len += 1;
                             emit_mnemonic = .mov;
                             break :op .{ .reg = .rax };
@@ -520,11 +514,10 @@ fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand)
                             .mov => {
                                 if (macho_sym.flags.is_extern_ptr) {
                                     const reg = ops[0].reg;
-                                    lower.result_insts[lower.result_insts_len] =
-                                        try Instruction.new(.none, .mov, &[_]Operand{
+                                    lower.result_insts[lower.result_insts_len] = try .new(.none, .mov, &.{
                                         .{ .reg = reg.to64() },
                                         .{ .mem = Memory.initRip(.qword, 0) },
-                                    });
+                                    }, lower.target);
                                     lower.result_insts_len += 1;
                                     break :op .{ .mem = Memory.initSib(mem_op.sib.ptr_size, .{ .base = .{
                                         .reg = reg.to64(),
@@ -541,8 +534,7 @@ fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand)
             },
         };
     }
-    lower.result_insts[lower.result_insts_len] =
-        try Instruction.new(emit_prefix, emit_mnemonic, emit_ops, lower.target);
+    lower.result_insts[lower.result_insts_len] = try .new(emit_prefix, emit_mnemonic, emit_ops, lower.target);
     lower.result_insts_len += 1;
 }
 
src/arch/x86_64/Mir.zig
@@ -38,6 +38,11 @@ pub const Inst = struct {
         /// ___ Right Without Affecting Flags
         _rx,
 
+        /// ___ Forward
+        _f,
+        /// ___ Reverse
+        //_r,
+
         /// ___ Above
         _a,
         /// ___ Above Or Equal
@@ -47,6 +52,7 @@ pub const Inst = struct {
         /// ___ Below Or Equal
         _be,
         /// ___ Carry
+        /// ___ Carry Flag
         _c,
         /// ___ Equal
         _e,
@@ -98,6 +104,14 @@ pub const Inst = struct {
         _s,
         /// ___ Zero
         _z,
+        /// ___ Alignment Check Flag
+        _ac,
+        /// ___ Direction Flag
+        //_d,
+        /// ___ Interrupt Flag
+        _i,
+        /// ___ User Interrupt Flag
+        _ui,
 
         /// ___ Byte
         //_b,
@@ -299,9 +313,8 @@ pub const Inst = struct {
         /// Bitwise logical and of packed double-precision floating-point values
         @"and",
         /// Bit scan forward
-        bsf,
         /// Bit scan reverse
-        bsr,
+        bs,
         /// Byte swap
         bswap,
         /// Bit test
@@ -317,6 +330,10 @@ pub const Inst = struct {
         cdq,
         /// Convert doubleword to quadword
         cdqe,
+        /// Clear carry flag
+        /// Clear direction flag
+        /// Clear interrupt flag
+        cl,
         /// Flush cache line
         clflush,
         /// Conditional move
@@ -443,6 +460,11 @@ pub const Inst = struct {
         /// Subtract packed double-precision floating-point values
         /// Subtract scalar double-precision floating-point values
         sub,
+        /// Set carry flag
+        /// Set direction flag
+        /// Set interrupt flag
+        /// Store floating-point value
+        st,
         /// Store string
         sto,
         /// Syscall
@@ -478,8 +500,6 @@ pub const Inst = struct {
         ldenv,
         /// Store x87 FPU environment
         nstenv,
-        /// Store floating-point value
-        st,
         /// Store x87 FPU environment
         stenv,
 
@@ -560,8 +580,14 @@ pub const Inst = struct {
         /// Move aligned packed single-precision floating-point values
         /// Move aligned packed double-precision floating-point values
         mova,
+        /// Move high packed single-precision floating-point values
+        /// Move high packed double-precision floating-point values
+        movh,
         /// Move packed single-precision floating-point values high to low
         movhl,
+        /// Move low packed single-precision floating-point values
+        /// Move low packed double-precision floating-point values
+        movl,
         /// Move packed single-precision floating-point values low to high
         movlh,
         /// Move unaligned packed single-precision floating-point values
src/link/Elf/Atom.zig
@@ -1274,19 +1274,19 @@ const x86_64 = struct {
     fn relaxGotpcrelx(code: []u8, t: *const std.Target) !void {
         dev.check(.x86_64_backend);
         const old_inst = disassemble(code) orelse return error.RelaxFailure;
-        const inst = switch (old_inst.encoding.mnemonic) {
-            .call => try Instruction.new(old_inst.prefix, .call, &.{
+        const inst: Instruction = switch (old_inst.encoding.mnemonic) {
+            .call => try .new(old_inst.prefix, .call, &.{
                 // TODO: hack to force imm32s in the assembler
-                .{ .imm = Immediate.s(-129) },
+                .{ .imm = .s(-129) },
             }, t),
-            .jmp => try Instruction.new(old_inst.prefix, .jmp, &.{
+            .jmp => try .new(old_inst.prefix, .jmp, &.{
                 // TODO: hack to force imm32s in the assembler
-                .{ .imm = Immediate.s(-129) },
+                .{ .imm = .s(-129) },
             }, t),
             else => return error.RelaxFailure,
         };
         relocs_log.debug("    relaxing {} => {}", .{ old_inst.encoding, inst.encoding });
-        const nop = try Instruction.new(.none, .nop, &.{}, t);
+        const nop: Instruction = try .new(.none, .nop, &.{}, t);
         try encode(&.{ nop, inst }, code);
     }
 
@@ -1295,7 +1295,7 @@ const x86_64 = struct {
         const old_inst = disassemble(code) orelse return error.RelaxFailure;
         switch (old_inst.encoding.mnemonic) {
             .mov => {
-                const inst = try Instruction.new(old_inst.prefix, .lea, &old_inst.ops, t);
+                const inst: Instruction = try .new(old_inst.prefix, .lea, &old_inst.ops, t);
                 relocs_log.debug("    relaxing {} => {}", .{ old_inst.encoding, inst.encoding });
                 try encode(&.{inst}, code);
             },
@@ -1404,14 +1404,15 @@ const x86_64 = struct {
         dev.check(.x86_64_backend);
         const old_inst = disassemble(code) orelse return false;
         switch (old_inst.encoding.mnemonic) {
-            .mov => if (Instruction.new(old_inst.prefix, .mov, &.{
-                old_inst.ops[0],
-                // TODO: hack to force imm32s in the assembler
-                .{ .imm = Immediate.s(-129) },
-            }, t)) |inst| {
+            .mov => {
+                const inst = Instruction.new(old_inst.prefix, .mov, &.{
+                    old_inst.ops[0],
+                    // TODO: hack to force imm32s in the assembler
+                    .{ .imm = .s(-129) },
+                }, t) catch return false;
                 inst.encode(std.io.null_writer, .{}) catch return false;
                 return true;
-            } else |_| return false,
+            },
             else => return false,
         }
     }
@@ -1424,7 +1425,7 @@ const x86_64 = struct {
                 const inst = Instruction.new(old_inst.prefix, .mov, &.{
                     old_inst.ops[0],
                     // TODO: hack to force imm32s in the assembler
-                    .{ .imm = Immediate.s(-129) },
+                    .{ .imm = .s(-129) },
                 }, t) catch unreachable;
                 relocs_log.debug("    relaxing {} => {}", .{ old_inst.encoding, inst.encoding });
                 encode(&.{inst}, code) catch unreachable;
@@ -1438,10 +1439,10 @@ const x86_64 = struct {
         const old_inst = disassemble(code) orelse return error.RelaxFailure;
         switch (old_inst.encoding.mnemonic) {
             .lea => {
-                const inst = try Instruction.new(old_inst.prefix, .mov, &.{
+                const inst: Instruction = try .new(old_inst.prefix, .mov, &.{
                     old_inst.ops[0],
                     // TODO: hack to force imm32s in the assembler
-                    .{ .imm = Immediate.s(-129) },
+                    .{ .imm = .s(-129) },
                 }, target);
                 relocs_log.debug("    relaxing {} => {}", .{ old_inst.encoding, inst.encoding });
                 try encode(&.{inst}, code);
@@ -1781,7 +1782,7 @@ const aarch64 = struct {
                     const off: u12 = @truncate(@as(u64, @bitCast(S_ + A)));
                     aarch64_util.writeAddImmInst(off, code);
                 } else {
-                    const old_inst = Instruction{
+                    const old_inst: Instruction = .{
                         .add_subtract_immediate = mem.bytesToValue(std.meta.TagPayload(
                             Instruction,
                             Instruction.add_subtract_immediate,
@@ -1795,7 +1796,7 @@ const aarch64 = struct {
             },
 
             .TLSDESC_CALL => if (!target.flags.has_tlsdesc) {
-                const old_inst = Instruction{
+                const old_inst: Instruction = .{
                     .unconditional_branch_register = mem.bytesToValue(std.meta.TagPayload(
                         Instruction,
                         Instruction.unconditional_branch_register,
src/link/MachO/Atom.zig
@@ -640,7 +640,8 @@ fn resolveRelocInner(
     macho_file: *MachO,
     writer: anytype,
 ) ResolveError!void {
-    const cpu_arch = macho_file.getTarget().cpu.arch;
+    const t = &macho_file.base.comp.root_mod.resolved_target.result;
+    const cpu_arch = t.cpu.arch;
     const rel_offset = math.cast(usize, rel.offset - self.off) orelse return error.Overflow;
     const P = @as(i64, @intCast(self.getAddress(macho_file))) + @as(i64, @intCast(rel_offset));
     const A = rel.addend + rel.getRelocAddend(cpu_arch);
@@ -747,7 +748,7 @@ fn resolveRelocInner(
                 const S_: i64 = @intCast(sym.getTlvPtrAddress(macho_file));
                 try writer.writeInt(i32, @intCast(S_ + A - P), .little);
             } else {
-                try x86_64.relaxTlv(code[rel_offset - 3 ..]);
+                try x86_64.relaxTlv(code[rel_offset - 3 ..], t);
                 try writer.writeInt(i32, @intCast(S + A - P), .little);
             }
         },
@@ -893,11 +894,12 @@ fn resolveRelocInner(
 const x86_64 = struct {
     fn relaxGotLoad(self: Atom, code: []u8, rel: Relocation, macho_file: *MachO) ResolveError!void {
         dev.check(.x86_64_backend);
+        const t = &macho_file.base.comp.root_mod.resolved_target.result;
         const diags = &macho_file.base.comp.link_diags;
         const old_inst = disassemble(code) orelse return error.RelaxFail;
         switch (old_inst.encoding.mnemonic) {
             .mov => {
-                const inst = Instruction.new(old_inst.prefix, .lea, &old_inst.ops) catch return error.RelaxFail;
+                const inst = Instruction.new(old_inst.prefix, .lea, &old_inst.ops, t) catch return error.RelaxFail;
                 relocs_log.debug("    relaxing {} => {}", .{ old_inst.encoding, inst.encoding });
                 encode(&.{inst}, code) catch return error.RelaxFail;
             },
@@ -916,12 +918,12 @@ const x86_64 = struct {
         }
     }
 
-    fn relaxTlv(code: []u8) error{RelaxFail}!void {
+    fn relaxTlv(code: []u8, t: *const std.Target) error{RelaxFail}!void {
         dev.check(.x86_64_backend);
         const old_inst = disassemble(code) orelse return error.RelaxFail;
         switch (old_inst.encoding.mnemonic) {
             .mov => {
-                const inst = Instruction.new(old_inst.prefix, .lea, &old_inst.ops) catch return error.RelaxFail;
+                const inst = Instruction.new(old_inst.prefix, .lea, &old_inst.ops, t) catch return error.RelaxFail;
                 relocs_log.debug("    relaxing {} => {}", .{ old_inst.encoding, inst.encoding });
                 encode(&.{inst}, code) catch return error.RelaxFail;
             },
src/dev.zig
@@ -135,6 +135,7 @@ pub const Env = enum {
                 else => Env.ast_gen.supports(feature),
             },
             .@"x86_64-linux" => switch (feature) {
+                .build_command,
                 .stdio_listen,
                 .incremental,
                 .x86_64_backend,
test/behavior/x86_64/build.zig
@@ -0,0 +1,114 @@
+const std = @import("std");
+pub fn build(b: *std.Build) void { // Registers compile+run steps for each test file, once per x86_64 CPU query listed below.
+    const compiler_rt_lib = b.addStaticLibrary(.{ // Standalone compiler_rt archive; only linked into tests that opt out of bundling (see the sse2 check below).
+        .name = "compiler_rt",
+        .use_llvm = false,
+        .use_lld = false,
+        .root_module = b.createModule(.{
+            .root_source_file = b.addWriteFiles().add("compiler_rt.zig", ""), // Generated, empty root source file.
+            .target = b.resolveTargetQuery(.{ .cpu_arch = .x86_64 }),
+        }),
+    });
+    compiler_rt_lib.bundle_compiler_rt = true; // The root file is empty, so presumably the bundled compiler_rt is the archive's only content.
+
+    for ([_]std.Target.Query{ // One test build is produced per query; each varies the CPU model and/or feature set.
+        .{ // x86_64 model with bsf_bsr_0_clobbers_result added.
+            .cpu_arch = .x86_64,
+            .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64 },
+            .cpu_features_add = std.Target.x86.featureSet(&.{.bsf_bsr_0_clobbers_result}),
+            //.cpu_features_sub = std.Target.x86.featureSet(&.{.sse}),
+        },
+        .{ // Same as above, with cmov removed.
+            .cpu_arch = .x86_64,
+            .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64 },
+            .cpu_features_add = std.Target.x86.featureSet(&.{.bsf_bsr_0_clobbers_result}),
+            .cpu_features_sub = std.Target.x86.featureSet(&.{
+                .cmov,
+                //.sse,
+            }),
+        },
+        //.{
+        //    .cpu_arch = .x86_64,
+        //    .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64 },
+        //    .cpu_features_sub = std.Target.x86.featureSet(&.{.sse}),
+        //},
+        .{ // x86_64 model with sse2 removed; this is the configuration the sse2 check below reacts to.
+            .cpu_arch = .x86_64,
+            .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64 },
+            .cpu_features_sub = std.Target.x86.featureSet(&.{.sse2}),
+        },
+        .{ // Unmodified x86_64 model.
+            .cpu_arch = .x86_64,
+            .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64 },
+        },
+        .{ // x86_64 model plus one added SSE-family feature per entry (sse3, ssse3, sse4_1, sse4_2).
+            .cpu_arch = .x86_64,
+            .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64 },
+            .cpu_features_add = std.Target.x86.featureSet(&.{.sse3}),
+        },
+        .{
+            .cpu_arch = .x86_64,
+            .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64 },
+            .cpu_features_add = std.Target.x86.featureSet(&.{.ssse3}),
+        },
+        .{
+            .cpu_arch = .x86_64,
+            .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64 },
+            .cpu_features_add = std.Target.x86.featureSet(&.{.sse4_1}),
+        },
+        .{
+            .cpu_arch = .x86_64,
+            .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64 },
+            .cpu_features_add = std.Target.x86.featureSet(&.{.sse4_2}),
+        },
+        .{ // Unmodified x86_64_v2 model.
+            .cpu_arch = .x86_64,
+            .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64_v2 },
+        },
+        .{ // x86_64_v2 model with avx added.
+            .cpu_arch = .x86_64,
+            .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64_v2 },
+            .cpu_features_add = std.Target.x86.featureSet(&.{.avx}),
+        },
+        .{ // x86_64_v3 model with avx2 removed.
+            .cpu_arch = .x86_64,
+            .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64_v3 },
+            .cpu_features_sub = std.Target.x86.featureSet(&.{.avx2}),
+        },
+        .{ // Unmodified x86_64_v3 model.
+            .cpu_arch = .x86_64,
+            .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64_v3 },
+        },
+        .{ // Unmodified x86_64_v4 model.
+            .cpu_arch = .x86_64,
+            .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64_v4 },
+        },
+    }) |query| {
+        const target = b.resolveTargetQuery(query);
+        const cpu = query.serializeCpuAlloc(b.allocator) catch @panic("OOM"); // CPU description string used to label the steps below; any error here panics.
+        for ([_][]const u8{ // Test source files, given as paths passed to b.path().
+            "math.zig",
+        }) |path| {
+            const test_mod = b.createModule(.{
+                .root_source_file = b.path(path),
+                .target = target,
+            });
+            const test_exe = b.addTest(.{
+                .name = std.fs.path.stem(path), // File name without extension, e.g. "math".
+                .use_llvm = false,
+                .use_lld = false,
+                .root_module = test_mod,
+            });
+            if (!std.Target.x86.featureSetHas(target.result.cpu.features, .sse2)) { // Targets lacking sse2 link the shared archive instead of bundling their own compiler_rt.
+                test_exe.bundle_compiler_rt = false; // NOTE(review): the reason is not visible here; presumably compiler_rt cannot be built for such targets with this backend. Confirm.
+                test_mod.linkLibrary(compiler_rt_lib);
+            }
+            const test_run = b.addRunArtifact(test_exe);
+            b.default_step.dependOn(&test_run.step); // The default step therefore runs every test binary.
+            for ([_]*std.Build.Step{
+                &test_exe.step,
+                &test_run.step,
+            }) |step| step.name = b.fmt("{s} {s}", .{ step.name, cpu }); // Append the CPU string so steps for different queries have distinct names.
+        }
+    }
+}
test/behavior/x86_64/math.zig
@@ -1,3 +1,709 @@
+fn testUnary(comptime op: anytype) !void {
+    const testType = struct {
+        fn testType(comptime Type: type, comptime imm_arg: Type) !void {
+            const expected = op(Type, imm_arg);
+            try struct {
+                fn testOne(actual: @TypeOf(expected)) !void {
+                    if (switch (@typeInfo(@TypeOf(expected))) {
+                        else => actual != expected,
+                        .vector => @reduce(.Or, actual != expected),
+                    }) return error.Unexpected;
+                }
+                noinline fn testOps(mem_arg: Type) !void {
+                    var reg_arg = mem_arg;
+                    _ = .{&reg_arg};
+                    try testOne(op(Type, reg_arg));
+                    try testOne(op(Type, mem_arg));
+                    try testOne(op(Type, imm_arg));
+                }
+            }.testOps(imm_arg);
+        }
+    }.testType;
+
+    try testType(i0, 0);
+    try testType(u0, 0);
+
+    try testType(i1, -1);
+    try testType(i1, 0);
+    try testType(u1, 0);
+    try testType(u1, 1 << 0);
+
+    try testType(i2, -1 << 1);
+    try testType(i2, -1);
+    try testType(i2, 0);
+    try testType(u2, 0);
+    try testType(u2, 1 << 0);
+    try testType(u2, 1 << 1);
+
+    try testType(i3, -1 << 2);
+    try testType(i3, -1);
+    try testType(i3, 0);
+    try testType(u3, 0);
+    try testType(u3, 1 << 0);
+    try testType(u3, 1 << 1);
+    try testType(u3, 1 << 2);
+
+    try testType(i4, -1 << 3);
+    try testType(i4, -1);
+    try testType(i4, 0);
+    try testType(u4, 0);
+    try testType(u4, 1 << 0);
+    try testType(u4, 1 << 1);
+    try testType(u4, 1 << 2);
+    try testType(u4, 1 << 3);
+
+    try testType(i5, -1 << 4);
+    try testType(i5, -1);
+    try testType(i5, 0);
+    try testType(u5, 0);
+    try testType(u5, 1 << 0);
+    try testType(u5, 1 << 1);
+    try testType(u5, 1 << 3);
+    try testType(u5, 1 << 4);
+
+    try testType(i7, -1 << 6);
+    try testType(i7, -1);
+    try testType(i7, 0);
+    try testType(u7, 0);
+    try testType(u7, 1 << 0);
+    try testType(u7, 1 << 1);
+    try testType(u7, 1 << 5);
+    try testType(u7, 1 << 6);
+
+    try testType(i8, -1 << 7);
+    try testType(i8, -1);
+    try testType(i8, 0);
+    try testType(u8, 0);
+    try testType(u8, 1 << 0);
+    try testType(u8, 1 << 1);
+    try testType(u8, 1 << 6);
+    try testType(u8, 1 << 7);
+
+    try testType(i9, -1 << 8);
+    try testType(i9, -1);
+    try testType(i9, 0);
+    try testType(u9, 0);
+    try testType(u9, 1 << 0);
+    try testType(u9, 1 << 1);
+    try testType(u9, 1 << 7);
+    try testType(u9, 1 << 8);
+
+    try testType(i15, -1 << 14);
+    try testType(i15, -1);
+    try testType(i15, 0);
+    try testType(u15, 0);
+    try testType(u15, 1 << 0);
+    try testType(u15, 1 << 1);
+    try testType(u15, 1 << 13);
+    try testType(u15, 1 << 14);
+
+    try testType(i16, -1 << 15);
+    try testType(i16, -1);
+    try testType(i16, 0);
+    try testType(u16, 0);
+    try testType(u16, 1 << 0);
+    try testType(u16, 1 << 1);
+    try testType(u16, 1 << 14);
+    try testType(u16, 1 << 15);
+
+    try testType(i17, -1 << 16);
+    try testType(i17, -1);
+    try testType(i17, 0);
+    try testType(u17, 0);
+    try testType(u17, 1 << 0);
+    try testType(u17, 1 << 1);
+    try testType(u17, 1 << 15);
+    try testType(u17, 1 << 16);
+
+    try testType(i31, -1 << 30);
+    try testType(i31, -1);
+    try testType(i31, 0);
+    try testType(u31, 0);
+    try testType(u31, 1 << 0);
+    try testType(u31, 1 << 1);
+    try testType(u31, 1 << 29);
+    try testType(u31, 1 << 30);
+
+    try testType(i32, -1 << 31);
+    try testType(i32, -1);
+    try testType(i32, 0);
+    try testType(u32, 0);
+    try testType(u32, 1 << 0);
+    try testType(u32, 1 << 1);
+    try testType(u32, 1 << 30);
+    try testType(u32, 1 << 31);
+
+    try testType(i33, -1 << 32);
+    try testType(i33, -1);
+    try testType(i33, 0);
+    try testType(u33, 0);
+    try testType(u33, 1 << 0);
+    try testType(u33, 1 << 1);
+    try testType(u33, 1 << 31);
+    try testType(u33, 1 << 32);
+
+    try testType(i63, -1 << 62);
+    try testType(i63, -1);
+    try testType(i63, 0);
+    try testType(u63, 0);
+    try testType(u63, 1 << 0);
+    try testType(u63, 1 << 1);
+    try testType(u63, 1 << 61);
+    try testType(u63, 1 << 62);
+
+    try testType(i64, -1 << 63);
+    try testType(i64, -1);
+    try testType(i64, 0);
+    try testType(u64, 0);
+    try testType(u64, 1 << 0);
+    try testType(u64, 1 << 1);
+    try testType(u64, 1 << 62);
+    try testType(u64, 1 << 63);
+
+    try testType(i65, -1 << 64);
+    try testType(i65, -1);
+    try testType(i65, 0);
+    try testType(u65, 0);
+    try testType(u65, 1 << 0);
+    try testType(u65, 1 << 1);
+    try testType(u65, 1 << 63);
+    try testType(u65, 1 << 64);
+
+    try testType(i95, -1 << 94);
+    try testType(i95, -1);
+    try testType(i95, 0);
+    try testType(u95, 0);
+    try testType(u95, 1 << 0);
+    try testType(u95, 1 << 1);
+    try testType(u95, 1 << 93);
+    try testType(u95, 1 << 94);
+
+    try testType(i96, -1 << 95);
+    try testType(i96, -1);
+    try testType(i96, 0);
+    try testType(u96, 0);
+    try testType(u96, 1 << 0);
+    try testType(u96, 1 << 1);
+    try testType(u96, 1 << 94);
+    try testType(u96, 1 << 95);
+
+    try testType(i97, -1 << 96);
+    try testType(i97, -1);
+    try testType(i97, 0);
+    try testType(u97, 0);
+    try testType(u97, 1 << 0);
+    try testType(u97, 1 << 1);
+    try testType(u97, 1 << 95);
+    try testType(u97, 1 << 96);
+
+    try testType(i127, -1 << 126);
+    try testType(i127, -1);
+    try testType(i127, 0);
+    try testType(u127, 0);
+    try testType(u127, 1 << 0);
+    try testType(u127, 1 << 1);
+    try testType(u127, 1 << 125);
+    try testType(u127, 1 << 126);
+
+    try testType(i128, -1 << 127);
+    try testType(i128, -1);
+    try testType(i128, 0);
+    try testType(u128, 0);
+    try testType(u128, 1 << 0);
+    try testType(u128, 1 << 1);
+    try testType(u128, 1 << 126);
+    try testType(u128, 1 << 127);
+
+    try testType(i129, -1 << 128);
+    try testType(i129, -1);
+    try testType(i129, 0);
+    try testType(u129, 0);
+    try testType(u129, 1 << 0);
+    try testType(u129, 1 << 1);
+    try testType(u129, 1 << 127);
+    try testType(u129, 1 << 128);
+
+    try testType(i159, -1 << 158);
+    try testType(i159, -1);
+    try testType(i159, 0);
+    try testType(u159, 0);
+    try testType(u159, 1 << 0);
+    try testType(u159, 1 << 1);
+    try testType(u159, 1 << 157);
+    try testType(u159, 1 << 158);
+
+    try testType(i160, -1 << 159);
+    try testType(i160, -1);
+    try testType(i160, 0);
+    try testType(u160, 0);
+    try testType(u160, 1 << 0);
+    try testType(u160, 1 << 1);
+    try testType(u160, 1 << 158);
+    try testType(u160, 1 << 159);
+
+    try testType(i161, -1 << 160);
+    try testType(i161, -1);
+    try testType(i161, 0);
+    try testType(u161, 0);
+    try testType(u161, 1 << 0);
+    try testType(u161, 1 << 1);
+    try testType(u161, 1 << 159);
+    try testType(u161, 1 << 160);
+
+    try testType(i191, -1 << 190);
+    try testType(i191, -1);
+    try testType(i191, 0);
+    try testType(u191, 0);
+    try testType(u191, 1 << 0);
+    try testType(u191, 1 << 1);
+    try testType(u191, 1 << 189);
+    try testType(u191, 1 << 190);
+
+    try testType(i192, -1 << 191);
+    try testType(i192, -1);
+    try testType(i192, 0);
+    try testType(u192, 0);
+    try testType(u192, 1 << 0);
+    try testType(u192, 1 << 1);
+    try testType(u192, 1 << 190);
+    try testType(u192, 1 << 191);
+
+    try testType(i193, -1 << 192);
+    try testType(i193, -1);
+    try testType(i193, 0);
+    try testType(u193, 0);
+    try testType(u193, 1 << 0);
+    try testType(u193, 1 << 1);
+    try testType(u193, 1 << 191);
+    try testType(u193, 1 << 192);
+
+    try testType(i223, -1 << 222);
+    try testType(i223, -1);
+    try testType(i223, 0);
+    try testType(u223, 0);
+    try testType(u223, 1 << 0);
+    try testType(u223, 1 << 1);
+    try testType(u223, 1 << 221);
+    try testType(u223, 1 << 222);
+
+    try testType(i224, -1 << 223);
+    try testType(i224, -1);
+    try testType(i224, 0);
+    try testType(u224, 0);
+    try testType(u224, 1 << 0);
+    try testType(u224, 1 << 1);
+    try testType(u224, 1 << 222);
+    try testType(u224, 1 << 223);
+
+    try testType(i225, -1 << 224);
+    try testType(i225, -1);
+    try testType(i225, 0);
+    try testType(u225, 0);
+    try testType(u225, 1 << 0);
+    try testType(u225, 1 << 1);
+    try testType(u225, 1 << 223);
+    try testType(u225, 1 << 224);
+
+    try testType(i255, -1 << 254);
+    try testType(i255, -1);
+    try testType(i255, 0);
+    try testType(u255, 0);
+    try testType(u255, 1 << 0);
+    try testType(u255, 1 << 1);
+    try testType(u255, 1 << 253);
+    try testType(u255, 1 << 254);
+
+    try testType(i256, -1 << 255);
+    try testType(i256, -1);
+    try testType(i256, 0);
+    try testType(u256, 0);
+    try testType(u256, 1 << 0);
+    try testType(u256, 1 << 1);
+    try testType(u256, 1 << 254);
+    try testType(u256, 1 << 255);
+
+    try testType(i257, -1 << 256);
+    try testType(i257, -1);
+    try testType(i257, 0);
+    try testType(u257, 0);
+    try testType(u257, 1 << 0);
+    try testType(u257, 1 << 1);
+    try testType(u257, 1 << 255);
+    try testType(u257, 1 << 256);
+
+    try testType(i511, -1 << 510);
+    try testType(i511, -1);
+    try testType(i511, 0);
+    try testType(u511, 0);
+    try testType(u511, 1 << 0);
+    try testType(u511, 1 << 1);
+    try testType(u511, 1 << 509);
+    try testType(u511, 1 << 510);
+
+    try testType(i512, -1 << 511);
+    try testType(i512, -1);
+    try testType(i512, 0);
+    try testType(u512, 0);
+    try testType(u512, 1 << 0);
+    try testType(u512, 1 << 1);
+    try testType(u512, 1 << 510);
+    try testType(u512, 1 << 511);
+
+    try testType(i513, -1 << 512);
+    try testType(i513, -1);
+    try testType(i513, 0);
+    try testType(u513, 0);
+    try testType(u513, 1 << 0);
+    try testType(u513, 1 << 1);
+    try testType(u513, 1 << 511);
+    try testType(u513, 1 << 512);
+
+    try testType(i1023, -1 << 1022);
+    try testType(i1023, -1);
+    try testType(i1023, 0);
+    try testType(u1023, 0);
+    try testType(u1023, 1 << 0);
+    try testType(u1023, 1 << 1);
+    try testType(u1023, 1 << 1021);
+    try testType(u1023, 1 << 1022);
+
+    try testType(i1024, -1 << 1023);
+    try testType(i1024, -1);
+    try testType(i1024, 0);
+    try testType(u1024, 0);
+    try testType(u1024, 1 << 0);
+    try testType(u1024, 1 << 1);
+    try testType(u1024, 1 << 1022);
+    try testType(u1024, 1 << 1023);
+
+    try testType(i1025, -1 << 1024);
+    try testType(i1025, -1);
+    try testType(i1025, 0);
+    try testType(u1025, 0);
+    try testType(u1025, 1 << 0);
+    try testType(u1025, 1 << 1);
+    try testType(u1025, 1 << 1023);
+    try testType(u1025, 1 << 1024);
+
+    try testType(@Vector(3, i0), .{ 0 << 0, 0, 0 });
+    try testType(@Vector(3, u0), .{ 0, 0, 0 << 0 });
+
+    try testType(@Vector(3, i1), .{ -1 << 0, -1, 0 });
+    try testType(@Vector(3, u1), .{ 0, 1, 1 << 0 });
+
+    try testType(@Vector(3, i2), .{ -1 << 1, -1, 0 });
+    try testType(@Vector(3, u2), .{ 0, 1, 1 << 1 });
+
+    try testType(@Vector(3, i3), .{ -1 << 2, -1, 0 });
+    try testType(@Vector(3, u3), .{ 0, 1, 1 << 2 });
+
+    try testType(@Vector(3, i4), .{ -1 << 3, -1, 0 });
+    try testType(@Vector(3, u4), .{ 0, 1, 1 << 3 });
+    try testType(@Vector(1, u4), .{
+        0xb,
+    });
+    try testType(@Vector(2, u4), .{
+        0x3, 0x4,
+    });
+    try testType(@Vector(4, u4), .{
+        0x9, 0x2, 0xf, 0xe,
+    });
+    try testType(@Vector(8, u4), .{
+        0x8, 0x1, 0xb, 0x1, 0xf, 0x5, 0x9, 0x6,
+    });
+    try testType(@Vector(16, u4), .{
+        0xb, 0x6, 0x0, 0x7, 0x8, 0x5, 0x6, 0x9, 0xe, 0xb, 0x3, 0xa, 0xb, 0x5, 0x8, 0xc,
+    });
+    try testType(@Vector(32, u4), .{
+        0xe, 0x6, 0xe, 0xa, 0xb, 0x4, 0xa, 0xb, 0x1, 0x3, 0xb, 0xc, 0x0, 0xb, 0x9, 0x4, 0xd, 0xa, 0xd, 0xd, 0x4, 0x8, 0x8, 0x6, 0xb, 0xe, 0x9, 0x6, 0xc, 0xd, 0x5, 0xd,
+    });
+    try testType(@Vector(64, u4), .{
+        0x1, 0xc, 0xe, 0x9, 0x9, 0xf, 0x3, 0xf, 0x9, 0x9, 0x5, 0x3, 0xb, 0xd, 0xd, 0xf, 0x1, 0x2, 0xf, 0x9, 0x4, 0x4, 0x8, 0x9, 0x2, 0x9, 0x8, 0xe, 0x8, 0xa, 0x4, 0x3,
+        0x4, 0xc, 0xb, 0x6, 0x4, 0x0, 0xa, 0x5, 0x1, 0xa, 0x4, 0xe, 0xa, 0x7, 0xd, 0x0, 0x4, 0xe, 0xe, 0x7, 0x7, 0xa, 0x4, 0x5, 0x6, 0xc, 0x6, 0x2, 0x6, 0xa, 0xe, 0xa,
+    });
+    try testType(@Vector(128, u4), .{
+        0xd, 0x5, 0x6, 0xe, 0x3, 0x3, 0x3, 0xe, 0xd, 0xd, 0x9, 0x0, 0x0, 0xe, 0xa, 0x9, 0x8, 0x7, 0xb, 0x5, 0x7, 0xf, 0xb, 0x8, 0x0, 0xf, 0xb, 0x3, 0xa, 0x2, 0xb, 0xc,
+        0x1, 0x1, 0xc, 0x8, 0x8, 0x6, 0x9, 0x1, 0xb, 0x0, 0x2, 0xb, 0x2, 0x2, 0x7, 0x6, 0x1, 0x1, 0xb, 0x4, 0x6, 0x4, 0x7, 0xc, 0xd, 0xc, 0xa, 0x8, 0x1, 0x7, 0x8, 0xa,
+        0x9, 0xa, 0x1, 0x8, 0x1, 0x7, 0x9, 0x4, 0x5, 0x9, 0xd, 0x0, 0xa, 0xf, 0x3, 0x3, 0x9, 0x2, 0xf, 0x5, 0xb, 0x8, 0x6, 0xb, 0xf, 0x5, 0x8, 0x3, 0x9, 0xf, 0x6, 0x8,
+        0xc, 0x8, 0x3, 0x4, 0xa, 0xe, 0xc, 0x1, 0xe, 0x9, 0x1, 0x8, 0xf, 0x6, 0xc, 0xc, 0x6, 0xf, 0x6, 0xd, 0xb, 0x9, 0xc, 0x3, 0xd, 0xa, 0x6, 0x8, 0x4, 0xa, 0x6, 0x9,
+    });
+    try testType(@Vector(256, u4), .{
+        0x6, 0xc, 0xe, 0x3, 0x8, 0x2, 0xb, 0xd, 0x3, 0xa, 0x3, 0x8, 0xb, 0x8, 0x3, 0x0, 0xb, 0x5, 0x1, 0x3, 0x2, 0x2, 0xf, 0xc, 0x5, 0x1, 0x3, 0xb, 0x1, 0xc, 0x2, 0xd,
+        0xa, 0x8, 0x1, 0xc, 0xb, 0xa, 0x3, 0x1, 0xe, 0x4, 0xf, 0xb, 0xd, 0x8, 0xf, 0xa, 0xc, 0xb, 0xb, 0x0, 0xa, 0xc, 0xf, 0xe, 0x8, 0xd, 0x9, 0x3, 0xa, 0xe, 0x8, 0x7,
+        0x5, 0xa, 0x0, 0xe, 0x0, 0xd, 0x2, 0x2, 0x9, 0x4, 0x8, 0x9, 0x0, 0x4, 0x4, 0x8, 0xe, 0x1, 0xf, 0x1, 0x9, 0x3, 0xf, 0xc, 0xa, 0x0, 0x3, 0x2, 0x4, 0x1, 0x2, 0x3,
+        0xf, 0x2, 0x7, 0xb, 0x5, 0x0, 0xd, 0x3, 0x4, 0xf, 0xa, 0x3, 0xc, 0x2, 0x5, 0xe, 0x7, 0x5, 0xd, 0x7, 0x9, 0x0, 0xd, 0x7, 0x9, 0xd, 0x5, 0x7, 0xf, 0xd, 0xb, 0x4,
+        0x9, 0x6, 0xf, 0xb, 0x1, 0xb, 0x6, 0xb, 0xf, 0x7, 0xf, 0x0, 0x4, 0x7, 0x5, 0xa, 0x8, 0x1, 0xf, 0x9, 0x9, 0x0, 0x6, 0xb, 0x1, 0x2, 0x4, 0x3, 0x2, 0x0, 0x7, 0x0,
+        0x6, 0x7, 0xf, 0x1, 0xe, 0xa, 0x8, 0x2, 0x9, 0xc, 0x1, 0x5, 0x7, 0x1, 0xb, 0x0, 0x1, 0x3, 0xd, 0x3, 0x0, 0x1, 0xa, 0x0, 0x3, 0x7, 0x1, 0x2, 0xb, 0xc, 0x2, 0x9,
+        0x8, 0x8, 0x7, 0x0, 0xd, 0x5, 0x1, 0x5, 0x7, 0x7, 0x2, 0x3, 0x8, 0x7, 0xc, 0x8, 0xf, 0xa, 0xf, 0xf, 0x3, 0x2, 0x0, 0x4, 0x7, 0x5, 0x6, 0xd, 0x6, 0x3, 0xa, 0x4,
+        0x1, 0x1, 0x2, 0xc, 0x3, 0xe, 0x2, 0xc, 0x7, 0x6, 0xe, 0xf, 0xb, 0x8, 0x6, 0x6, 0x9, 0x0, 0x4, 0xb, 0xe, 0x4, 0x2, 0x7, 0xf, 0xc, 0x0, 0x6, 0xd, 0xa, 0xe, 0xc,
+    });
+
+    try testType(@Vector(3, i5), .{ -1 << 4, -1, 0 });
+    try testType(@Vector(3, u5), .{ 0, 1, 1 << 4 });
+
+    try testType(@Vector(3, i7), .{ -1 << 6, -1, 0 });
+    try testType(@Vector(3, u7), .{ 0, 1, 1 << 6 });
+
+    try testType(@Vector(3, i8), .{ -1 << 7, -1, 0 });
+    try testType(@Vector(3, u8), .{ 0, 1, 1 << 7 });
+    try testType(@Vector(1, u8), .{
+        0x33,
+    });
+    try testType(@Vector(2, u8), .{
+        0x66, 0x87,
+    });
+    try testType(@Vector(4, u8), .{
+        0x9d, 0xcb, 0x30, 0x7b,
+    });
+    try testType(@Vector(8, u8), .{
+        0x4b, 0x35, 0x3f, 0x5c, 0xa5, 0x91, 0x23, 0x6d,
+    });
+    try testType(@Vector(16, u8), .{
+        0xb7, 0x57, 0x27, 0x29, 0x58, 0xf8, 0xc9, 0x6c, 0xbe, 0x41, 0xf4, 0xd7, 0x4d, 0x01, 0xf0, 0x37,
+    });
+    try testType(@Vector(32, u8), .{
+        0x5f, 0x61, 0x34, 0xe8, 0x37, 0x12, 0xba, 0x5a, 0x85, 0xf3, 0x3e, 0xa2, 0x0f, 0xd0, 0x65, 0xae,
+        0xed, 0xf5, 0xe8, 0x65, 0x61, 0x28, 0x4a, 0x27, 0x2e, 0x01, 0x40, 0x8c, 0xe3, 0x36, 0x5d, 0xb6,
+    });
+    try testType(@Vector(64, u8), .{
+        0xb0, 0x19, 0x5c, 0xc2, 0x3b, 0x16, 0x70, 0xad, 0x26, 0x45, 0xf2, 0xe1, 0x4f, 0x0f, 0x01, 0x72,
+        0x7f, 0x1f, 0x07, 0x9e, 0xee, 0x9b, 0xb3, 0x38, 0x50, 0xf3, 0x56, 0x73, 0xd0, 0xd1, 0xee, 0xe3,
+        0xeb, 0xf3, 0x1b, 0xe0, 0x77, 0x78, 0x75, 0xc6, 0x19, 0xe4, 0x69, 0xaa, 0x73, 0x08, 0xcd, 0x0c,
+        0xf9, 0xed, 0x94, 0xf8, 0x79, 0x86, 0x63, 0x31, 0xbf, 0xd1, 0xe3, 0x17, 0x2b, 0xb9, 0xa1, 0x72,
+    });
+    try testType(@Vector(128, u8), .{
+        0x2e, 0x93, 0x87, 0x09, 0x4f, 0x68, 0x14, 0xab, 0x3f, 0x04, 0x86, 0xc1, 0x95, 0xe8, 0x74, 0x11,
+        0x57, 0x25, 0xe1, 0x88, 0xc0, 0x96, 0x33, 0x99, 0x15, 0x86, 0x2c, 0x84, 0x2e, 0xd7, 0x57, 0x21,
+        0xd3, 0x18, 0xd5, 0x0e, 0xb4, 0x60, 0xe2, 0x08, 0xce, 0xbc, 0xd5, 0x4d, 0x8f, 0x59, 0x01, 0x67,
+        0x71, 0x0a, 0x74, 0x48, 0xef, 0x39, 0x49, 0x7e, 0xa8, 0x39, 0x34, 0x75, 0x95, 0x3b, 0x38, 0xea,
+        0x60, 0xd7, 0xed, 0x8f, 0xbb, 0xc0, 0x7d, 0xc2, 0x79, 0x2d, 0xbf, 0xa5, 0x64, 0xf4, 0x09, 0x86,
+        0xfb, 0x29, 0xfe, 0xc7, 0xff, 0x62, 0x1a, 0x6f, 0xf8, 0xbd, 0xfe, 0xa4, 0xac, 0x24, 0xcf, 0x56,
+        0x82, 0x69, 0x81, 0x0d, 0xc1, 0x51, 0x8d, 0x85, 0xf4, 0x00, 0xe7, 0x25, 0xab, 0xa5, 0x33, 0x45,
+        0x66, 0x2e, 0x33, 0xc8, 0xf3, 0x35, 0x16, 0x7d, 0x1f, 0xc9, 0xf7, 0x44, 0xab, 0x66, 0x28, 0x0d,
+    });
+
+    try testType(@Vector(3, i9), .{ -1 << 8, -1, 0 });
+    try testType(@Vector(3, u9), .{ 0, 1, 1 << 8 });
+
+    try testType(@Vector(3, i15), .{ -1 << 14, -1, 0 });
+    try testType(@Vector(3, u15), .{ 0, 1, 1 << 14 });
+
+    try testType(@Vector(3, i16), .{ -1 << 15, -1, 0 });
+    try testType(@Vector(3, u16), .{ 0, 1, 1 << 15 });
+    try testType(@Vector(1, u16), .{
+        0x4da6,
+    });
+    try testType(@Vector(2, u16), .{
+        0x04d7, 0x50c6,
+    });
+    try testType(@Vector(4, u16), .{
+        0x4c06, 0xd71f, 0x4d8f, 0xe0a4,
+    });
+    try testType(@Vector(8, u16), .{
+        0xee9a, 0x881d, 0x31fb, 0xd3f7, 0x2c74, 0x6949, 0x4e04, 0x53d7,
+    });
+    try testType(@Vector(16, u16), .{
+        0xeafe, 0x9a7b, 0x0d6f, 0x18cb, 0xaf8f, 0x8ee4, 0xa47e, 0xd39a,
+        0x6572, 0x9c53, 0xf36e, 0x982e, 0x41c1, 0x8682, 0xf5dc, 0x7e01,
+    });
+    try testType(@Vector(32, u16), .{
+        0xdfb3, 0x7de6, 0xd9ed, 0xb42e, 0x95ac, 0x9b5b, 0x0422, 0xdfcd,
+        0x6196, 0x4dbe, 0x1818, 0x8816, 0x75e7, 0xc9b0, 0x92f7, 0x1f71,
+        0xe584, 0x576c, 0x043a, 0x0f31, 0xfc4c, 0x2c87, 0x6b02, 0x0229,
+        0x25b7, 0x53cd, 0x9bab, 0x866b, 0x9008, 0xf0f3, 0xeb21, 0x88e2,
+    });
+    try testType(@Vector(64, u16), .{
+        0x084c, 0x445f, 0xce89, 0xd3ee, 0xb399, 0x315d, 0x8ef8, 0x4f6f,
+        0xf9af, 0xcbc4, 0x0332, 0xcd55, 0xa4dc, 0xbc38, 0x6e33, 0x8ead,
+        0xd15a, 0x5057, 0x58ef, 0x657a, 0xe9f0, 0x1418, 0x2b62, 0x3387,
+        0x1c15, 0x04e1, 0x0276, 0x3783, 0xad9c, 0xea9a, 0x0e5e, 0xe803,
+        0x2ee7, 0x0cf1, 0x30f1, 0xb12a, 0x381b, 0x353d, 0xf637, 0xf853,
+        0x2ac1, 0x7ce8, 0x6a50, 0xcbb8, 0xc9b8, 0x9b25, 0xd1e9, 0xeff0,
+        0xc0a2, 0x8e51, 0xde7a, 0x4e58, 0x5685, 0xeb3f, 0xd29b, 0x66ed,
+        0x3dd5, 0xcb59, 0x6003, 0xf710, 0x943a, 0x7276, 0xe547, 0xe48f,
+    });
+
+    try testType(@Vector(3, i17), .{ -1 << 16, -1, 0 });
+    try testType(@Vector(3, u17), .{ 0, 1, 1 << 16 });
+
+    try testType(@Vector(3, i31), .{ -1 << 30, -1, 0 });
+    try testType(@Vector(3, u31), .{ 0, 1, 1 << 30 });
+
+    try testType(@Vector(3, i32), .{ -1 << 31, -1, 0 });
+    try testType(@Vector(3, u32), .{ 0, 1, 1 << 31 });
+    try testType(@Vector(1, u32), .{
+        0x17e2805c,
+    });
+    try testType(@Vector(2, u32), .{
+        0xdb6aadc5, 0xb1ff3754,
+    });
+    try testType(@Vector(4, u32), .{
+        0xf7897b31, 0x342e1af9, 0x190fd76b, 0x283b5374,
+    });
+    try testType(@Vector(8, u32), .{
+        0x81a0bd16, 0xc55da94e, 0x910f7e7c, 0x078d5ef7,
+        0x0bdb1e4a, 0xf1a96e99, 0xcdd729b5, 0xe6966a1c,
+    });
+    try testType(@Vector(16, u32), .{
+        0xfee812db, 0x29eacbed, 0xaed48136, 0x3053de13,
+        0xbbda20df, 0x6faa274a, 0xe0b5ec3a, 0x1878b0dc,
+        0x98204475, 0x810d8d05, 0x1e6996b6, 0xc543826a,
+        0x53b47d8c, 0xc72c3142, 0x12f7e1f9, 0xf6782e54,
+    });
+    try testType(@Vector(32, u32), .{
+        0xf0cf30d3, 0xe3c587b8, 0xcee44739, 0xe4a0bd72,
+        0x41d44cce, 0x6d7c4259, 0xd85580a5, 0xec4b02d7,
+        0xa366483d, 0x2d7b59d4, 0xe9c0ace4, 0x82cb441c,
+        0xa23958ba, 0x04a70148, 0x3f0d20a3, 0xf9e21e37,
+        0x009fce8b, 0x4a34a229, 0xf09c35cf, 0xc0977d4d,
+        0xcc4d4647, 0xa30f1363, 0x27a65b14, 0xe572c785,
+        0x8f42e320, 0x2b2cdeca, 0x11205bd4, 0x739d26aa,
+        0xcbcc2df0, 0x5f7a3649, 0xbde1b7aa, 0x180a169f,
+    });
+
+    try testType(@Vector(3, i33), .{ -1 << 32, -1, 0 });
+    try testType(@Vector(3, u33), .{ 0, 1, 1 << 32 });
+
+    try testType(@Vector(3, i63), .{ -1 << 62, -1, 0 });
+    try testType(@Vector(3, u63), .{ 0, 1, 1 << 62 });
+
+    try testType(@Vector(3, i64), .{ -1 << 63, -1, 0 });
+    try testType(@Vector(3, u64), .{ 0, 1, 1 << 63 });
+    try testType(@Vector(1, u64), .{
+        0x7d2e439abb0edba7,
+    });
+    try testType(@Vector(2, u64), .{
+        0x3749ee5a2d237b9f, 0x6d8f4c3e1378f389,
+    });
+    try testType(@Vector(4, u64), .{
+        0x03c127040e10d52b, 0xa86fe019072e27eb,
+        0x0a554a47b709cdba, 0xf4342cc597e196c3,
+    });
+    try testType(@Vector(8, u64), .{
+        0xea455c104375a055, 0x5c35d9d945edb2fa,
+        0xc11b73d9d9d546fc, 0x2a9d63aae838dd5b,
+        0xed6603f1f5d574b3, 0x2f37b354c81c1e56,
+        0xbe7f5e2476bc76bd, 0xb0c88eacfffa9a8f,
+    });
+    try testType(@Vector(16, u64), .{
+        0x2258fc04b31f8dbe, 0x3a2e5483003a10d8,
+        0xebf24b31c0460510, 0x15d5b4c09b53ffa5,
+        0x05abf6e744b17cc6, 0x9747b483f2d159fe,
+        0x4616d8b2c8673125, 0x8ae3f91d422447eb,
+        0x18da2f101a9e9776, 0x77a1197fb0441007,
+        0x4ba480c8ec2dd10b, 0xeb99b9c0a1725278,
+        0xd9d0acc5084ecdf0, 0xa0a23317fff4f515,
+        0x0901c59a9a6a408b, 0x7c77ca72e25df033,
+    });
+
+    try testType(@Vector(3, i65), .{ -1 << 64, -1, 0 });
+    try testType(@Vector(3, u65), .{ 0, 1, 1 << 64 });
+
+    try testType(@Vector(3, i127), .{ -1 << 126, -1, 0 });
+    try testType(@Vector(3, u127), .{ 0, 1, 1 << 126 });
+
+    try testType(@Vector(3, i128), .{ -1 << 127, -1, 0 });
+    try testType(@Vector(3, u128), .{ 0, 1, 1 << 127 });
+    try testType(@Vector(1, u128), .{
+        0x809f29e7fbafadc01145e1732590e7d9,
+    });
+    try testType(@Vector(2, u128), .{
+        0x5150ac3438aacd0d51132cc2723b2995,
+        0x151be9c47ad29cf719cf8358dd40165c,
+    });
+    try testType(@Vector(4, u128), .{
+        0x4bae22df929f2f7cb9bd84deaad3e7a8,
+        0x1ed46b2d6e1f3569f56b2ac33d8bc1cb,
+        0xae93ea459d2ccfd5fb794e6d5c31aabb,
+        0xb1177136acf099f550b70949ac202ec4,
+    });
+    try testType(@Vector(8, u128), .{
+        0x7cd78db6baed6bfdf8c5265136c4e0fd,
+        0xa41b8984c6bbde84640068194b7eba98,
+        0xd33102778f2ae1a48d1e9bf8801bbbf0,
+        0x0d59f6de003513a60055c86cbce2c200,
+        0x825579d90012afddfbf04851c0748561,
+        0xc2647c885e9d6f0ee1f5fac5da8ef7f5,
+        0xcb4bbc1f81aa8ee68aa4dc140745687b,
+        0x4ff10f914f74b46c694407f5bf7c7836,
+    });
+
+    try testType(@Vector(3, i129), .{ -1 << 128, -1, 0 });
+    try testType(@Vector(3, u129), .{ 0, 1, 1 << 128 });
+
+    try testType(@Vector(3, i191), .{ -1 << 190, -1, 0 });
+    try testType(@Vector(3, u191), .{ 0, 1, 1 << 190 });
+
+    try testType(@Vector(3, i192), .{ -1 << 191, -1, 0 });
+    try testType(@Vector(3, u192), .{ 0, 1, 1 << 191 });
+    try testType(@Vector(1, u192), .{
+        0xe7baafcb9781626a77571b0539b9471a60c97d6c02106c8b,
+    });
+    try testType(@Vector(2, u192), .{
+        0xbc9510913ed09e2c2aa50ffab9f1bc7b303a87f36e232a83,
+        0x1f37bee446d7712d1ad457c47a66812cb926198d052aee65,
+    });
+    try testType(@Vector(4, u192), .{
+        0xdca6a7cfc19c69efc34022062a8ca36f2569ab3dce001202,
+        0xd25a4529e621c9084181fdb6917c6a32eccc58b63601b35d,
+        0x0a258afd6debbaf8c158f1caa61fed63b31871d13f51b43d,
+        0x6b40a178674fcb82c623ac322f851623d5e993dac97a219a,
+    });
+
+    try testType(@Vector(3, i193), .{ -1 << 192, -1, 0 });
+    try testType(@Vector(3, u193), .{ 0, 1, 1 << 192 });
+
+    try testType(@Vector(3, i255), .{ -1 << 254, -1, 0 });
+    try testType(@Vector(3, u255), .{ 0, 1, 1 << 254 });
+
+    try testType(@Vector(3, i256), .{ -1 << 255, -1, 0 });
+    try testType(@Vector(3, u256), .{ 0, 1, 1 << 255 });
+    try testType(@Vector(1, u256), .{
+        0x230413bb481fa3a997796acf282010c560d1942e7339fd584a0f15a90c83fbda,
+    });
+    try testType(@Vector(2, u256), .{
+        0x3ad569f8d91fdbc9da8ec0e933565919f2feb90b996c90c352b461aa0908e62d,
+        0x0f109696d64647983f1f757042515510729ad1350e862cbf38cb73b5cf99f0f7,
+    });
+    try testType(@Vector(4, u256), .{
+        0x1717c6ded4ac6de282d59f75f068da47d5a47a30f2c5053d2d59e715f9d28b97,
+        0x3087189ce7540e2e0028b80af571ebc6353a00b2917f243a869ed29ecca0adaa,
+        0x1507c6a9d104684bf503cdb08841cf91adab4644306bd67aafff5326604833ce,
+        0x857e134ff9179733c871295b25f824bd3eb562977bad30890964fa0cdc15bb07,
+    });
+
+    try testType(@Vector(3, i257), .{ -1 << 256, -1, 0 });
+    try testType(@Vector(3, u257), .{ 0, 1, 1 << 256 });
+
+    try testType(@Vector(3, i511), .{ -1 << 510, -1, 0 });
+    try testType(@Vector(3, u511), .{ 0, 1, 1 << 510 });
+
+    try testType(@Vector(3, i512), .{ -1 << 511, -1, 0 });
+    try testType(@Vector(3, u512), .{ 0, 1, 1 << 511 });
+    try testType(@Vector(1, u512), .{
+        0xa3ff51a609f1370e5eeb96b05169bf7469e465cf76ac5b4ea8ffd166c1ba3cd94f2dedf0d647a1fe424f3a06e6d7940f03e257f28100970b00bd5528c52b9ae6,
+    });
+    try testType(@Vector(2, u512), .{
+        0xc6d43cd46ae31ab71f9468a895c83bf17516c6b2f1c9b04b9aa113bf7fe1b789eb7d95fcf951f12a9a6f2124589551efdd8c00f528b366a7bfb852faf8f3da53,
+        0xc9099d2bdf8d1a0d30485ec6db4a24cbc0d89a863de30e18313ee1d66f71dd2d26235caaa703286cf4a2b51e1a12ef96d2d944c66c0bd3f0d72dd4cf0fc8100e,
+    });
+
+    try testType(@Vector(3, i513), .{ -1 << 512, -1, 0 });
+    try testType(@Vector(3, u513), .{ 0, 1, 1 << 512 });
+
+    try testType(@Vector(3, i1023), .{ -1 << 1022, -1, 0 });
+    try testType(@Vector(3, u1023), .{ 0, 1, 1 << 1022 });
+
+    try testType(@Vector(3, i1024), .{ -1 << 1023, -1, 0 });
+    try testType(@Vector(3, u1024), .{ 0, 1, 1 << 1023 });
+    try testType(@Vector(1, u1024), .{
+        0xc6cfaa6571139552e1f067402dfc131d9b9a58aafda97198a78764b05138fb68cf26f085b7652f3d5ae0e56aa21732f296a581bb411d4a73795c213de793489fa49b173b9f5c089aa6295ff1fcdc14d491a05035b45d08fc35cd67a83d887a02b8db512f07518132e0ba56533c7d6fbe958255eddf5649bd8aba288c0dd84a25,
+    });
+
+    try testType(@Vector(3, i1025), .{ -1 << 1024, -1, 0 });
+    try testType(@Vector(3, u1025), .{ 0, 1, 1 << 1024 });
+}
+
 fn testBinary(comptime op: anytype) !void {
     const testType = struct {
         fn testType(comptime Type: type, comptime imm_lhs: Type, comptime imm_rhs: Type) !void {
@@ -306,6 +1012,63 @@ fn testBinary(comptime op: anytype) !void {
         0x8b0b4a27fc94a0e90652d19bc755b63d,
         0xa858bce5ad0e48c13588a4e170e8667c,
     });
+
+    try testType(@Vector(1, u256), .{
+        0x28df37e1f57a56133ba3f5b5b2164ce24eb6c29a8973a597fd91fbee8ab4bafb,
+    }, .{
+        0x63f725028cab082b5b1e6cb474428c8c3655cf438f3bb05c7a87f8270198f357,
+    });
+    try testType(@Vector(2, u256), .{
+        0xcc79740b85597ef411e6d7e92049dfaa2328781ea4911540a3dcb512b71c7f3c,
+        0x51ae46d2f93cbecff1578481f6ddc633dacee94ecaf81597c752c5c5db0ae766,
+    }, .{
+        0x257f0107305cb71cef582a9a58612a019f335e390d7998f51f5898f245874a6e,
+        0x0a95a17323a4d16a715720f122b752785e9877e3dd3d3f9b72cdac3d1139a81f,
+    });
+    try testType(@Vector(4, u256), .{
+        0x19667a6e269342cba437a8904c7ba42a762358d32723723ae2637b01124e63c5,
+        0x14f7d3599a7edc7bcc46874f68d4291793e6ef72bd1f3763bc5e923f54f2f781,
+        0x1c939de0ae980b80de773a04088ba45813441336cdfdc281ee356c98d71f653b,
+        0x39f5d755965382fe13d1b1d6690b8e3827f153f8166768c4ad8a28a963b781f2,
+    }, .{
+        0xbe03de37cdcb8126083b4e86cd8a9803121d31b186fd5ce555ad77ce624dd6c7,
+        0xa0c0730f0d7f141cc959849d09730b049f00693361539f1bc4758270554a60c1,
+        0x2664bdba8de4eaa36ecee72f6bfec5b4daa6b4e00272d8116f2cc532c29490cc,
+        0xe47a122bd45d5e7d69722d864a6b795ddee965a0993094f8791dd309d692de8b,
+    });
+
+    try testType(@Vector(1, u512), .{
+        0x651058c1d89a8f34cfc5e66b6d25294eecfcc4a7e1e4a356eb51ee7d7b2db25378e4afee51b7d18d16e520772a60c50a02d7966f40ced1870b32c658e5821397,
+    }, .{
+        0xd726e265ec80cb99510ba4f480ca64e959de5c528a7f54c386ecad22eeeefa845f0fd44b1bd64258a5f868197ee2d8fed59df9c9f0b72e74051a7ff20230880e,
+    });
+    try testType(@Vector(2, u512), .{
+        0x22c8183c95cca8b09fdf541e431b73e9e4a1a5a00dff12381937fab52681d09d38ea25727d7025a2be08942cfa01535759e1644792e347c7901ec94b343c6337,
+        0x292fdf644e75927e1aea9465ae2f60fb27550cd095f1afdea2cf7855286d26fbeed1c0b9c0474b73cb6b75621f7eadaa2f94ec358179ce2aaa0766df20da1ef3,
+    }, .{
+        0xe1cd8c0ca244c6626d4415e10b4ac43fa69e454c529c24fec4b13e6b945684d4ea833709c16c636ca78cffa5c5bf0fe945cd714a9ad695184a6bdad31dec9e31,
+        0x8fa3d86099e9e2789d72f8e792290356d659ab20ac0414ff94745984c6ae7d986082197bb849889f912e896670aa2c1a11bd7e66e3f650710b0f0a18a1533f90,
+    });
+
+    try testType(@Vector(1, u1024), .{
+        0x0ca1a0dfaf8bb1da714b457d23c71aef948e66c7cd45c0aa941498a796fb18502ec32f34e885d0a107d44ae81595f8b52c2f0fb38e584b7139903a0e8a823ae20d01ca0662722dd474e7efc40f32d74cc065d97d8a09d0447f1ab6107fa0a57f3f8c866ae872506627ce82f18add79cee8dc69837f4ead3ca770c4d622d7e544,
+    }, .{
+        0xf1e3bbe031d59351770a7a501b6e969b2c00d144f17648db3f944b69dfeb7be72e5ff933a061eba4eaa422f8ca09e5a97d0b0dd740fd4076eba8c72d7a278523f399202dc2d043c4e0eb58a2bcd4066e2146e321810b1ee4d3afdddb4f026bcc7905ce17e033a7727b4e08f33b53c63d8c9f763fc6c31d0523eb38c30d5e40bc,
+    });
+}
+
+inline fn bitNot(comptime Type: type, rhs: Type) @TypeOf(~rhs) { // unary op under test; result type is whatever `~rhs` yields
+    return ~rhs; // bitwise complement of the operand
+}
+test bitNot {
+    try testUnary(bitNot); // hands `bitNot` to `testUnary` (defined outside this excerpt); any error it returns fails the test
+}
+
+inline fn clz(comptime Type: type, rhs: Type) @TypeOf(@clz(rhs)) { // unary op under test; return type is taken from the builtin, not from `Type`
+    return @clz(rhs); // count of leading (most-significant) zero bits of the operand
+}
+test clz {
+    try testUnary(clz); // hands `clz` to `testUnary` (defined outside this excerpt); any error it returns fails the test
 }
 
 inline fn bitAnd(comptime Type: type, lhs: Type, rhs: Type) @TypeOf(lhs & rhs) {
test/behavior/math.zig
@@ -65,6 +65,8 @@ test "@clz" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest;
 
     try testClz();
     try comptime testClz();
@@ -75,6 +77,7 @@ fn testClz() !void {
     try expect(testOneClz(u8, 0b00001010) == 4);
     try expect(testOneClz(u8, 0b00011010) == 3);
     try expect(testOneClz(u8, 0b00000000) == 8);
+    try expect(testOneClz(i8, -1) == 0);
 }
 
 test "@clz big ints" {
@@ -100,7 +103,7 @@ fn testOneClz(comptime T: type, x: T) u32 {
 
 test "@clz vectors" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -159,6 +162,8 @@ fn testCtz() !void {
     try expect(testOneCtz(u8, 0b10100000) == 5);
     try expect(testOneCtz(u8, 0b10001010) == 1);
     try expect(testOneCtz(u8, 0b00000000) == 8);
+    try expect(testOneCtz(i8, -1) == 0);
+    try expect(testOneCtz(i8, -2) == 1);
     try expect(testOneCtz(u16, 0b00000000) == 16);
 }
 
@@ -1712,7 +1717,7 @@ test "mod lazy values" {
 
 test "@clz works on both vector and scalar inputs" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
test/behavior/vector.zig
@@ -646,7 +646,7 @@ test "vector division operators" {
 
 test "vector bitwise not operator" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
test/behavior/x86_64.zig
@@ -1,8 +1,7 @@
 //! CodeGen tests for the x86_64 backend.
 
-const builtin = @import("builtin");
-
 test {
+    const builtin = @import("builtin");
     if (builtin.zig_backend != .stage2_x86_64) return error.SkipZigTest;
     if (builtin.object_format == .coff) return error.SkipZigTest;
     _ = @import("x86_64/math.zig");
tools/update_cpu_features.zig
@@ -902,8 +902,8 @@ const llvm_targets = [_]LlvmTarget{
                 .features = &.{ "v8a", "exynos" },
             },
         },
-        // LLVM removed support for v2 and v3 but zig wants to support targeting old hardware
         .extra_features = &.{
+            // LLVM removed support for v2 and v3 but zig wants to support targeting old hardware
             .{
                 .zig_name = "v2",
                 .desc = "ARMv2 architecture",
@@ -1043,10 +1043,22 @@ const llvm_targets = [_]LlvmTarget{
                 .llvm_name = "64bit-mode",
                 .omit = true,
             },
+            .{
+                .llvm_name = "alderlake",
+                .extra_deps = &.{ "smap", "smep" },
+            },
             .{
                 .llvm_name = "amdfam10",
                 .extra_deps = &.{"3dnowa"},
             },
+            .{
+                .llvm_name = "arrowlake",
+                .extra_deps = &.{ "smap", "smep" },
+            },
+            .{
+                .llvm_name = "arrowlake-s",
+                .extra_deps = &.{ "smap", "smep" },
+            },
             .{
                 .llvm_name = "athlon",
                 .extra_deps = &.{"3dnowa"},
@@ -1081,16 +1093,64 @@ const llvm_targets = [_]LlvmTarget{
             },
             .{
                 .llvm_name = "barcelona",
-                .extra_deps = &.{"3dnowa"},
+                .extra_deps = &.{ "3dnowa", "smap", "smep" },
+            },
+            .{
+                .llvm_name = "broadwell",
+                .extra_deps = &.{ "smap", "smep" },
             },
             .{
                 .llvm_name = "c3",
                 .extra_deps = &.{"3dnow"},
             },
+            .{
+                .llvm_name = "cannonlake",
+                .extra_deps = &.{ "smap", "smep" },
+            },
+            .{
+                .llvm_name = "cascadelake",
+                .extra_deps = &.{ "smap", "smep" },
+            },
+            .{
+                .llvm_name = "emeraldrapids",
+                .extra_deps = &.{ "smap", "smep" },
+            },
             .{
                 .llvm_name = "geode",
                 .extra_deps = &.{"3dnowa"},
             },
+            .{
+                .llvm_name = "goldmont",
+                .extra_deps = &.{ "smap", "smep" },
+            },
+            .{
+                .llvm_name = "goldmont_plus",
+                .extra_deps = &.{ "smap", "smep" },
+            },
+            .{
+                .llvm_name = "haswell",
+                .extra_deps = &.{"smep"},
+            },
+            .{
+                .llvm_name = "i386",
+                .extra_deps = &.{"bsf_bsr_0_clobbers_result"},
+            },
+            .{
+                .llvm_name = "i486",
+                .extra_deps = &.{"bsf_bsr_0_clobbers_result"},
+            },
+            .{
+                .llvm_name = "icelake_client",
+                .extra_deps = &.{ "smap", "smep" },
+            },
+            .{
+                .llvm_name = "icelake_server",
+                .extra_deps = &.{ "smap", "smep" },
+            },
+            .{
+                .llvm_name = "ivybridge",
+                .extra_deps = &.{"smep"},
+            },
             .{
                 .llvm_name = "k6-2",
                 .extra_deps = &.{"3dnow"},
@@ -1127,6 +1187,10 @@ const llvm_targets = [_]LlvmTarget{
                 .llvm_name = "lakemont",
                 .extra_deps = &.{"soft_float"},
             },
+            .{
+                .llvm_name = "meteorlake",
+                .extra_deps = &.{ "smap", "smep" },
+            },
             .{
                 .llvm_name = "opteron",
                 .extra_deps = &.{"3dnowa"},
@@ -1135,6 +1199,38 @@ const llvm_targets = [_]LlvmTarget{
                 .llvm_name = "opteron-sse3",
                 .extra_deps = &.{"3dnowa"},
             },
+            .{
+                .llvm_name = "raptorlake",
+                .extra_deps = &.{ "smap", "smep" },
+            },
+            .{
+                .llvm_name = "rocketlake",
+                .extra_deps = &.{ "smap", "smep" },
+            },
+            .{
+                .llvm_name = "sapphirerapids",
+                .extra_deps = &.{ "smap", "smep" },
+            },
+            .{
+                .llvm_name = "silvermont",
+                .extra_deps = &.{"smep"},
+            },
+            .{
+                .llvm_name = "skx",
+                .extra_deps = &.{ "smap", "smep" },
+            },
+            .{
+                .llvm_name = "skylake",
+                .extra_deps = &.{ "smap", "smep" },
+            },
+            .{
+                .llvm_name = "skylake_avx512",
+                .extra_deps = &.{ "smap", "smep" },
+            },
+            .{
+                .llvm_name = "tigerlake",
+                .extra_deps = &.{ "smap", "smep" },
+            },
             .{
                 .llvm_name = "winchip2",
                 .extra_deps = &.{"3dnow"},
@@ -1143,9 +1239,29 @@ const llvm_targets = [_]LlvmTarget{
                 .llvm_name = "sse4.2",
                 .extra_deps = &.{"crc32"},
             },
+            .{
+                .llvm_name = "znver1",
+                .extra_deps = &.{ "smap", "smep" },
+            },
+            .{
+                .llvm_name = "znver2",
+                .extra_deps = &.{ "smap", "smep" },
+            },
+            .{
+                .llvm_name = "znver3",
+                .extra_deps = &.{ "smap", "smep" },
+            },
+            .{
+                .llvm_name = "znver4",
+                .extra_deps = &.{ "smap", "smep" },
+            },
+            .{
+                .llvm_name = "znver5",
+                .extra_deps = &.{ "smap", "smep" },
+            },
         },
-        // Features removed from LLVM
         .extra_features = &.{
+            // Features removed from LLVM
             .{
                 .zig_name = "3dnow",
                 .desc = "Enable 3DNow! instructions",
@@ -1171,6 +1287,22 @@ const llvm_targets = [_]LlvmTarget{
                 .desc = "Prefetch with Intent to Write and T1 Hint",
                 .deps = &.{},
             },
+            // Custom Zig features
+            .{
+                .zig_name = "bsf_bsr_0_clobbers_result",
+                .desc = "BSF/BSR may clobber the lower 32-bits of the result register when the source is zero",
+                .deps = &.{},
+            },
+            .{
+                .zig_name = "smap",
+                .desc = "Enable Supervisor Mode Access Prevention",
+                .deps = &.{},
+            },
+            .{
+                .zig_name = "smep",
+                .desc = "Enable Supervisor Mode Execution Prevention",
+                .deps = &.{},
+            },
         },
         .omit_cpus = &.{
             // LLVM defines a bunch of dumb aliases with foreach loops in X86.td.