Commit 612f5784cf

Jacob Young <jacobly0@users.noreply.github.com>
2025-05-23 18:43:58
x86_64: implement optimized float `@reduce(.Add)`
1 parent 7c31f9d
src/arch/x86_64/CodeGen.zig
@@ -2389,7 +2389,7 @@ fn genBodyBlock(self: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
 }
 
 fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
-    @setEvalBranchQuota(23_100);
+    @setEvalBranchQuota(23_600);
     const pt = cg.pt;
     const zcu = pt.zcu;
     const ip = &zcu.intern_pool;
@@ -2427,7 +2427,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
             // zig fmt: off
             .select           => try cg.airSelect(inst),
             .shuffle          => try cg.airShuffle(inst),
-            .reduce_optimized => try cg.airReduce(inst),
             // zig fmt: on
 
             .arg => if (cg.debug_output != .none) {
@@ -67795,7 +67794,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .clobbers = .{ .eflags = true },
                     .each = .{ .once = &.{
                         .{ ._, ._, .xor, .dst0d, .dst0d, ._, ._ },
-                        .{ ._, ._, .movbe, .dst0w, .src0w, ._, ._ },
+                        .{ ._, ._be, .mov, .dst0w, .src0w, ._, ._ },
                     } },
                 }, .{
                     .src_constraints = .{ .{ .exact_int = 16 }, .any, .any },
@@ -67815,7 +67814,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .dst_temps = .{ .{ .rc = .general_purpose }, .unused },
                     .each = .{ .once = &.{
-                        .{ ._, ._, .movbe, .dst0d, .src0d, ._, ._ },
+                        .{ ._, ._be, .mov, .dst0d, .src0d, ._, ._ },
                     } },
                 }, .{
                     .src_constraints = .{ .{ .exact_int = 32 }, .any, .any },
@@ -67824,7 +67823,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .dst_temps = .{ .{ .ref = .src0 }, .unused },
                     .each = .{ .once = &.{
-                        .{ ._, ._, .bswap, .dst0d, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .dst0d, ._, ._, ._ },
                     } },
                 }, .{
                     .required_features = .{ .movbe, null, null, null },
@@ -67835,7 +67834,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .dst_temps = .{ .{ .rc = .general_purpose }, .unused },
                     .clobbers = .{ .eflags = true },
                     .each = .{ .once = &.{
-                        .{ ._, ._, .movbe, .dst0d, .src0d, ._, ._ },
+                        .{ ._, ._be, .mov, .dst0d, .src0d, ._, ._ },
                         .{ ._, ._r, .sa, .dst0d, .uia(32, .src0, .sub_bit_size), ._, ._ },
                     } },
                 }, .{
@@ -67846,7 +67845,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .dst_temps = .{ .{ .ref = .src0 }, .unused },
                     .clobbers = .{ .eflags = true },
                     .each = .{ .once = &.{
-                        .{ ._, ._, .bswap, .dst0d, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .dst0d, ._, ._, ._ },
                         .{ ._, ._r, .sa, .dst0d, .uia(32, .src0, .sub_bit_size), ._, ._ },
                     } },
                 }, .{
@@ -67858,7 +67857,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .dst_temps = .{ .{ .rc = .general_purpose }, .unused },
                     .clobbers = .{ .eflags = true },
                     .each = .{ .once = &.{
-                        .{ ._, ._, .movbe, .dst0d, .src0d, ._, ._ },
+                        .{ ._, ._be, .mov, .dst0d, .src0d, ._, ._ },
                         .{ ._, ._r, .sh, .dst0d, .uia(32, .src0, .sub_bit_size), ._, ._ },
                     } },
                 }, .{
@@ -67869,7 +67868,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .dst_temps = .{ .{ .ref = .src0 }, .unused },
                     .clobbers = .{ .eflags = true },
                     .each = .{ .once = &.{
-                        .{ ._, ._, .bswap, .dst0d, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .dst0d, ._, ._, ._ },
                         .{ ._, ._r, .sh, .dst0d, .uia(32, .src0, .sub_bit_size), ._, ._ },
                     } },
                 }, .{
@@ -67880,7 +67879,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .dst_temps = .{ .{ .rc = .general_purpose }, .unused },
                     .each = .{ .once = &.{
-                        .{ ._, ._, .movbe, .dst0q, .src0q, ._, ._ },
+                        .{ ._, ._be, .mov, .dst0q, .src0q, ._, ._ },
                     } },
                 }, .{
                     .src_constraints = .{ .{ .exact_int = 64 }, .any, .any },
@@ -67889,7 +67888,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .dst_temps = .{ .{ .ref = .src0 }, .unused },
                     .each = .{ .once = &.{
-                        .{ ._, ._, .bswap, .dst0q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .dst0q, ._, ._, ._ },
                     } },
                 }, .{
                     .required_features = .{ .movbe, null, null, null },
@@ -67900,7 +67899,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .dst_temps = .{ .{ .rc = .general_purpose }, .unused },
                     .clobbers = .{ .eflags = true },
                     .each = .{ .once = &.{
-                        .{ ._, ._, .movbe, .dst0q, .src0q, ._, ._ },
+                        .{ ._, ._be, .mov, .dst0q, .src0q, ._, ._ },
                         .{ ._, ._r, .sa, .dst0q, .uia(64, .src0, .sub_bit_size), ._, ._ },
                     } },
                 }, .{
@@ -67911,7 +67910,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .dst_temps = .{ .{ .ref = .src0 }, .unused },
                     .clobbers = .{ .eflags = true },
                     .each = .{ .once = &.{
-                        .{ ._, ._, .bswap, .dst0q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .dst0q, ._, ._, ._ },
                         .{ ._, ._r, .sa, .dst0q, .uia(64, .src0, .sub_bit_size), ._, ._ },
                     } },
                 }, .{
@@ -67923,7 +67922,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .dst_temps = .{ .{ .rc = .general_purpose }, .unused },
                     .clobbers = .{ .eflags = true },
                     .each = .{ .once = &.{
-                        .{ ._, ._, .movbe, .dst0q, .src0q, ._, ._ },
+                        .{ ._, ._be, .mov, .dst0q, .src0q, ._, ._ },
                         .{ ._, ._r, .sh, .dst0q, .uia(64, .src0, .sub_bit_size), ._, ._ },
                     } },
                 }, .{
@@ -67934,7 +67933,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .dst_temps = .{ .{ .ref = .src0 }, .unused },
                     .clobbers = .{ .eflags = true },
                     .each = .{ .once = &.{
-                        .{ ._, ._, .bswap, .dst0q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .dst0q, ._, ._, ._ },
                         .{ ._, ._r, .sh, .dst0q, .uia(64, .src0, .sub_bit_size), ._, ._ },
                     } },
                 }, .{
@@ -67962,7 +67961,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .mov, .tmp0d, .sia(-8, .dst0, .add_size), ._, ._ },
                         .{ ._, ._, .lea, .tmp1p, .mem(.src0), ._, ._ },
                         .{ .@"0:", ._, .mov, .tmp2q, .lea(.tmp1q), ._, ._ },
-                        .{ ._, ._, .movbe, .memi(.dst0q, .tmp0), .tmp2q, ._, ._ },
+                        .{ ._, ._be, .mov, .memi(.dst0q, .tmp0), .tmp2q, ._, ._ },
                         .{ ._, ._, .lea, .tmp1p, .lead(.tmp1, 8), ._, ._ },
                         .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ },
                         .{ ._, ._ae, .j, .@"0b", ._, ._, ._ },
@@ -67992,7 +67991,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .mov, .tmp0d, .sia(-8, .dst0, .add_size), ._, ._ },
                         .{ ._, ._, .lea, .tmp1p, .mem(.src0), ._, ._ },
                         .{ .@"0:", ._, .mov, .tmp2q, .lea(.tmp1q), ._, ._ },
-                        .{ ._, ._, .bswap, .tmp2q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .tmp2q, ._, ._, ._ },
                         .{ ._, ._, .mov, .memi(.dst0q, .tmp0), .tmp2q, ._, ._ },
                         .{ ._, ._, .lea, .tmp1p, .lead(.tmp1, 8), ._, ._ },
                         .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ },
@@ -68026,7 +68025,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._r, .sa, .tmp2q, .ui(63), ._, ._ },
                         .{ ._, ._, .mov, .memad(.dst0q, .add_size, -8), .tmp2q, ._, ._ },
                         .{ .@"0:", ._, .mov, .tmp2q, .lea(.tmp1q), ._, ._ },
-                        .{ ._, ._, .movbe, .memi(.dst0q, .tmp0), .tmp2q, ._, ._ },
+                        .{ ._, ._be, .mov, .memi(.dst0q, .tmp0), .tmp2q, ._, ._ },
                         .{ ._, ._, .lea, .tmp1p, .lead(.tmp1, 8), ._, ._ },
                         .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ },
                         .{ ._, ._ae, .j, .@"0b", ._, ._, ._ },
@@ -68059,7 +68058,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._r, .sa, .tmp2q, .ui(63), ._, ._ },
                         .{ ._, ._, .mov, .memad(.dst0q, .add_size, -8), .tmp2q, ._, ._ },
                         .{ .@"0:", ._, .mov, .tmp2q, .lea(.tmp1q), ._, ._ },
-                        .{ ._, ._, .bswap, .tmp2q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .tmp2q, ._, ._, ._ },
                         .{ ._, ._, .mov, .memi(.dst0q, .tmp0), .tmp2q, ._, ._ },
                         .{ ._, ._, .lea, .tmp1p, .lead(.tmp1, 8), ._, ._ },
                         .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ },
@@ -68091,7 +68090,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .lea, .tmp1p, .mem(.src0), ._, ._ },
                         .{ ._, ._, .mov, .memad(.dst0q, .add_size, -8), .si(0), ._, ._ },
                         .{ .@"0:", ._, .mov, .tmp2q, .lea(.tmp1q), ._, ._ },
-                        .{ ._, ._, .movbe, .memi(.dst0q, .tmp0), .tmp2q, ._, ._ },
+                        .{ ._, ._be, .mov, .memi(.dst0q, .tmp0), .tmp2q, ._, ._ },
                         .{ ._, ._, .lea, .tmp1p, .lead(.tmp1, 8), ._, ._ },
                         .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ },
                         .{ ._, ._ae, .j, .@"0b", ._, ._, ._ },
@@ -68122,7 +68121,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .lea, .tmp1p, .mem(.src0), ._, ._ },
                         .{ ._, ._, .mov, .memad(.dst0q, .add_size, -8), .si(0), ._, ._ },
                         .{ .@"0:", ._, .mov, .tmp2q, .lea(.tmp1q), ._, ._ },
-                        .{ ._, ._, .bswap, .tmp2q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .tmp2q, ._, ._, ._ },
                         .{ ._, ._, .mov, .memi(.dst0q, .tmp0), .tmp2q, ._, ._ },
                         .{ ._, ._, .lea, .tmp1p, .lead(.tmp1, 8), ._, ._ },
                         .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ },
@@ -68155,7 +68154,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .movsx, .tmp2q, .mem(.src0b), ._, ._ },
                         .{ ._, ._r, .sa, .tmp2q, .ui(63), ._, ._ },
                         .{ ._, ._, .mov, .memad(.dst0q, .add_size, -8), .tmp2q, ._, ._ },
-                        .{ .@"0:", ._, .movbe, .tmp3q, .lea(.tmp1q), ._, ._ },
+                        .{ .@"0:", ._be, .mov, .tmp3q, .lea(.tmp1q), ._, ._ },
                         .{ ._, ._, .mov, .tmp4q, .tmp3q, ._, ._ },
                         .{ ._, ._rd, .sh, .tmp3q, .tmp2q, .uia(64, .src0, .sub_bit_size_rem_64), ._ },
                         .{ ._, ._, .mov, .tmp2q, .tmp4q, ._, ._ },
@@ -68192,7 +68191,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._r, .sa, .tmp2q, .ui(63), ._, ._ },
                         .{ ._, ._, .mov, .memad(.dst0q, .add_size, -8), .tmp2q, ._, ._ },
                         .{ .@"0:", ._, .mov, .tmp3q, .lea(.tmp1q), ._, ._ },
-                        .{ ._, ._, .bswap, .tmp3q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .tmp3q, ._, ._, ._ },
                         .{ ._, ._, .mov, .tmp4q, .tmp3q, ._, ._ },
                         .{ ._, ._rd, .sh, .tmp3q, .tmp2q, .uia(64, .src0, .sub_bit_size_rem_64), ._ },
                         .{ ._, ._, .mov, .tmp2q, .tmp4q, ._, ._ },
@@ -68227,7 +68226,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .lea, .tmp1p, .mem(.src0), ._, ._ },
                         .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ },
                         .{ ._, ._, .mov, .memad(.dst0q, .add_size, -8), .tmp2q, ._, ._ },
-                        .{ .@"0:", ._, .movbe, .tmp3q, .lea(.tmp1q), ._, ._ },
+                        .{ .@"0:", ._be, .mov, .tmp3q, .lea(.tmp1q), ._, ._ },
                         .{ ._, ._, .mov, .tmp4q, .tmp3q, ._, ._ },
                         .{ ._, ._rd, .sh, .tmp3q, .tmp2q, .uia(64, .src0, .sub_bit_size_rem_64), ._ },
                         .{ ._, ._, .mov, .tmp2q, .tmp4q, ._, ._ },
@@ -68263,7 +68262,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ },
                         .{ ._, ._, .mov, .memad(.dst0q, .add_size, -8), .tmp2q, ._, ._ },
                         .{ .@"0:", ._, .mov, .tmp3q, .lea(.tmp1q), ._, ._ },
-                        .{ ._, ._, .bswap, .tmp3q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .tmp3q, ._, ._, ._ },
                         .{ ._, ._, .mov, .tmp4q, .tmp3q, ._, ._ },
                         .{ ._, ._rd, .sh, .tmp3q, .tmp2q, .uia(64, .src0, .sub_bit_size_rem_64), ._ },
                         .{ ._, ._, .mov, .tmp2q, .tmp4q, ._, ._ },
@@ -68298,7 +68297,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .lea, .tmp1p, .mem(.src0), ._, ._ },
                         .{ ._, ._, .movsx, .tmp2q, .mem(.src0b), ._, ._ },
                         .{ ._, ._r, .sa, .tmp2q, .ui(63), ._, ._ },
-                        .{ .@"0:", ._, .movbe, .tmp3q, .lea(.tmp1q), ._, ._ },
+                        .{ .@"0:", ._be, .mov, .tmp3q, .lea(.tmp1q), ._, ._ },
                         .{ ._, ._, .mov, .tmp4q, .tmp3q, ._, ._ },
                         .{ ._, ._rd, .sh, .tmp3q, .tmp2q, .uia(64, .src0, .sub_bit_size_rem_64), ._ },
                         .{ ._, ._, .mov, .tmp2q, .tmp4q, ._, ._ },
@@ -68334,7 +68333,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .movsx, .tmp2q, .mem(.src0b), ._, ._ },
                         .{ ._, ._r, .sa, .tmp2q, .ui(63), ._, ._ },
                         .{ .@"0:", ._, .mov, .tmp3q, .lea(.tmp1q), ._, ._ },
-                        .{ ._, ._, .bswap, .tmp3q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .tmp3q, ._, ._, ._ },
                         .{ ._, ._, .mov, .tmp4q, .tmp3q, ._, ._ },
                         .{ ._, ._rd, .sh, .tmp3q, .tmp2q, .uia(64, .src0, .sub_bit_size_rem_64), ._ },
                         .{ ._, ._, .mov, .tmp2q, .tmp4q, ._, ._ },
@@ -68368,7 +68367,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .mov, .tmp0d, .sia(-8, .dst0, .add_size), ._, ._ },
                         .{ ._, ._, .lea, .tmp1p, .mem(.src0), ._, ._ },
                         .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ },
-                        .{ .@"0:", ._, .movbe, .tmp3q, .lea(.tmp1q), ._, ._ },
+                        .{ .@"0:", ._be, .mov, .tmp3q, .lea(.tmp1q), ._, ._ },
                         .{ ._, ._, .mov, .tmp4q, .tmp3q, ._, ._ },
                         .{ ._, ._rd, .sh, .tmp3q, .tmp2q, .uia(64, .src0, .sub_bit_size_rem_64), ._ },
                         .{ ._, ._, .mov, .tmp2q, .tmp4q, ._, ._ },
@@ -68403,7 +68402,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .lea, .tmp1p, .mem(.src0), ._, ._ },
                         .{ ._, ._, .xor, .tmp2d, .tmp2d, ._, ._ },
                         .{ .@"0:", ._, .mov, .tmp3q, .lea(.tmp1q), ._, ._ },
-                        .{ ._, ._, .bswap, .tmp3q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .tmp3q, ._, ._, ._ },
                         .{ ._, ._, .mov, .tmp4q, .tmp3q, ._, ._ },
                         .{ ._, ._rd, .sh, .tmp3q, .tmp2q, .uia(64, .src0, .sub_bit_size_rem_64), ._ },
                         .{ ._, ._, .mov, .tmp2q, .tmp4q, ._, ._ },
@@ -69742,7 +69741,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .dst_temps = .{ .{ .ref = .src0 }, .unused },
                     .clobbers = .{ .eflags = true },
                     .each = .{ .once = &.{
-                        .{ ._, ._, .bswap, .src0d, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .src0d, ._, ._, ._ },
                         .{ ._, ._, .mov, .tmp0d, .src0d, ._, ._ },
                         .{ ._, ._r, .sh, .src0d, .ui(4), ._, ._ },
                         .{ ._, ._l, .sh, .tmp0d, .ui(4), ._, ._ },
@@ -69897,7 +69896,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .dst_temps = .{ .{ .ref = .src0 }, .unused },
                     .clobbers = .{ .eflags = true },
                     .each = .{ .once = &.{
-                        .{ ._, ._, .bswap, .src0d, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .src0d, ._, ._, ._ },
                         .{ ._, ._, .mov, .tmp0d, .src0d, ._, ._ },
                         .{ ._, ._r, .sh, .src0d, .ui(4), ._, ._ },
                         .{ ._, ._l, .sh, .tmp0d, .ui(4), ._, ._ },
@@ -70053,7 +70052,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .dst_temps = .{ .{ .ref = .src0 }, .unused },
                     .clobbers = .{ .eflags = true },
                     .each = .{ .once = &.{
-                        .{ ._, ._, .bswap, .src0d, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .src0d, ._, ._, ._ },
                         .{ ._, ._, .mov, .tmp0d, .src0d, ._, ._ },
                         .{ ._, ._r, .sh, .src0d, .ui(4), ._, ._ },
                         .{ ._, ._l, .sh, .tmp0d, .ui(4), ._, ._ },
@@ -70150,7 +70149,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .lea, .dst0p, .mem(.tmp0), ._, ._ },
                         .{ ._, ._b, .gf2p8affineq, .src0x, .lea(.dst0x), .ui(0), ._ },
                         .{ ._, ._q, .mov, .dst0q, .src0x, ._, ._ },
-                        .{ ._, ._, .bswap, .dst0q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .dst0q, ._, ._, ._ },
                     } },
                 }, .{
                     .required_features = .{ .@"64bit", null, null, null },
@@ -70174,7 +70173,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .dst_temps = .{ .{ .ref = .src0 }, .unused },
                     .clobbers = .{ .eflags = true },
                     .each = .{ .once = &.{
-                        .{ ._, ._, .bswap, .src0q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .src0q, ._, ._, ._ },
                         .{ ._, ._, .mov, .tmp0q, .src0q, ._, ._ },
                         .{ ._, ._, .mov, .tmp1q, .uia(0b00001111000011110000111100001111, .none, .repeat), ._, ._ },
                         .{ ._, ._, .@"and", .tmp0q, .tmp1q, ._, ._ },
@@ -70222,7 +70221,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .lea, .dst0p, .mem(.tmp0), ._, ._ },
                         .{ ._, .v_b, .gf2p8affineq, .tmp1x, .src0x, .lea(.dst0x), .ui(0) },
                         .{ ._, .v_q, .mov, .dst0q, .tmp1x, ._, ._ },
-                        .{ ._, ._, .bswap, .dst0q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .dst0q, ._, ._, ._ },
                         .{ ._, ._r, .sa, .dst0q, .uia(64, .dst0, .sub_bit_size), ._, ._ },
                     } },
                 }, .{
@@ -70250,7 +70249,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .lea, .dst0p, .mem(.tmp0), ._, ._ },
                         .{ ._, ._b, .gf2p8affineq, .src0x, .lea(.dst0x), .ui(0), ._ },
                         .{ ._, ._q, .mov, .dst0q, .src0x, ._, ._ },
-                        .{ ._, ._, .bswap, .dst0q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .dst0q, ._, ._, ._ },
                         .{ ._, ._r, .sa, .dst0q, .uia(64, .dst0, .sub_bit_size), ._, ._ },
                     } },
                 }, .{
@@ -70275,7 +70274,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .dst_temps = .{ .{ .ref = .src0 }, .unused },
                     .clobbers = .{ .eflags = true },
                     .each = .{ .once = &.{
-                        .{ ._, ._, .bswap, .src0q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .src0q, ._, ._, ._ },
                         .{ ._, ._, .mov, .tmp0q, .src0q, ._, ._ },
                         .{ ._, ._, .mov, .tmp1q, .uia(0b00001111000011110000111100001111, .none, .repeat), ._, ._ },
                         .{ ._, ._, .@"and", .tmp0q, .tmp1q, ._, ._ },
@@ -70378,7 +70377,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .lea, .dst0p, .mem(.tmp0), ._, ._ },
                         .{ ._, ._b, .gf2p8affineq, .src0x, .lea(.dst0x), .ui(0), ._ },
                         .{ ._, ._q, .mov, .dst0q, .src0x, ._, ._ },
-                        .{ ._, ._, .bswap, .dst0q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .dst0q, ._, ._, ._ },
                         .{ ._, ._r, .sh, .dst0q, .uia(64, .dst0, .sub_bit_size), ._, ._ },
                     } },
                 }, .{
@@ -70403,7 +70402,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .dst_temps = .{ .{ .ref = .src0 }, .unused },
                     .clobbers = .{ .eflags = true },
                     .each = .{ .once = &.{
-                        .{ ._, ._, .bswap, .src0q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .src0q, ._, ._, ._ },
                         .{ ._, ._, .mov, .tmp0q, .src0q, ._, ._ },
                         .{ ._, ._, .mov, .tmp1q, .uia(0b00001111000011110000111100001111, .none, .repeat), ._, ._ },
                         .{ ._, ._, .@"and", .tmp0q, .tmp1q, ._, ._ },
@@ -70646,10 +70645,10 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .vector_32_u8, .kind = .forward_bits_mem },
-                        .{ .type = .vector_32_u8, .kind = .{ .pshufb_bswap_mem = .{ .repeat = 2, .size = .xword } } },
+                        .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .xword } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .unused,
-                        .unused,
+                        .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
                         .unused,
@@ -70661,9 +70660,12 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .each = .{ .once = &.{
                         .{ ._, ._, .lea, .tmp2p, .mem(.tmp0), ._, ._ },
                         .{ ._, .v_b, .gf2p8affineq, .dst0y, .src0y, .lea(.tmp2y), .ui(0) },
+                        .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ },
                         .{ ._, ._, .lea, .tmp2p, .mem(.tmp1), ._, ._ },
-                        .{ ._, .vp_b, .shuf, .dst0y, .dst0y, .lea(.tmp2y), ._ },
-                        .{ ._, .v_pd, .perm, .dst0y, .dst0y, .ui(0b01_00_11_10), ._ },
+                        .{ ._, .v_dqa, .mov, .tmp4x, .lea(.tmp2x), ._, ._ },
+                        .{ ._, .vp_b, .shuf, .dst0x, .dst0x, .tmp4x, ._ },
+                        .{ ._, .vp_b, .shuf, .tmp3x, .tmp3x, .tmp4x, ._ },
+                        .{ ._, .v_f128, .insert, .dst0y, .tmp3y, .dst0x, .ui(1) },
                     } },
                 }, .{
                     .required_features = .{ .avx2, .gfni, null, null },
@@ -70701,42 +70703,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .sub, .tmp0d, .si(32), ._, ._ },
                         .{ ._, ._ae, .j, .@"0b", ._, ._, ._ },
                     } },
-                }, .{
-                    .required_features = .{ .avx, .gfni, null, null },
-                    .src_constraints = .{ .{ .exact_remainder_int = .{ .of = .yword, .is = 256 } }, .any, .any },
-                    .patterns = &.{
-                        .{ .src = .{ .to_mem, .none, .none } },
-                    },
-                    .extra_temps = .{
-                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                        .{ .type = .vector_32_u8, .kind = .forward_bits_mem },
-                        .{ .type = .vector_32_u8, .kind = .{ .pshufb_bswap_mem = .{ .repeat = 2, .size = .xword } } },
-                        .{ .type = .vector_32_u8, .kind = .{ .rc = .sse } },
-                        .{ .type = .vector_32_u8, .kind = .{ .rc = .sse } },
-                        .{ .type = .vector_32_u8, .kind = .{ .rc = .sse } },
-                        .unused,
-                        .unused,
-                        .unused,
-                        .unused,
-                    },
-                    .dst_temps = .{ .mem, .unused },
-                    .clobbers = .{ .eflags = true },
-                    .each = .{ .once = &.{
-                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ },
-                        .{ ._, .v_dqa, .mov, .tmp4y, .lea(.tmp0y), ._, ._ },
-                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp3), ._, ._ },
-                        .{ ._, .v_dqa, .mov, .tmp5y, .lea(.tmp0y), ._, ._ },
-                        .{ ._, ._, .mov, .tmp0d, .sia(-32, .dst0, .add_size), ._, ._ },
-                        .{ ._, ._, .lea, .tmp1p, .mem(.src0), ._, ._ },
-                        .{ .@"0:", .v_pd, .perm, .tmp6y, .lea(.tmp1y), .ui(0b01_00_11_10), ._ },
-                        .{ ._, .v_b, .gf2p8affineq, .tmp6y, .tmp6y, .tmp4y, .ui(0) },
-                        .{ ._, .vp_b, .shuf, .tmp6y, .tmp6y, .tmp5y, ._ },
-                        .{ ._, .v_dqu, .mov, .memi(.dst0y, .tmp0), .tmp6y, ._, ._ },
-                        .{ ._, ._, .lea, .tmp1p, .lead(.tmp1, 32), ._, ._ },
-                        .{ ._, ._, .sub, .tmp0d, .si(32), ._, ._ },
-                        .{ ._, ._ae, .j, .@"0b", ._, ._, ._ },
-                    } },
                 }, .{
                     .required_features = .{ .avx, .gfni, null, null },
                     .src_constraints = .{ .{ .exact_remainder_int = .{ .of = .xword, .is = 128 } }, .any, .any },
@@ -70819,7 +70785,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._l, .sh, .tmp6q, .ui(1), ._, ._ },
                         .{ ._, ._, .@"and", .tmp5q, .tmp4q, ._, ._ },
                         .{ ._, ._, .@"or", .tmp5q, .tmp6q, ._, ._ },
-                        .{ ._, ._, .movbe, .memi(.dst0q, .tmp0), .tmp5q, ._, ._ },
+                        .{ ._, ._be, .mov, .memi(.dst0q, .tmp0), .tmp5q, ._, ._ },
                         .{ ._, ._, .lea, .tmp1p, .lead(.tmp1, 8), ._, ._ },
                         .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ },
                         .{ ._, ._ae, .j, .@"0b", ._, ._, ._ },
@@ -70870,7 +70836,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._l, .sh, .tmp6q, .ui(1), ._, ._ },
                         .{ ._, ._, .@"and", .tmp5q, .tmp4q, ._, ._ },
                         .{ ._, ._, .@"or", .tmp5q, .tmp6q, ._, ._ },
-                        .{ ._, ._, .bswap, .tmp5q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .tmp5q, ._, ._, ._ },
                         .{ ._, ._, .mov, .memi(.dst0q, .tmp0), .tmp5q, ._, ._ },
                         .{ ._, ._, .lea, .tmp1p, .lead(.tmp1, 8), ._, ._ },
                         .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ },
@@ -70926,7 +70892,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._l, .sh, .tmp6q, .ui(1), ._, ._ },
                         .{ ._, ._, .@"and", .tmp5q, .tmp4q, ._, ._ },
                         .{ ._, ._, .@"or", .tmp5q, .tmp6q, ._, ._ },
-                        .{ ._, ._, .movbe, .memi(.dst0q, .tmp0), .tmp5q, ._, ._ },
+                        .{ ._, ._be, .mov, .memi(.dst0q, .tmp0), .tmp5q, ._, ._ },
                         .{ ._, ._, .lea, .tmp1p, .lead(.tmp1, 8), ._, ._ },
                         .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ },
                         .{ ._, ._ae, .j, .@"0b", ._, ._, ._ },
@@ -70981,7 +70947,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._l, .sh, .tmp6q, .ui(1), ._, ._ },
                         .{ ._, ._, .@"and", .tmp5q, .tmp4q, ._, ._ },
                         .{ ._, ._, .@"or", .tmp5q, .tmp6q, ._, ._ },
-                        .{ ._, ._, .bswap, .tmp5q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .tmp5q, ._, ._, ._ },
                         .{ ._, ._, .mov, .memi(.dst0q, .tmp0), .tmp5q, ._, ._ },
                         .{ ._, ._, .lea, .tmp1p, .lead(.tmp1, 8), ._, ._ },
                         .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ },
@@ -71034,7 +71000,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._l, .sh, .tmp6q, .ui(1), ._, ._ },
                         .{ ._, ._, .@"and", .tmp5q, .tmp4q, ._, ._ },
                         .{ ._, ._, .@"or", .tmp5q, .tmp6q, ._, ._ },
-                        .{ ._, ._, .movbe, .memi(.dst0q, .tmp0), .tmp5q, ._, ._ },
+                        .{ ._, ._be, .mov, .memi(.dst0q, .tmp0), .tmp5q, ._, ._ },
                         .{ ._, ._, .lea, .tmp1p, .lead(.tmp1, 8), ._, ._ },
                         .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ },
                         .{ ._, ._ae, .j, .@"0b", ._, ._, ._ },
@@ -71068,7 +71034,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .mov, .tmp3q, .uia(0b00110011001100110011001100110011, .none, .repeat), ._, ._ },
                         .{ ._, ._, .mov, .tmp4q, .uia(0b01010101010101010101010101010101, .none, .repeat), ._, ._ },
                         .{ .@"0:", ._, .mov, .tmp5q, .lea(.tmp1q), ._, ._ },
-                        .{ ._, ._, .bswap, .tmp5q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .tmp5q, ._, ._, ._ },
                         .{ ._, ._, .mov, .tmp6q, .tmp5q, ._, ._ },
                         .{ ._, ._, .@"and", .tmp6q, .tmp2q, ._, ._ },
                         .{ ._, ._r, .sh, .tmp5q, .ui(4), ._, ._ },
@@ -71123,7 +71089,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .mov, .tmp3q, .uia(0b00001111000011110000111100001111, .none, .repeat), ._, ._ },
                         .{ ._, ._, .mov, .tmp4q, .uia(0b00110011001100110011001100110011, .none, .repeat), ._, ._ },
                         .{ ._, ._, .mov, .tmp5q, .uia(0b01010101010101010101010101010101, .none, .repeat), ._, ._ },
-                        .{ .@"0:", ._, .movbe, .tmp6q, .lea(.tmp1q), ._, ._ },
+                        .{ .@"0:", ._be, .mov, .tmp6q, .lea(.tmp1q), ._, ._ },
                         .{ ._, ._, .mov, .tmp7q, .tmp6q, ._, ._ },
                         .{ ._, ._, .@"and", .tmp7q, .tmp3q, ._, ._ },
                         .{ ._, ._r, .sh, .tmp6q, .ui(4), ._, ._ },
@@ -71182,7 +71148,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .mov, .tmp4q, .uia(0b00110011001100110011001100110011, .none, .repeat), ._, ._ },
                         .{ ._, ._, .mov, .tmp5q, .uia(0b01010101010101010101010101010101, .none, .repeat), ._, ._ },
                         .{ .@"0:", ._, .mov, .tmp6q, .lea(.tmp1q), ._, ._ },
-                        .{ ._, ._, .bswap, .tmp6q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .tmp6q, ._, ._, ._ },
                         .{ ._, ._, .mov, .tmp7q, .tmp6q, ._, ._ },
                         .{ ._, ._, .@"and", .tmp7q, .tmp3q, ._, ._ },
                         .{ ._, ._r, .sh, .tmp6q, .ui(4), ._, ._ },
@@ -71238,7 +71204,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .mov, .tmp3q, .uia(0b00001111000011110000111100001111, .none, .repeat), ._, ._ },
                         .{ ._, ._, .mov, .tmp4q, .uia(0b00110011001100110011001100110011, .none, .repeat), ._, ._ },
                         .{ ._, ._, .mov, .tmp5q, .uia(0b01010101010101010101010101010101, .none, .repeat), ._, ._ },
-                        .{ .@"0:", ._, .movbe, .tmp6q, .lea(.tmp1q), ._, ._ },
+                        .{ .@"0:", ._be, .mov, .tmp6q, .lea(.tmp1q), ._, ._ },
                         .{ ._, ._, .mov, .tmp7q, .tmp6q, ._, ._ },
                         .{ ._, ._, .@"and", .tmp7q, .tmp3q, ._, ._ },
                         .{ ._, ._r, .sh, .tmp6q, .ui(4), ._, ._ },
@@ -71295,7 +71261,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .mov, .tmp4q, .uia(0b00110011001100110011001100110011, .none, .repeat), ._, ._ },
                         .{ ._, ._, .mov, .tmp5q, .uia(0b01010101010101010101010101010101, .none, .repeat), ._, ._ },
                         .{ .@"0:", ._, .mov, .tmp6q, .lea(.tmp1q), ._, ._ },
-                        .{ ._, ._, .bswap, .tmp6q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .tmp6q, ._, ._, ._ },
                         .{ ._, ._, .mov, .tmp7q, .tmp6q, ._, ._ },
                         .{ ._, ._, .@"and", .tmp7q, .tmp3q, ._, ._ },
                         .{ ._, ._r, .sh, .tmp6q, .ui(4), ._, ._ },
@@ -71352,7 +71318,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .mov, .tmp3q, .uia(0b00001111000011110000111100001111, .none, .repeat), ._, ._ },
                         .{ ._, ._, .mov, .tmp4q, .uia(0b00110011001100110011001100110011, .none, .repeat), ._, ._ },
                         .{ ._, ._, .mov, .tmp5q, .uia(0b01010101010101010101010101010101, .none, .repeat), ._, ._ },
-                        .{ .@"0:", ._, .movbe, .tmp6q, .lea(.tmp1q), ._, ._ },
+                        .{ .@"0:", ._be, .mov, .tmp6q, .lea(.tmp1q), ._, ._ },
                         .{ ._, ._, .mov, .tmp7q, .tmp6q, ._, ._ },
                         .{ ._, ._, .@"and", .tmp7q, .tmp3q, ._, ._ },
                         .{ ._, ._r, .sh, .tmp6q, .ui(4), ._, ._ },
@@ -71410,7 +71376,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .mov, .tmp4q, .uia(0b00110011001100110011001100110011, .none, .repeat), ._, ._ },
                         .{ ._, ._, .mov, .tmp5q, .uia(0b01010101010101010101010101010101, .none, .repeat), ._, ._ },
                         .{ .@"0:", ._, .mov, .tmp6q, .lea(.tmp1q), ._, ._ },
-                        .{ ._, ._, .bswap, .tmp6q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .tmp6q, ._, ._, ._ },
                         .{ ._, ._, .mov, .tmp7q, .tmp6q, ._, ._ },
                         .{ ._, ._, .@"and", .tmp7q, .tmp3q, ._, ._ },
                         .{ ._, ._r, .sh, .tmp6q, .ui(4), ._, ._ },
@@ -71465,7 +71431,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .mov, .tmp3q, .uia(0b00001111000011110000111100001111, .none, .repeat), ._, ._ },
                         .{ ._, ._, .mov, .tmp4q, .uia(0b00110011001100110011001100110011, .none, .repeat), ._, ._ },
                         .{ ._, ._, .mov, .tmp5q, .uia(0b01010101010101010101010101010101, .none, .repeat), ._, ._ },
-                        .{ .@"0:", ._, .movbe, .tmp6q, .lea(.tmp1q), ._, ._ },
+                        .{ .@"0:", ._be, .mov, .tmp6q, .lea(.tmp1q), ._, ._ },
                         .{ ._, ._, .mov, .tmp7q, .tmp6q, ._, ._ },
                         .{ ._, ._, .@"and", .tmp7q, .tmp3q, ._, ._ },
                         .{ ._, ._r, .sh, .tmp6q, .ui(4), ._, ._ },
@@ -71521,7 +71487,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .mov, .tmp4q, .uia(0b00110011001100110011001100110011, .none, .repeat), ._, ._ },
                         .{ ._, ._, .mov, .tmp5q, .uia(0b01010101010101010101010101010101, .none, .repeat), ._, ._ },
                         .{ .@"0:", ._, .mov, .tmp6q, .lea(.tmp1q), ._, ._ },
-                        .{ ._, ._, .bswap, .tmp6q, ._, ._, ._ },
+                        .{ ._, .b_, .swap, .tmp6q, ._, ._, ._ },
                         .{ ._, ._, .mov, .tmp7q, .tmp6q, ._, ._ },
                         .{ ._, ._, .@"and", .tmp7q, .tmp3q, ._, ._ },
                         .{ ._, ._r, .sh, .tmp6q, .ui(4), ._, ._ },
@@ -98883,7 +98849,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .clobbers = .{ .eflags = true },
                     .each = .{ .once = &.{
                         .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
-                        .{ ._, .v_dqa, .mov, .tmp2x, .lea(.tmp0x), ._, ._ },
+                        .{ ._, .v_ps, .mova, .tmp2x, .lea(.tmp0x), ._, ._ },
                         .{ ._, ._, .mov, .tmp0p, .sa(.dst0, .sub_unaligned_size), ._, ._ },
                         .{ .@"0:", .v_ps, .cvtph2, .tmp3x, .memsia(.src0q, .@"2", .tmp0, .add_unaligned_size), ._, ._ },
                         .{ ._, .v_, .cvttps2dq, .tmp3x, .tmp3x, ._, ._ },
@@ -118968,26 +118934,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 } },
                             } },
                         } ++ [_]Select.Case{ .{
-                            .required_features = .{ .sse, null, null, null },
-                            .dst_constraints = .{ .{ .int = .byte }, .any },
-                            .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                            .patterns = &.{
-                                .{ .src = .{ .mut_mem, .none, .none } },
-                                .{ .src = .{ .mut_gpr, .none, .none } },
-                                .{ .src = .{ .to_mut_sse, .none, .none } },
-                            },
-                            .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                            .each = .{ .once = &.{} },
-                        }, .{
-                            .dst_constraints = .{ .{ .int = .byte }, .any },
-                            .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                            .patterns = &.{
-                                .{ .src = .{ .mut_mem, .none, .none } },
-                                .{ .src = .{ .to_mut_gpr, .none, .none } },
-                            },
-                            .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                            .each = .{ .once = &.{} },
-                        }, .{
                             .required_features = .{ .avx, null, null, null },
                             .dst_constraints = .{ .{ .int = .byte }, .any },
                             .src_constraints = .{ .{ .scalar_int = .{ .of = .word, .is = .byte } }, .any, .any },
@@ -119627,7 +119573,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                                 .unused,
                             },
-                            .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                            .dst_temps = .{ .{ .rc = .sse }, .unused },
                             .each = .{ .once = &.{
                                 .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
                                 .{ ._, .vp_, mir_tag, .dst0y, .dst0y, .memd(.src0y, 32), ._ },
@@ -119669,7 +119615,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                                 .unused,
                             },
-                            .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                            .dst_temps = .{ .{ .rc = .sse }, .unused },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
@@ -119717,6 +119663,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
@@ -119755,6 +119702,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
                                 .{ ._, .v_ps, .mova, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
@@ -119793,6 +119741,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
                                 .{ ._, ._dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -119838,6 +119787,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -119897,6 +119847,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -119948,6 +119899,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -119999,6 +119951,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, ._dqa, .mov, .dst0x, .lea(.tmp0x), ._, ._ },
@@ -120025,7 +119978,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         }, .{
                             .required_features = .{ .slow_incdec, null, null, null },
                             .dst_constraints = .{ .{ .int = .byte }, .any },
-                            .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .byte, .is = .byte } }, .any, .any },
+                            .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .byte, .is = .byte } }, .any, .any },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .none, .none } },
                             },
@@ -120053,7 +120006,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             } },
                         }, .{
                             .dst_constraints = .{ .{ .int = .byte }, .any },
-                            .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .byte, .is = .byte } }, .any, .any },
+                            .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .byte, .is = .byte } }, .any, .any },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .none, .none } },
                             },
@@ -120079,26 +120032,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, ._c, .de, .tmp0d, ._, ._, ._ },
                                 .{ ._, ._ns, .j, .@"0b", ._, ._, ._ },
                             } },
-                        }, .{
-                            .required_features = .{ .sse, null, null, null },
-                            .dst_constraints = .{ .{ .int = .word }, .any },
-                            .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                            .patterns = &.{
-                                .{ .src = .{ .mut_mem, .none, .none } },
-                                .{ .src = .{ .mut_gpr, .none, .none } },
-                                .{ .src = .{ .to_mut_sse, .none, .none } },
-                            },
-                            .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                            .each = .{ .once = &.{} },
-                        }, .{
-                            .dst_constraints = .{ .{ .int = .word }, .any },
-                            .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                            .patterns = &.{
-                                .{ .src = .{ .mut_mem, .none, .none } },
-                                .{ .src = .{ .to_mut_gpr, .none, .none } },
-                            },
-                            .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                            .each = .{ .once = &.{} },
                         }, .{
                             .required_features = .{ .avx, null, null, null },
                             .dst_constraints = .{ .{ .int = .word }, .any },
@@ -120573,7 +120506,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                                 .unused,
                             },
-                            .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                            .dst_temps = .{ .{ .rc = .sse }, .unused },
                             .each = .{ .once = &.{
                                 .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
                                 .{ ._, .vp_, mir_tag, .dst0y, .dst0y, .memd(.src0y, 32), ._ },
@@ -120613,7 +120546,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                                 .unused,
                             },
-                            .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                            .dst_temps = .{ .{ .rc = .sse }, .unused },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
@@ -120659,6 +120592,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
@@ -120695,6 +120629,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
                                 .{ ._, .v_ps, .mova, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
@@ -120731,6 +120666,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
                                 .{ ._, ._dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -120773,6 +120709,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -120830,6 +120767,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -120879,6 +120817,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -120928,6 +120867,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, ._dqa, .mov, .dst0x, .lea(.tmp0x), ._, ._ },
@@ -120950,7 +120890,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             } },
                         }, .{
                             .dst_constraints = .{ .{ .int = .word }, .any },
-                            .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .word, .is = .word } }, .any, .any },
+                            .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .word, .is = .word } }, .any, .any },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .none, .none } },
                             },
@@ -120976,26 +120916,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, ._, .sub, .tmp0d, .si(2), ._, ._ },
                                 .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
                             } },
-                        }, .{
-                            .required_features = .{ .sse, null, null, null },
-                            .dst_constraints = .{ .{ .int = .dword }, .any },
-                            .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                            .patterns = &.{
-                                .{ .src = .{ .mut_mem, .none, .none } },
-                                .{ .src = .{ .mut_gpr, .none, .none } },
-                                .{ .src = .{ .to_mut_sse, .none, .none } },
-                            },
-                            .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                            .each = .{ .once = &.{} },
-                        }, .{
-                            .dst_constraints = .{ .{ .int = .dword }, .any },
-                            .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                            .patterns = &.{
-                                .{ .src = .{ .mut_mem, .none, .none } },
-                                .{ .src = .{ .to_mut_gpr, .none, .none } },
-                            },
-                            .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                            .each = .{ .once = &.{} },
                         }, .{
                             .required_features = .{ .avx, null, null, null },
                             .dst_constraints = .{ .{ .int = .dword }, .any },
@@ -121316,7 +121236,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                                 .unused,
                             },
-                            .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                            .dst_temps = .{ .{ .rc = .sse }, .unused },
                             .each = .{ .once = &.{
                                 .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
                                 .{ ._, .vp_, mir_tag, .dst0y, .dst0y, .memd(.src0y, 32), ._ },
@@ -121354,7 +121274,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                                 .unused,
                             },
-                            .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                            .dst_temps = .{ .{ .rc = .sse }, .unused },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
@@ -121398,6 +121318,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
@@ -121432,6 +121353,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
                                 .{ ._, .v_ps, .mova, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
@@ -121466,6 +121388,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
                                 .{ ._, ._dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -121505,6 +121428,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -121560,6 +121484,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -121607,6 +121532,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -121654,6 +121580,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, ._dqa, .mov, .dst0x, .lea(.tmp0x), ._, ._ },
@@ -121673,7 +121600,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             } },
                         }, .{
                             .dst_constraints = .{ .{ .int = .dword }, .any },
-                            .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .dword, .is = .dword } }, .any, .any },
+                            .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .dword, .is = .dword } }, .any, .any },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .none, .none } },
                             },
@@ -121699,27 +121626,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, ._, .sub, .tmp0d, .si(4), ._, ._ },
                                 .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
                             } },
-                        }, .{
-                            .required_features = .{ .sse, null, null, null },
-                            .dst_constraints = .{ .{ .int = .qword }, .any },
-                            .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                            .patterns = &.{
-                                .{ .src = .{ .mut_mem, .none, .none } },
-                                .{ .src = .{ .mut_gpr, .none, .none } },
-                                .{ .src = .{ .to_mut_sse, .none, .none } },
-                            },
-                            .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                            .each = .{ .once = &.{} },
-                        }, .{
-                            .required_features = .{ .@"64bit", null, null, null },
-                            .dst_constraints = .{ .{ .int = .qword }, .any },
-                            .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                            .patterns = &.{
-                                .{ .src = .{ .mut_mem, .none, .none } },
-                                .{ .src = .{ .to_mut_gpr, .none, .none } },
-                            },
-                            .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                            .each = .{ .once = &.{} },
                         }, .{
                             .required_features = .{ .avx, null, null, null },
                             .dst_constraints = .{ .{ .int = .qword }, .any },
@@ -121898,7 +121804,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                                 .unused,
                             },
-                            .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                            .dst_temps = .{ .{ .rc = .sse }, .unused },
                             .each = .{ .once = &.{
                                 .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
                                 .{ ._, .vp_, mir_tag, .dst0y, .dst0y, .memd(.src0y, 32), ._ },
@@ -121934,7 +121840,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                                 .unused,
                             },
-                            .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                            .dst_temps = .{ .{ .rc = .sse }, .unused },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
@@ -121976,6 +121882,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
@@ -122008,6 +121915,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
                                 .{ ._, .v_ps, .mova, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
@@ -122040,6 +121948,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
                                 .{ ._, ._dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -122077,6 +121986,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -122130,6 +122040,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -122175,6 +122086,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -122220,6 +122132,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, ._dqa, .mov, .dst0x, .lea(.tmp0x), ._, ._ },
@@ -122238,7 +122151,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         }, .{
                             .required_features = .{ .@"64bit", null, null, null },
                             .dst_constraints = .{ .{ .int = .qword }, .any },
-                            .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .qword, .is = .qword } }, .any, .any },
+                            .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .qword, .is = .qword } }, .any, .any },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .none, .none } },
                             },
@@ -122264,25 +122177,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ },
                                 .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
                             } },
-                        }, .{
-                            .required_features = .{ .sse, null, null, null },
-                            .dst_constraints = .{ .{ .int = .xword }, .any },
-                            .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                            .patterns = &.{
-                                .{ .src = .{ .mut_mem, .none, .none } },
-                                .{ .src = .{ .mut_gpr, .none, .none } },
-                                .{ .src = .{ .to_mut_sse, .none, .none } },
-                            },
-                            .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                            .each = .{ .once = &.{} },
-                        }, .{
-                            .dst_constraints = .{ .{ .int = .xword }, .any },
-                            .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                            .patterns = &.{
-                                .{ .src = .{ .to_mut_mem, .none, .none } },
-                            },
-                            .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                            .each = .{ .once = &.{} },
                         }, .{
                             .required_features = .{ .avx2, null, null, null },
                             .dst_constraints = .{ .{ .int = .xword }, .any },
@@ -122353,7 +122247,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                                 .unused,
                             },
-                            .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                            .dst_temps = .{ .{ .rc = .sse }, .unused },
                             .each = .{ .once = &.{
                                 .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
                                 .{ ._, .vp_, mir_tag, .dst0y, .dst0y, .memd(.src0y, 32), ._ },
@@ -122408,6 +122302,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
@@ -122438,6 +122333,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
                                 .{ ._, .v_ps, .mova, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
@@ -122450,7 +122346,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         }, .{
                             .required_features = .{ .sse2, null, null, null },
                             .dst_constraints = .{ .{ .int = .xword }, .any },
-                            .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .xword, .is = .xword } }, .any, .any },
+                            .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .xword, .is = .xword } }, .any, .any },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .none, .none } },
                             },
@@ -122468,6 +122364,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
                                 .{ ._, ._dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -122503,6 +122400,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -122554,6 +122452,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -122597,6 +122496,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -122640,6 +122540,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, ._dqa, .mov, .dst0x, .lea(.tmp0x), ._, ._ },
@@ -122653,25 +122554,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
                                 .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
                             } },
-                        }, .{
-                            .required_features = .{ .avx, null, null, null },
-                            .dst_constraints = .{ .{ .int = .yword }, .any },
-                            .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                            .patterns = &.{
-                                .{ .src = .{ .mut_mem, .none, .none } },
-                                .{ .src = .{ .mut_gpr, .none, .none } },
-                                .{ .src = .{ .to_mut_sse, .none, .none } },
-                            },
-                            .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                            .each = .{ .once = &.{} },
-                        }, .{
-                            .dst_constraints = .{ .{ .int = .yword }, .any },
-                            .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                            .patterns = &.{
-                                .{ .src = .{ .to_mut_mem, .none, .none } },
-                            },
-                            .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                            .each = .{ .once = &.{} },
                         }, .{
                             .required_features = .{ .avx512f, null, null, null },
                             .dst_constraints = .{ .{ .int = .yword }, .any },
@@ -122679,7 +122561,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .none, .none } },
                             },
-                            .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                            .dst_temps = .{ .{ .rc = .sse }, .unused },
                             .each = .{ .once = &.{
                                 .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
                                 .{ ._, .vp_, mir_tag, .dst0y, .dst0y, .memd(.src0y, 32), ._ },
@@ -122687,7 +122569,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         }, .{
                             .required_features = .{ .avx2, null, null, null },
                             .dst_constraints = .{ .{ .int = .yword }, .any },
-                            .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .yword, .is = .yword } }, .any, .any },
+                            .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .yword, .is = .yword } }, .any, .any },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .none, .none } },
                             },
@@ -122705,6 +122587,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
@@ -122715,7 +122598,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         }, .{
                             .required_features = .{ .avx, null, null, null },
                             .dst_constraints = .{ .{ .int = .yword }, .any },
-                            .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .yword, .is = .yword } }, .any, .any },
+                            .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .yword, .is = .yword } }, .any, .any },
                             .patterns = &.{
                                 .{ .src = .{ .to_mem, .none, .none } },
                             },
@@ -122733,6 +122616,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
                                 .{ ._, .v_ps, .mova, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
@@ -122768,6 +122652,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -122817,6 +122702,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -122858,6 +122744,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .unused,
                             },
                             .dst_temps = .{ .{ .rc = .sse }, .unused },
+                            .clobbers = .{ .eflags = true },
                             .each = .{ .once = &.{
                                 .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                                 .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -122871,14 +122758,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, ._, .sub, .tmp0d, .si(32), ._, ._ },
                                 .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
                             } },
-                        }, .{
-                            .dst_constraints = .{ .any_int, .any },
-                            .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                            .patterns = &.{
-                                .{ .src = .{ .to_mut_mem, .none, .none } },
-                            },
-                            .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                            .each = .{ .once = &.{} },
                         }, .{
                             .required_features = .{ .@"64bit", null, null, null },
                             .dst_constraints = .{ .{ .remainder_int = .{ .of = .qword, .is = .qword } }, .any },
@@ -122920,26 +122799,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .Min, .Max => unreachable,
                     .Add => comptime &.{ .{
-                        .required_features = .{ .sse, null, null, null },
-                        .dst_constraints = .{ .{ .int = .byte }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .mut_mem, .none, .none } },
-                            .{ .src = .{ .mut_gpr, .none, .none } },
-                            .{ .src = .{ .to_mut_sse, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
-                    }, .{
-                        .dst_constraints = .{ .{ .int = .byte }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .mut_mem, .none, .none } },
-                            .{ .src = .{ .to_mut_gpr, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
-                    }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .byte }, .any },
                         .src_constraints = .{ .{ .scalar_int = .{ .of = .word, .is = .byte } }, .any, .any },
@@ -123513,7 +123372,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                             .unused,
                         },
-                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
                         .each = .{ .once = &.{
                             .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
                             .{ ._, .vp_b, .add, .dst0y, .dst0y, .memd(.src0y, 32), ._ },
@@ -123548,7 +123407,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                             .unused,
                         },
-                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
@@ -123588,6 +123447,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
@@ -123626,6 +123486,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -123662,6 +123523,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, ._dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -123700,6 +123562,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -123744,6 +123607,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -123784,6 +123648,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -123824,6 +123689,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, ._dqa, .mov, .dst0x, .lea(.tmp0x), ._, ._ },
@@ -123846,7 +123712,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     }, .{
                         .required_features = .{ .slow_incdec, null, null, null },
                         .dst_constraints = .{ .{ .int = .byte }, .any },
-                        .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .byte, .is = .byte } }, .any, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .byte, .is = .byte } }, .any, .any },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .none, .none } },
                         },
@@ -123874,7 +123740,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         } },
                     }, .{
                         .dst_constraints = .{ .{ .int = .byte }, .any },
-                        .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .byte, .is = .byte } }, .any, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .byte, .is = .byte } }, .any, .any },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .none, .none } },
                         },
@@ -123901,25 +123767,16 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, ._ns, .j, .@"0b", ._, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .sse, null, null, null },
-                        .dst_constraints = .{ .{ .int = .word }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .mut_mem, .none, .none } },
-                            .{ .src = .{ .mut_gpr, .none, .none } },
-                            .{ .src = .{ .to_mut_sse, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
-                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
+                        .src_constraints = .{ .{ .scalar_int = .{ .of = .dword, .is = .word } }, .any, .any },
                         .patterns = &.{
-                            .{ .src = .{ .mut_mem, .none, .none } },
-                            .{ .src = .{ .to_mut_gpr, .none, .none } },
+                            .{ .src = .{ .to_sse, .none, .none } },
                         },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .vph_w, .add, .dst0x, .src0x, .src0x, ._ },
+                        } },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -123945,6 +123802,17 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .srl, .tmp0x, .src0x, .ui(16), ._ },
                             .{ ._, .vp_w, .add, .dst0x, .src0x, .tmp0x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .ssse3, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .int = .word }, .any },
+                        .src_constraints = .{ .{ .scalar_int = .{ .of = .dword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mut_sse, .none, .none } },
+                        },
+                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .ph_w, .add, .dst0x, .src0x, ._, ._ },
+                        } },
                     }, .{
                         .required_features = .{ .sse2, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -123971,6 +123839,18 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .p_d, .srl, .tmp0x, .ui(16), ._, ._ },
                             .{ ._, .p_w, .add, .dst0x, .tmp0x, ._, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .int = .word }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_int = .{ .of = .qword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .vph_w, .add, .dst0x, .src0x, .src0x, ._ },
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ },
+                        } },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -123998,6 +123878,18 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .srl, .tmp0x, .dst0x, .ui(16), ._ },
                             .{ ._, .vp_w, .add, .dst0x, .dst0x, .tmp0x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .ssse3, .fast_hops, null, null }, // two-operand horizontal-add variant; the plain `.sse2` entry follows
+                        .dst_constraints = .{ .{ .int = .word }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_int = .{ .of = .qword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mut_sse, .none, .none } },
+                        },
+                        .dst_temps = .{ .{ .ref = .src0 }, .unused }, // dst0 refers to src0, hence the `to_mut_sse` pattern above
+                        .each = .{ .once = &.{
+                            .{ ._, .ph_w, .add, .dst0x, .src0x, ._, ._ }, // presumably phaddw: sums adjacent word pairs in place
+                            .{ ._, .ph_w, .add, .dst0x, .dst0x, ._, ._ }, // second pass; word 0 should now hold the sum of all four words
+                        } },
                     }, .{
                         .required_features = .{ .sse2, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -124026,6 +123918,32 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .p_d, .srl, .tmp0x, .ui(16), ._, ._ },
                             .{ ._, .p_w, .add, .dst0x, .tmp0x, ._, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null }, // horizontal-add variant; the plain `.avx` entry follows
+                        .dst_constraints = .{ .{ .int = .word }, .any },
+                        .src_constraints = .{ .{ .scalar_int = .{ .of = .qword, .is = .word } }, .any, .any }, // non-exact qword; NOTE(review): presumably the 3-element case — confirm
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_8_u16, .kind = .{ .rc = .sse } }, // tmp0: holds the pairwise sums
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .vph_w, .add, .tmp0x, .src0x, .src0x, ._ }, // presumably vphaddw: tmp0 word 0 = src word 0 + word 1
+                            .{ ._, .vp_d, .shuf, .dst0x, .src0x, .ui(0b01_01_01_01), ._ }, // every dword selector is 1, so dword 1 of src0 lands in dword 0 of dst0
+                            .{ ._, .vp_w, .add, .dst0x, .tmp0x, .dst0x, ._ }, // word 0 = (word 0 + word 1) + word 2
+                        } },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -124053,6 +123971,32 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .srl, .dst0x, .src0x, .ui(16), ._ },
                             .{ ._, .vp_w, .add, .dst0x, .tmp0x, .dst0x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .ssse3, .fast_hops, null, null }, // two-operand horizontal-add variant; the plain `.sse2` entry follows
+                        .dst_constraints = .{ .{ .int = .word }, .any },
+                        .src_constraints = .{ .{ .scalar_int = .{ .of = .qword, .is = .word } }, .any, .any }, // non-exact qword; NOTE(review): presumably the 3-element case — confirm
+                        .patterns = &.{
+                            .{ .src = .{ .to_mut_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_8_u16, .kind = .{ .rc = .sse } }, // tmp0: shuffled copy of the source
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .ref = .src0 }, .unused }, // dst0 refers to src0, hence the `to_mut_sse` pattern above
+                        .each = .{ .once = &.{
+                            .{ ._, .p_d, .shuf, .tmp0x, .src0x, .ui(0b01_01_01_01), ._ }, // taken first: the next instruction overwrites src0 through dst0
+                            .{ ._, .ph_w, .add, .dst0x, .src0x, ._, ._ }, // presumably phaddw: word 0 = src word 0 + word 1
+                            .{ ._, .p_w, .add, .dst0x, .tmp0x, ._, ._ }, // word 0 += word 2 (dword 1 of the original source)
+                        } },
                     }, .{
                         .required_features = .{ .sse2, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -124080,6 +124024,19 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .p_d, .srl, .dst0x, .ui(16), ._, ._ },
                             .{ ._, .p_w, .add, .dst0x, .tmp0x, ._, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null }, // horizontal-add variant; the plain `.avx` entry follows
+                        .dst_constraints = .{ .{ .int = .word }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_int = .{ .of = .xword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .vph_w, .add, .dst0x, .src0x, .src0x, ._ }, // presumably vphaddw: 8 words -> 4 pair sums
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // 4 -> 2
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // 2 -> 1; word 0 should hold the full sum
+                        } },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -124109,6 +124066,19 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .srl, .tmp0x, .dst0x, .ui(16), ._ },
                             .{ ._, .vp_w, .add, .dst0x, .dst0x, .tmp0x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .ssse3, .fast_hops, null, null }, // two-operand horizontal-add variant; the plain `.sse2` entry follows
+                        .dst_constraints = .{ .{ .int = .word }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_int = .{ .of = .xword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mut_sse, .none, .none } },
+                        },
+                        .dst_temps = .{ .{ .ref = .src0 }, .unused }, // dst0 refers to src0, hence the `to_mut_sse` pattern above
+                        .each = .{ .once = &.{
+                            .{ ._, .ph_w, .add, .dst0x, .src0x, ._, ._ }, // presumably phaddw: 8 words -> 4 pair sums
+                            .{ ._, .ph_w, .add, .dst0x, .dst0x, ._, ._ }, // 4 -> 2
+                            .{ ._, .ph_w, .add, .dst0x, .dst0x, ._, ._ }, // 2 -> 1; word 0 should hold the full sum
+                        } },
                     }, .{
                         .required_features = .{ .sse2, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -124139,6 +124109,34 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .p_d, .srl, .tmp0x, .ui(16), ._, ._ },
                             .{ ._, .p_w, .add, .dst0x, .tmp0x, ._, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null }, // horizontal-add variant; the plain `.avx` entry follows
+                        .dst_constraints = .{ .{ .int = .word }, .any },
+                        .src_constraints = .{ .{ .scalar_int = .{ .of = .xword, .is = .word } }, .any, .any }, // non-exact xword, so the source is masked before summing
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp0: address of the mask
+                            .{ .type = .vector_16_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: mask derived from src0; NOTE(review): presumably clears the unused tail lanes — confirm
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of tmp1
+                            .{ ._, .vp_, .@"and", .dst0x, .src0x, .lea(.tmp0x), ._ }, // dst0 = src0 & mask
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // presumably vphaddw: 8 words -> 4 pair sums
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // 4 -> 2
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // 2 -> 1; word 0 should hold the full sum
+                        } },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -124170,6 +124168,34 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .srl, .tmp2x, .dst0x, .ui(16), ._ },
                             .{ ._, .vp_w, .add, .dst0x, .dst0x, .tmp2x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .ssse3, .fast_hops, null, null }, // two-operand horizontal-add variant; the plain `.sse2` entry follows
+                        .dst_constraints = .{ .{ .int = .word }, .any },
+                        .src_constraints = .{ .{ .scalar_int = .{ .of = .xword, .is = .word } }, .any, .any }, // non-exact xword, so the source is masked before summing
+                        .patterns = &.{
+                            .{ .src = .{ .to_mut_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp0: address of the mask
+                            .{ .type = .vector_16_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: mask derived from src0; NOTE(review): presumably clears the unused tail lanes — confirm
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .ref = .src0 }, .unused }, // dst0 refers to src0, hence the `to_mut_sse` pattern above
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of tmp1
+                            .{ ._, .p_, .@"and", .dst0x, .lea(.tmp0x), ._, ._ }, // dst0 (= src0) &= mask
+                            .{ ._, .ph_w, .add, .dst0x, .dst0x, ._, ._ }, // presumably phaddw: 8 words -> 4 pair sums
+                            .{ ._, .ph_w, .add, .dst0x, .dst0x, ._, ._ }, // 4 -> 2
+                            .{ ._, .ph_w, .add, .dst0x, .dst0x, ._, ._ }, // 2 -> 1; word 0 should hold the full sum
+                        } },
                     }, .{
                         .required_features = .{ .sse2, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -124202,6 +124228,21 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .p_d, .srl, .tmp2x, .ui(16), ._, ._ },
                             .{ ._, .p_w, .add, .dst0x, .tmp2x, ._, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx2, .fast_hops, null, null }, // horizontal-add variant; the plain `.avx2` entry follows
+                        .dst_constraints = .{ .{ .int = .word }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_int = .{ .of = .yword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused }, // fresh register: src0x is still read after dst0x is first written
+                        .each = .{ .once = &.{
+                            .{ ._, .v_i128, .extract, .dst0x, .src0y, .ui(1), ._ }, // presumably vextracti128: dst0 = 128-bit lane 1 of src0
+                            .{ ._, .vp_w, .add, .dst0x, .src0x, .dst0x, ._ }, // fold the two halves: 16 words -> 8
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // presumably vphaddw: 8 -> 4
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // 4 -> 2
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // 2 -> 1; word 0 should hold the full sum
+                        } },
                     }, .{
                         .required_features = .{ .avx2, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -124233,6 +124274,21 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .srl, .tmp0x, .dst0x, .ui(16), ._ },
                             .{ ._, .vp_w, .add, .dst0x, .dst0x, .tmp0x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null }, // horizontal-add variant; the plain `.avx` entry follows
+                        .dst_constraints = .{ .{ .int = .word }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_int = .{ .of = .yword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused }, // fresh register: src0x is still read after dst0x is first written
+                        .each = .{ .once = &.{
+                            .{ ._, .v_f128, .extract, .dst0x, .src0y, .ui(1), ._ }, // presumably vextractf128 (the avx2 entry uses `v_i128`): dst0 = 128-bit lane 1 of src0
+                            .{ ._, .vp_w, .add, .dst0x, .src0x, .dst0x, ._ }, // fold the two halves: 16 words -> 8
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // presumably vphaddw: 8 -> 4
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // 4 -> 2
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // 2 -> 1; word 0 should hold the full sum
+                        } },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -124264,6 +124320,36 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .srl, .tmp0x, .dst0x, .ui(16), ._ },
                             .{ ._, .vp_w, .add, .dst0x, .dst0x, .tmp0x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx2, .fast_hops, null, null }, // horizontal-add variant; the plain `.avx2` entry follows
+                        .dst_constraints = .{ .{ .int = .word }, .any },
+                        .src_constraints = .{ .{ .scalar_int = .{ .of = .yword, .is = .word } }, .any, .any }, // non-exact yword, so the source is masked before summing
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp0: address of the mask
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: mask derived from src0; NOTE(review): presumably clears the unused tail lanes — confirm
+                            .{ .type = .vector_8_u16, .kind = .{ .rc = .sse } }, // tmp2: receives the upper 128-bit half
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of tmp1
+                            .{ ._, .vp_, .@"and", .dst0y, .src0y, .lea(.tmp0y), ._ }, // dst0 = src0 & mask (full 256 bits)
+                            .{ ._, .v_i128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, // presumably vextracti128: tmp2 = 128-bit lane 1 of dst0
+                            .{ ._, .vp_w, .add, .dst0x, .dst0x, .tmp2x, ._ }, // fold the two halves: 16 words -> 8
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // presumably vphaddw: 8 -> 4
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // 4 -> 2
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // 2 -> 1; word 0 should hold the full sum
+                        } },
                     }, .{
                         .required_features = .{ .avx2, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -124297,6 +124383,36 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .srl, .tmp2x, .dst0x, .ui(16), ._ },
                             .{ ._, .vp_w, .add, .dst0x, .dst0x, .tmp2x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null }, // horizontal-add variant; the plain `.avx` entry follows
+                        .dst_constraints = .{ .{ .int = .word }, .any },
+                        .src_constraints = .{ .{ .scalar_int = .{ .of = .yword, .is = .word } }, .any, .any }, // non-exact yword, so the source is masked before summing
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp0: address of the mask
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: mask derived from src0; NOTE(review): presumably clears the unused tail lanes — confirm
+                            .{ .type = .vector_8_u16, .kind = .{ .rc = .sse } }, // tmp2: receives the upper 128-bit half
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of tmp1
+                            .{ ._, .v_ps, .@"and", .dst0y, .src0y, .lea(.tmp0y), ._ }, // dst0 = src0 & mask; uses the `v_ps` form where the avx2 entry uses `vp_`
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, // presumably vextractf128: tmp2 = 128-bit lane 1 of dst0
+                            .{ ._, .vp_w, .add, .dst0x, .dst0x, .tmp2x, ._ }, // fold the two halves: 16 words -> 8
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // presumably vphaddw: 8 -> 4
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // 4 -> 2
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // 2 -> 1; word 0 should hold the full sum
+                        } },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -124350,7 +124466,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                             .unused,
                         },
-                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
                         .each = .{ .once = &.{
                             .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
                             .{ ._, .vp_w, .add, .dst0y, .dst0y, .memd(.src0y, 32), ._ },
@@ -124383,7 +124499,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                             .unused,
                         },
-                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
@@ -124400,6 +124516,40 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .srl, .tmp2x, .dst0x, .ui(16), ._ },
                             .{ ._, .vp_w, .add, .dst0x, .dst0x, .tmp2x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx2, .fast_hops, null, null }, // horizontal-add variant; the plain `.avx2` entry follows
+                        .dst_constraints = .{ .{ .int = .word }, .any },
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .yword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // source is read from memory chunk by chunk
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: loop byte offset
+                            .{ .type = .vector_8_u16, .kind = .{ .rc = .sse } }, // tmp1: receives the upper 128-bit half of the accumulator
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true }, // the loop's `sub` writes the flags
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ }, // NOTE(review): presumably tmp0 = size - 64, the offset of the second-to-last chunk — confirm `sia`
+                            .{ ._, .v_dqa, .mov, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ }, // seed the accumulator from the chunk at displacement -32 relative to the size
+                            .{ .@"0:", .vp_w, .add, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ }, // loop: accumulate the chunk indexed by tmp0
+                            .{ ._, ._, .sub, .tmp0d, .si(32), ._, ._ }, // step back one 32-byte chunk
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // presumably jnb: loop until the subtraction borrows
+                            .{ ._, .v_i128, .extract, .tmp1x, .dst0y, .ui(1), ._ }, // presumably vextracti128: tmp1 = 128-bit lane 1 of the accumulator
+                            .{ ._, .vp_w, .add, .dst0x, .dst0x, .tmp1x, ._ }, // fold the two halves: 16 words -> 8
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // presumably vphaddw: 8 -> 4
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // 4 -> 2
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // 2 -> 1; word 0 should hold the full sum
+                        } },
                     }, .{
                         .required_features = .{ .avx2, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -124421,6 +124571,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
@@ -124436,6 +124587,38 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .srl, .tmp1x, .dst0x, .ui(16), ._ },
                             .{ ._, .vp_w, .add, .dst0x, .dst0x, .tmp1x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null }, // horizontal-add variant; the plain `.avx` entry follows
+                        .dst_constraints = .{ .{ .int = .word }, .any },
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .xword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // source is read from memory chunk by chunk
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: loop byte offset
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true }, // the loop's `sub` writes the flags
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ }, // NOTE(review): presumably tmp0 = size - 32, the offset of the second-to-last chunk — confirm `sia`
+                            .{ ._, .v_dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ }, // seed the accumulator from the chunk at displacement -16 relative to the size
+                            .{ .@"0:", .vp_w, .add, .dst0x, .dst0x, .memi(.src0x, .tmp0), ._ }, // loop: accumulate the chunk indexed by tmp0
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, // step back one 16-byte chunk
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // presumably jnb: loop until the subtraction borrows
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // presumably vphaddw: 8 -> 4; both inputs are the accumulator, matching the ssse3 entry
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // 4 -> 2
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // 2 -> 1; word 0 should hold the full sum
+                        } },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -124457,6 +124640,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -124470,6 +124654,38 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .srl, .tmp1x, .dst0x, .ui(16), ._ },
                             .{ ._, .vp_w, .add, .dst0x, .dst0x, .tmp1x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .ssse3, .fast_hops, null, null }, // two-operand horizontal-add variant; the plain `.sse2` entry follows
+                        .dst_constraints = .{ .{ .int = .word }, .any },
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .xword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // source is read from memory chunk by chunk
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: loop byte offset
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true }, // the loop's `sub` writes the flags
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ }, // NOTE(review): presumably tmp0 = size - 32, the offset of the second-to-last chunk — confirm `sia`
+                            .{ ._, ._dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ }, // seed the accumulator from the chunk at displacement -16 relative to the size
+                            .{ .@"0:", .p_w, .add, .dst0x, .memi(.src0x, .tmp0), ._, ._ }, // loop: accumulate the chunk indexed by tmp0
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, // step back one 16-byte chunk
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // presumably jnb: loop until the subtraction borrows
+                            .{ ._, .ph_w, .add, .dst0x, .dst0x, ._, ._ }, // presumably phaddw: 8 -> 4
+                            .{ ._, .ph_w, .add, .dst0x, .dst0x, ._, ._ }, // 4 -> 2
+                            .{ ._, .ph_w, .add, .dst0x, .dst0x, ._, ._ }, // 2 -> 1; word 0 should hold the full sum
+                        } },
                     }, .{
                         .required_features = .{ .sse2, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -124491,6 +124707,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, ._dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -124526,6 +124743,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -124547,6 +124765,42 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .srl, .tmp2x, .dst0x, .ui(16), ._ },
                             .{ ._, .vp_w, .add, .dst0x, .dst0x, .tmp2x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx2, .fast_hops, null, null }, // horizontal-add variant; the plain `.avx2` entry follows
+                        .dst_constraints = .{ .{ .int = .word }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .yword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // source is read from memory chunk by chunk
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: first the mask address, then the loop byte offset
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: mask derived from src0; NOTE(review): presumably clears the unused tail lanes — confirm
+                            .{ .type = .vector_8_u16, .kind = .{ .rc = .sse } }, // tmp2: receives the upper 128-bit half of the accumulator
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true }, // the loop's `sub` writes the flags
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of tmp1
+                            .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ }, // load the mask into the accumulator register
+                            .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_size), ._, ._ }, // tmp0 is reused; NOTE(review): presumably size - 64 — confirm `sia`
+                            .{ ._, .vp_, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -32), ._ }, // accumulator = mask & chunk at displacement -32 relative to the size
+                            .{ .@"0:", .vp_w, .add, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ }, // loop: accumulate the chunk indexed by tmp0
+                            .{ ._, ._, .sub, .tmp0d, .si(32), ._, ._ }, // step back one 32-byte chunk
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // presumably jnb: loop until the subtraction borrows
+                            .{ ._, .v_i128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, // presumably vextracti128: tmp2 = 128-bit lane 1 of the accumulator
+                            .{ ._, .vp_w, .add, .dst0x, .dst0x, .tmp2x, ._ }, // fold the two halves: 16 words -> 8
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // presumably vphaddw: 8 -> 4
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // 4 -> 2
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ }, // 2 -> 1; word 0 should hold the full sum
+                        } },
                     }, .{
                         .required_features = .{ .avx2, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -124568,6 +124822,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -124585,6 +124840,42 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .srl, .tmp2x, .dst0x, .ui(16), ._ },
                             .{ ._, .vp_w, .add, .dst0x, .dst0x, .tmp2x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .int = .word }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .yword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_8_u16, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-48, .src0, .add_size), ._, ._ },
+                            .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -32), ._ },
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ },
+                            .{ ._, .vp_w, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                            .{ .@"0:", .vp_w, .add, .dst0x, .dst0x, .memi(.src0x, .tmp0), ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ },
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ },
+                            .{ ._, .vph_w, .add, .dst0x, .dst0x, .dst0x, ._ },
+                        } },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -124606,6 +124897,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -124623,6 +124915,40 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .srl, .tmp2x, .dst0x, .ui(16), ._ },
                             .{ ._, .vp_w, .add, .dst0x, .dst0x, .tmp2x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .ssse3, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .int = .word }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .xword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_16_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, ._dqa, .mov, .dst0x, .lea(.tmp0x), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_size), ._, ._ },
+                            .{ ._, .p_, .@"and", .dst0x, .memad(.src0x, .add_size, -16), ._, ._ },
+                            .{ .@"0:", .p_w, .add, .dst0x, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .ph_w, .add, .dst0x, .dst0x, ._, ._ },
+                            .{ ._, .ph_w, .add, .dst0x, .dst0x, ._, ._ },
+                            .{ ._, .ph_w, .add, .dst0x, .dst0x, ._, ._ },
+                        } },
                     }, .{
                         .required_features = .{ .sse2, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -124644,6 +124970,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, ._dqa, .mov, .dst0x, .lea(.tmp0x), ._, ._ },
@@ -124662,7 +124989,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         } },
                     }, .{
                         .dst_constraints = .{ .{ .int = .word }, .any },
-                        .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .word, .is = .word } }, .any, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .word, .is = .word } }, .any, .any },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .none, .none } },
                         },
@@ -124689,25 +125016,16 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
                         } },
                     }, .{
-                        .required_features = .{ .sse, null, null, null },
+                        .required_features = .{ .avx, .fast_hops, null, null },
                         .dst_constraints = .{ .{ .int = .dword }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .mut_mem, .none, .none } },
-                            .{ .src = .{ .mut_gpr, .none, .none } },
-                            .{ .src = .{ .to_mut_sse, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
-                    }, .{
-                        .dst_constraints = .{ .{ .int = .dword }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
+                        .src_constraints = .{ .{ .scalar_int = .{ .of = .qword, .is = .dword } }, .any, .any },
                         .patterns = &.{
-                            .{ .src = .{ .mut_mem, .none, .none } },
-                            .{ .src = .{ .to_mut_gpr, .none, .none } },
+                            .{ .src = .{ .to_sse, .none, .none } },
                         },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .vph_d, .add, .dst0x, .src0x, .src0x, ._ },
+                        } },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .dword }, .any },
@@ -124733,6 +125051,17 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .shuf, .tmp0x, .src0x, .ui(0b01_01_01_01), ._ },
                             .{ ._, .vp_d, .add, .dst0x, .src0x, .tmp0x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .ssse3, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .int = .dword }, .any },
+                        .src_constraints = .{ .{ .scalar_int = .{ .of = .qword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mut_sse, .none, .none } },
+                        },
+                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .ph_d, .add, .dst0x, .src0x, ._, ._ },
+                        } },
                     }, .{
                         .required_features = .{ .sse2, null, null, null },
                         .dst_constraints = .{ .{ .int = .dword }, .any },
@@ -124758,6 +125087,18 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .p_d, .shuf, .tmp0x, .src0x, .ui(0b01_01_01_01), ._ },
                             .{ ._, .p_d, .add, .dst0x, .tmp0x, ._, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .int = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_int = .{ .of = .xword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .vph_d, .add, .dst0x, .src0x, .src0x, ._ },
+                            .{ ._, .vph_d, .add, .dst0x, .dst0x, .dst0x, ._ },
+                        } },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .dword }, .any },
@@ -124785,6 +125126,18 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .shuf, .tmp0x, .dst0x, .ui(0b01_01_01_01), ._ },
                             .{ ._, .vp_d, .add, .dst0x, .dst0x, .tmp0x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .ssse3, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .int = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_int = .{ .of = .xword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mut_sse, .none, .none } },
+                        },
+                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .ph_d, .add, .dst0x, .src0x, ._, ._ },
+                            .{ ._, .ph_d, .add, .dst0x, .dst0x, ._, ._ },
+                        } },
                     }, .{
                         .required_features = .{ .sse2, null, null, null },
                         .dst_constraints = .{ .{ .int = .dword }, .any },
@@ -124812,6 +125165,32 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .p_d, .shuf, .tmp0x, .dst0x, .ui(0b01_01_01_01), ._ },
                             .{ ._, .p_d, .add, .dst0x, .tmp0x, ._, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .int = .dword }, .any },
+                        .src_constraints = .{ .{ .scalar_int = .{ .of = .xword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_4_u32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .vph_d, .add, .tmp0x, .src0x, .src0x, ._ },
+                            .{ ._, .vp_d, .shuf, .dst0x, .src0x, .ui(0b11_10_11_10), ._ },
+                            .{ ._, .vp_d, .add, .dst0x, .tmp0x, .dst0x, ._ },
+                        } },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .dword }, .any },
@@ -124839,6 +125218,32 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .shuf, .dst0x, .src0x, .ui(0b01_01_01_01), ._ },
                             .{ ._, .vp_d, .add, .dst0x, .tmp0x, .dst0x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .ssse3, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .int = .dword }, .any },
+                        .src_constraints = .{ .{ .scalar_int = .{ .of = .xword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mut_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_4_u32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .p_d, .shuf, .tmp0x, .src0x, .ui(0b11_10_11_10), ._ },
+                            .{ ._, .ph_d, .add, .dst0x, .src0x, ._, ._ },
+                            .{ ._, .p_d, .add, .dst0x, .tmp0x, ._, ._ },
+                        } },
                     }, .{
                         .required_features = .{ .sse2, null, null, null },
                         .dst_constraints = .{ .{ .int = .dword }, .any },
@@ -124866,6 +125271,20 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .p_d, .shuf, .dst0x, .src0x, .ui(0b01_01_01_01), ._ },
                             .{ ._, .p_d, .add, .dst0x, .tmp0x, ._, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx2, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .int = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_int = .{ .of = .yword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_i128, .extract, .dst0x, .src0y, .ui(1), ._ },
+                            .{ ._, .vp_d, .add, .dst0x, .src0x, .dst0x, ._ },
+                            .{ ._, .vph_d, .add, .dst0x, .dst0x, .dst0x, ._ },
+                            .{ ._, .vph_d, .add, .dst0x, .dst0x, .dst0x, ._ },
+                        } },
                     }, .{
                         .required_features = .{ .avx2, null, null, null },
                         .dst_constraints = .{ .{ .int = .dword }, .any },
@@ -124895,6 +125314,20 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .shuf, .tmp0x, .dst0x, .ui(0b01_01_01_01), ._ },
                             .{ ._, .vp_d, .add, .dst0x, .dst0x, .tmp0x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .int = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_int = .{ .of = .yword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_f128, .extract, .dst0x, .src0y, .ui(1), ._ },
+                            .{ ._, .vp_d, .add, .dst0x, .src0x, .dst0x, ._ },
+                            .{ ._, .vph_d, .add, .dst0x, .dst0x, .dst0x, ._ },
+                            .{ ._, .vph_d, .add, .dst0x, .dst0x, .dst0x, ._ },
+                        } },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .dword }, .any },
@@ -124924,6 +125357,35 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .shuf, .tmp0x, .dst0x, .ui(0b01_01_01_01), ._ },
                             .{ ._, .vp_d, .add, .dst0x, .dst0x, .tmp0x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx2, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .int = .dword }, .any },
+                        .src_constraints = .{ .{ .scalar_int = .{ .of = .yword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_8_u32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .vp_, .@"and", .dst0y, .src0y, .lea(.tmp0y), ._ },
+                            .{ ._, .v_i128, .extract, .tmp2x, .dst0y, .ui(1), ._ },
+                            .{ ._, .vp_d, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                            .{ ._, .vph_d, .add, .dst0x, .dst0x, .dst0x, ._ },
+                            .{ ._, .vph_d, .add, .dst0x, .dst0x, .dst0x, ._ },
+                        } },
                     }, .{
                         .required_features = .{ .avx2, null, null, null },
                         .dst_constraints = .{ .{ .int = .dword }, .any },
@@ -124955,6 +125417,35 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .shuf, .tmp2x, .dst0x, .ui(0b01_01_01_01), ._ },
                             .{ ._, .vp_d, .add, .dst0x, .dst0x, .tmp2x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .int = .dword }, .any },
+                        .src_constraints = .{ .{ .scalar_int = .{ .of = .yword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_4_u32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_ps, .@"and", .dst0y, .src0y, .lea(.tmp0y), ._ },
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ },
+                            .{ ._, .vp_d, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                            .{ ._, .vph_d, .add, .dst0x, .dst0x, .dst0x, ._ },
+                            .{ ._, .vph_d, .add, .dst0x, .dst0x, .dst0x, ._ },
+                        } },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .dword }, .any },
@@ -125006,7 +125497,8 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                             .unused,
                         },
-                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
                             .{ ._, .vp_d, .add, .dst0y, .dst0y, .memd(.src0y, 32), ._ },
@@ -125037,7 +125529,8 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                             .unused,
                         },
-                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
@@ -125052,6 +125545,39 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .shuf, .tmp2x, .dst0x, .ui(0b01_01_01_01), ._ },
                             .{ ._, .vp_d, .add, .dst0x, .dst0x, .tmp2x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx2, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .int = .dword }, .any },
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .yword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_4_u32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, .v_dqa, .mov, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
+                            .{ .@"0:", .vp_d, .add, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(32), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .v_i128, .extract, .tmp1x, .dst0y, .ui(1), ._ },
+                            .{ ._, .vp_d, .add, .dst0x, .dst0x, .tmp1x, ._ },
+                            .{ ._, .vph_d, .add, .dst0x, .dst0x, .dst0x, ._ },
+                            .{ ._, .vph_d, .add, .dst0x, .dst0x, .dst0x, ._ },
+                        } },
                     }, .{
                         .required_features = .{ .avx2, null, null, null },
                         .dst_constraints = .{ .{ .int = .dword }, .any },
@@ -125073,6 +125599,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
@@ -125086,6 +125613,37 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .shuf, .tmp1x, .dst0x, .ui(0b01_01_01_01), ._ },
                             .{ ._, .vp_d, .add, .dst0x, .dst0x, .tmp1x, ._ },
                         } },
+                    }, .{
+                        // Add-reduction of a dword vector whose source matches the
+                        // `unaligned_multiple_scalar_int` xword/dword constraint, selected when
+                        // both `.avx` and `.fast_hops` are available. The source is accessed
+                        // through memory (`.to_mem`) and accumulated 16 bytes at a time.
+                        .required_features = .{ .avx, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .int = .dword }, .any },
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .xword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            // tmp0: byte offset used as the loop index into src0.
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            // No vector scratch register is needed: the horizontal adds
+                            // below only ever combine dst0 with itself, matching the
+                            // `.ssse3` + `.fast_hops` entry for the same constraint.
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        // The `sub` / `jnb` pair that drives the loop modifies and reads flags.
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            // Start the index at (unaligned size - 32) and seed the accumulator
+                            // with the 16 bytes at (unaligned size - 16).
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, .v_dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
+                            // Loop: add the 16 bytes at src0[tmp0], step the index down by 16,
+                            // and repeat until the subtraction borrows.
+                            .{ .@"0:", .vp_d, .add, .dst0x, .dst0x, .memi(.src0x, .tmp0), ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            // Two horizontal adds of dst0 with itself fold the four dword lanes
+                            // into lane 0. Previously the first one used an extra temp that was
+                            // never written as its second source; lane 0 was unaffected, but
+                            // the instruction read an undefined register.
+                            .{ ._, .vph_d, .add, .dst0x, .dst0x, .dst0x, ._ },
+                            .{ ._, .vph_d, .add, .dst0x, .dst0x, .dst0x, ._ },
+                        } },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .dword }, .any },
@@ -125107,6 +125665,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -125118,6 +125677,37 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .shuf, .tmp1x, .dst0x, .ui(0b01_01_01_01), ._ },
                             .{ ._, .vp_d, .add, .dst0x, .dst0x, .tmp1x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .ssse3, .fast_hops, null, null }, // horizontal dword sum over src0 in xword steps; requires the ssse3 and fast_hops features
+                        .dst_constraints = .{ .{ .int = .dword }, .any }, // result is a dword integer
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .xword, .is = .dword } }, .any, .any }, // dword elements, xword granularity — NOTE(review): confirm exact meaning of `unaligned_multiple_scalar_int`
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // single source operand; presumably forced into memory so it can be indexed below
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: general-purpose register holding the running byte offset
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused }, // dst0 is an sse-class register used as the accumulator
+                        .clobbers = .{ .eflags = true }, // the `sub` in the loop writes eflags
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ }, // tmp0 = starting offset (presumably size - 32; confirm `.sia` semantics)
+                            .{ ._, ._dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ }, // seed the accumulator from the xword 16 bytes past that offset
+                            .{ .@"0:", .p_d, .add, .dst0x, .memi(.src0x, .tmp0), ._, ._ }, // loop body: accumulate the xword addressed by src0 indexed with tmp0
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, // step back one xword; borrows once the offset goes below zero
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // repeat while the subtraction did not borrow
+                            .{ ._, .ph_d, .add, .dst0x, .dst0x, ._, ._ }, // first horizontal add: pairwise sums of the four dwords
+                            .{ ._, .ph_d, .add, .dst0x, .dst0x, ._, ._ }, // second horizontal add: total ends up in every dword lane
+                        } },
                     }, .{
                         .required_features = .{ .sse2, null, null, null },
                         .dst_constraints = .{ .{ .int = .dword }, .any },
@@ -125139,6 +125729,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, ._dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -125171,6 +125762,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -125190,6 +125782,41 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .shuf, .tmp2x, .dst0x, .ui(0b01_01_01_01), ._ },
                             .{ ._, .vp_d, .add, .dst0x, .dst0x, .tmp2x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx2, .fast_hops, null, null }, // horizontal dword sum over src0 in yword steps; requires the avx2 and fast_hops features
+                        .dst_constraints = .{ .{ .int = .dword }, .any }, // result is a dword integer
+                        .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .yword, .is = .dword } }, .any, .any }, // dword elements, yword granularity — NOTE(review): confirm exact meaning of `multiple_scalar_int`
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // single source operand; presumably forced into memory so it can be indexed below
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: first holds the mask address, then the running byte offset
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: 32-byte AND mask in memory derived from src0 (presumably clears padding past the vector length — confirm)
+                            .{ .type = .vector_4_u32, .kind = .{ .rc = .sse } }, // tmp2: scratch register for the upper 128-bit half
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused }, // dst0 is an sse-class register used as the accumulator
+                        .clobbers = .{ .eflags = true }, // the `sub` in the loop writes eflags
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of the mask
+                            .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ }, // load the mask into the accumulator
+                            .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_size), ._, ._ }, // tmp0 = starting offset (presumably size - 64; confirm `.sia` semantics)
+                            .{ ._, .vp_, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -32), ._ }, // accumulator = mask AND the yword 32 bytes past that offset
+                            .{ .@"0:", .vp_d, .add, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ }, // loop body: accumulate the yword addressed by src0 indexed with tmp0
+                            .{ ._, ._, .sub, .tmp0d, .si(32), ._, ._ }, // step back one yword; borrows once the offset goes below zero
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // repeat while the subtraction did not borrow
+                            .{ ._, .v_i128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, // pull the upper 128 bits of the accumulator into tmp2
+                            .{ ._, .vp_d, .add, .dst0x, .dst0x, .tmp2x, ._ }, // fold the upper half onto the lower half
+                            .{ ._, .vph_d, .add, .dst0x, .dst0x, .dst0x, ._ }, // first horizontal add: pairwise sums of the four dwords
+                            .{ ._, .vph_d, .add, .dst0x, .dst0x, .dst0x, ._ }, // second horizontal add: total ends up in every dword lane
+                        } },
                     }, .{
                         .required_features = .{ .avx2, null, null, null },
                         .dst_constraints = .{ .{ .int = .dword }, .any },
@@ -125211,6 +125838,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -125226,6 +125854,41 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .shuf, .tmp2x, .dst0x, .ui(0b01_01_01_01), ._ },
                             .{ ._, .vp_d, .add, .dst0x, .dst0x, .tmp2x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null }, // horizontal dword sum for yword-granular src0 without avx2; requires the avx and fast_hops features
+                        .dst_constraints = .{ .{ .int = .dword }, .any }, // result is a dword integer
+                        .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .yword, .is = .dword } }, .any, .any }, // dword elements, yword granularity — NOTE(review): confirm exact meaning of `multiple_scalar_int`
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // single source operand; presumably forced into memory so it can be indexed below
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: first holds the mask address, then the running byte offset
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: 32-byte AND mask in memory derived from src0 (presumably clears padding past the vector length — confirm)
+                            .{ .type = .vector_4_u32, .kind = .{ .rc = .sse } }, // tmp2: scratch register for the upper 128-bit half
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused }, // dst0 is an sse-class register used as the accumulator
+                        .clobbers = .{ .eflags = true }, // the `sub` in the loop writes eflags
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of the mask
+                            .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ }, // load the mask with the ps-form move (256-bit integer forms need avx2)
+                            .{ ._, ._, .mov, .tmp0d, .sia(-48, .src0, .add_size), ._, ._ }, // tmp0 = starting offset (presumably size - 48, i.e. the xword just below the masked yword; confirm `.sia` semantics)
+                            .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -32), ._ }, // accumulator = mask AND the yword at the -32 displacement
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, // pull the upper 128 bits of the masked yword into tmp2
+                            .{ ._, .vp_d, .add, .dst0x, .dst0x, .tmp2x, ._ }, // fold the upper half onto the lower half; the rest runs at xword width
+                            .{ .@"0:", .vp_d, .add, .dst0x, .dst0x, .memi(.src0x, .tmp0), ._ }, // loop body: accumulate the xword addressed by src0 indexed with tmp0
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, // step back one xword; borrows once the offset goes below zero
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // repeat while the subtraction did not borrow
+                            .{ ._, .vph_d, .add, .dst0x, .dst0x, .dst0x, ._ }, // first horizontal add: pairwise sums of the four dwords
+                            .{ ._, .vph_d, .add, .dst0x, .dst0x, .dst0x, ._ }, // second horizontal add: total ends up in every dword lane
+                        } },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .dword }, .any },
@@ -125262,6 +125925,39 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .vp_d, .shuf, .tmp2x, .dst0x, .ui(0b01_01_01_01), ._ },
                             .{ ._, .vp_d, .add, .dst0x, .dst0x, .tmp2x, ._ },
                         } },
+                    }, .{
+                        .required_features = .{ .ssse3, .fast_hops, null, null }, // horizontal dword sum over src0 in xword steps with tail masking; requires the ssse3 and fast_hops features
+                        .dst_constraints = .{ .{ .int = .dword }, .any }, // result is a dword integer
+                        .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .xword, .is = .dword } }, .any, .any }, // dword elements, xword granularity — NOTE(review): confirm exact meaning of `multiple_scalar_int`
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // single source operand; presumably forced into memory so it can be indexed below
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: first holds the mask address, then the running byte offset
+                            .{ .type = .vector_16_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: 16-byte AND mask in memory derived from src0 (presumably clears padding past the vector length — confirm)
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused }, // dst0 is an sse-class register used as the accumulator
+                        .clobbers = .{ .eflags = true }, // the `sub` in the loop writes eflags
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of the mask
+                            .{ ._, ._dqa, .mov, .dst0x, .lea(.tmp0x), ._, ._ }, // load the mask into the accumulator
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_size), ._, ._ }, // tmp0 = starting offset (presumably size - 32; confirm `.sia` semantics)
+                            .{ ._, .p_, .@"and", .dst0x, .memad(.src0x, .add_size, -16), ._, ._ }, // accumulator = mask AND the xword 16 bytes past that offset
+                            .{ .@"0:", .p_d, .add, .dst0x, .memi(.src0x, .tmp0), ._, ._ }, // loop body: accumulate the xword addressed by src0 indexed with tmp0
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, // step back one xword; borrows once the offset goes below zero
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // repeat while the subtraction did not borrow
+                            .{ ._, .ph_d, .add, .dst0x, .dst0x, ._, ._ }, // first horizontal add: pairwise sums of the four dwords
+                            .{ ._, .ph_d, .add, .dst0x, .dst0x, ._, ._ }, // second horizontal add: total ends up in every dword lane
+                        } },
                     }, .{
                         .required_features = .{ .sse2, null, null, null },
                         .dst_constraints = .{ .{ .int = .dword }, .any },
@@ -125283,6 +125979,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, ._dqa, .mov, .dst0x, .lea(.tmp0x), ._, ._ },
@@ -125298,7 +125995,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         } },
                     }, .{
                         .dst_constraints = .{ .{ .int = .dword }, .any },
-                        .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .dword, .is = .dword } }, .any, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .dword, .is = .dword } }, .any, .any },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .none, .none } },
                         },
@@ -125324,27 +126021,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, ._, .sub, .tmp0d, .si(4), ._, ._ },
                             .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
                         } },
-                    }, .{
-                        .required_features = .{ .sse, null, null, null },
-                        .dst_constraints = .{ .{ .int = .qword }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .mut_mem, .none, .none } },
-                            .{ .src = .{ .mut_gpr, .none, .none } },
-                            .{ .src = .{ .to_mut_sse, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
-                    }, .{
-                        .required_features = .{ .@"64bit", null, null, null },
-                        .dst_constraints = .{ .{ .int = .qword }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .mut_mem, .none, .none } },
-                            .{ .src = .{ .to_mut_gpr, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .qword }, .any },
@@ -125523,7 +126199,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                             .unused,
                         },
-                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
                         .each = .{ .once = &.{
                             .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
                             .{ ._, .vp_q, .add, .dst0y, .dst0y, .memd(.src0y, 32), ._ },
@@ -125552,7 +126228,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                             .unused,
                         },
-                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
@@ -125586,6 +126262,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
@@ -125618,6 +126295,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -125648,6 +126326,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, ._dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -125678,6 +126357,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -125716,6 +126396,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -125750,6 +126431,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -125784,6 +126466,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, ._dqa, .mov, .dst0x, .lea(.tmp0x), ._, ._ },
@@ -125798,7 +126481,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     }, .{
                         .required_features = .{ .@"64bit", null, null, null },
                         .dst_constraints = .{ .{ .int = .qword }, .any },
-                        .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .qword, .is = .qword } }, .any, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .qword, .is = .qword } }, .any, .any },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .none, .none } },
                         },
@@ -125824,52 +126507,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ },
                             .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
                         } },
-                    }, .{
-                        .required_features = .{ .sse, null, null, null },
-                        .dst_constraints = .{ .{ .int = .xword }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .mut_mem, .none, .none } },
-                            .{ .src = .{ .mut_gpr, .none, .none } },
-                            .{ .src = .{ .to_mut_sse, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
-                    }, .{
-                        .dst_constraints = .{ .{ .int = .xword }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .to_mut_mem, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
-                    }, .{
-                        .required_features = .{ .avx, null, null, null },
-                        .dst_constraints = .{ .{ .int = .yword }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .mut_mem, .none, .none } },
-                            .{ .src = .{ .mut_gpr, .none, .none } },
-                            .{ .src = .{ .to_mut_sse, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
-                    }, .{
-                        .dst_constraints = .{ .{ .int = .yword }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .to_mut_mem, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
-                    }, .{
-                        .dst_constraints = .{ .any_int, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .to_mut_mem, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
                     }, .{
                         .required_features = .{ .@"64bit", null, null, null },
                         .dst_constraints = .{ .{ .remainder_int = .{ .of = .qword, .is = .qword } }, .any },
@@ -125910,26 +126547,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         } },
                     } },
                     .Mul => comptime &.{ .{
-                        .required_features = .{ .sse, null, null, null },
-                        .dst_constraints = .{ .{ .int = .byte }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .mut_mem, .none, .none } },
-                            .{ .src = .{ .mut_gpr, .none, .none } },
-                            .{ .src = .{ .to_mut_sse, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
-                    }, .{
-                        .dst_constraints = .{ .{ .int = .byte }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .mut_mem, .none, .none } },
-                            .{ .src = .{ .to_mut_gpr, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
-                    }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .byte }, .any },
                         .src_constraints = .{ .{ .scalar_int = .{ .of = .word, .is = .byte } }, .any, .any },
@@ -126528,7 +127145,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                             .unused,
                         },
-                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
                         .each = .{ .once = &.{
                             .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
                             .{ ._, .v_dqa, .mov, .tmp0y, .memd(.src0y, 32), ._, ._ },
@@ -126566,7 +127183,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                             .unused,
                         },
-                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -126610,6 +127227,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-48, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
@@ -126649,6 +127267,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-24, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -126686,6 +127305,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-24, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, ._dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -126725,6 +127345,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-24, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, ._dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -126765,6 +127386,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -126815,6 +127437,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .tmp2y, .lea(.tmp0y), ._, ._ },
@@ -126857,6 +127480,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_ps, .mova, .tmp2y, .lea(.tmp0y), ._, ._ },
@@ -126903,6 +127527,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, ._dqa, .mov, .tmp2x, .lea(.tmp0x), ._, ._ },
@@ -126946,6 +127571,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, ._dqa, .mov, .tmp2x, .lea(.tmp0x), ._, ._ },
@@ -126972,7 +127598,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     }, .{
                         .required_features = .{ .slow_incdec, null, null, null },
                         .dst_constraints = .{ .{ .int = .byte }, .any },
-                        .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .byte, .is = .byte } }, .any, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .byte, .is = .byte } }, .any, .any },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .none, .none } },
                         },
@@ -127000,7 +127626,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         } },
                     }, .{
                         .dst_constraints = .{ .{ .int = .byte }, .any },
-                        .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .byte, .is = .byte } }, .any, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .byte, .is = .byte } }, .any, .any },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .none, .none } },
                         },
@@ -127026,26 +127652,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, ._c, .de, .tmp0d, ._, ._, ._ },
                             .{ ._, ._ns, .j, .@"0b", ._, ._, ._ },
                         } },
-                    }, .{
-                        .required_features = .{ .sse, null, null, null },
-                        .dst_constraints = .{ .{ .int = .word }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .mut_mem, .none, .none } },
-                            .{ .src = .{ .mut_gpr, .none, .none } },
-                            .{ .src = .{ .to_mut_sse, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
-                    }, .{
-                        .dst_constraints = .{ .{ .int = .word }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .mut_mem, .none, .none } },
-                            .{ .src = .{ .to_mut_gpr, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .word }, .any },
@@ -127489,7 +128095,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                             .unused,
                         },
-                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
                         .each = .{ .once = &.{
                             .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
                             .{ ._, .vp_w, .mull, .dst0y, .dst0y, .memd(.src0y, 32), ._ },
@@ -127522,7 +128128,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                             .unused,
                         },
-                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -127562,6 +128168,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
@@ -127598,6 +128205,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -127632,6 +128240,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, ._dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -127667,6 +128276,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -127711,6 +128321,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .tmp2y, .lea(.tmp0y), ._, ._ },
@@ -127750,6 +128361,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_ps, .mova, .tmp2y, .lea(.tmp0y), ._, ._ },
@@ -127791,6 +128403,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, ._dqa, .mov, .tmp2x, .lea(.tmp0x), ._, ._ },
@@ -127811,7 +128424,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         } },
                     }, .{
                         .dst_constraints = .{ .{ .int = .word }, .any },
-                        .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .word, .is = .word } }, .any, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .word, .is = .word } }, .any, .any },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .none, .none } },
                         },
@@ -127837,26 +128450,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, ._, .sub, .tmp0d, .si(2), ._, ._ },
                             .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
                         } },
-                    }, .{
-                        .required_features = .{ .sse, null, null, null },
-                        .dst_constraints = .{ .{ .int = .dword }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .mut_mem, .none, .none } },
-                            .{ .src = .{ .mut_gpr, .none, .none } },
-                            .{ .src = .{ .to_mut_sse, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
-                    }, .{
-                        .dst_constraints = .{ .{ .int = .dword }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .mut_mem, .none, .none } },
-                            .{ .src = .{ .to_mut_gpr, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
                         .dst_constraints = .{ .{ .int = .dword }, .any },
@@ -128163,7 +128756,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                             .unused,
                         },
-                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
                         .each = .{ .once = &.{
                             .{ ._, .v_dqa, .mov, .dst0y, .mem(.src0y), ._, ._ },
                             .{ ._, .vp_d, .mull, .dst0y, .dst0y, .memd(.src0y, 32), ._ },
@@ -128194,7 +128787,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                             .unused,
                         },
-                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -128232,6 +128825,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
@@ -128266,6 +128860,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -128298,6 +128893,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, ._dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -128330,6 +128926,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, ._dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
@@ -128365,6 +128962,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .dst0y, .lea(.tmp0y), ._, ._ },
@@ -128407,6 +129005,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_dqa, .mov, .tmp2y, .lea(.tmp0y), ._, ._ },
@@ -128444,6 +129043,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_ps, .mova, .tmp2y, .lea(.tmp0y), ._, ._ },
@@ -128483,6 +129083,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, ._dqa, .mov, .tmp2x, .lea(.tmp0x), ._, ._ },
@@ -128519,6 +129120,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, ._dqa, .mov, .tmp2x, .lea(.tmp0x), ._, ._ },
@@ -128539,7 +129141,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         } },
                     }, .{
                         .dst_constraints = .{ .{ .int = .dword }, .any },
-                        .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .dword, .is = .dword } }, .any, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .dword, .is = .dword } }, .any, .any },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .none, .none } },
                         },
@@ -128565,31 +129167,10 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, ._, .sub, .tmp0d, .si(4), ._, ._ },
                             .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
                         } },
-                    }, .{
-                        .required_features = .{ .sse, null, null, null },
-                        .dst_constraints = .{ .{ .int = .qword }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .mut_mem, .none, .none } },
-                            .{ .src = .{ .mut_gpr, .none, .none } },
-                            .{ .src = .{ .to_mut_sse, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
-                    }, .{
-                        .required_features = .{ .@"64bit", null, null, null },
-                        .dst_constraints = .{ .{ .int = .qword }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .mut_mem, .none, .none } },
-                            .{ .src = .{ .to_mut_gpr, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
                     }, .{
                         .required_features = .{ .@"64bit", null, null, null },
                         .dst_constraints = .{ .{ .int = .qword }, .any },
-                        .src_constraints = .{ .{ .unaligned_multiple_scalar_int = .{ .of = .qword, .is = .qword } }, .any, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_int = .{ .of = .qword, .is = .qword } }, .any, .any },
                         .patterns = &.{
                             .{ .src = .{ .to_mem, .none, .none } },
                         },
@@ -128615,52 +129196,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ },
                             .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
                         } },
-                    }, .{
-                        .required_features = .{ .sse, null, null, null },
-                        .dst_constraints = .{ .{ .int = .xword }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .mut_mem, .none, .none } },
-                            .{ .src = .{ .mut_gpr, .none, .none } },
-                            .{ .src = .{ .to_mut_sse, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
-                    }, .{
-                        .dst_constraints = .{ .{ .int = .xword }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .to_mut_mem, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
-                    }, .{
-                        .required_features = .{ .avx, null, null, null },
-                        .dst_constraints = .{ .{ .int = .yword }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .mut_mem, .none, .none } },
-                            .{ .src = .{ .mut_gpr, .none, .none } },
-                            .{ .src = .{ .to_mut_sse, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
-                    }, .{
-                        .dst_constraints = .{ .{ .int = .yword }, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .to_mut_mem, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
-                    }, .{
-                        .dst_constraints = .{ .any_int, .any },
-                        .src_constraints = .{ .{ .vec_len = 1 }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .to_mut_mem, .none, .none } },
-                        },
-                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
-                        .each = .{ .once = &.{} },
                     }, .{
                         .required_features = .{ .@"64bit", .bmi2, .adx, null },
                         .dst_constraints = .{ .{ .remainder_int = .{ .of = .qword, .is = .qword } }, .any },
@@ -128937,6 +129472,2289 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                 }
                 try res[0].finish(inst, &.{reduce.operand}, &ops, cg);
             },
+            .reduce_optimized => |air_tag| if (use_old) try cg.airReduce(inst) else fallback: {
+                const reduce = air_datas[@intFromEnum(inst)].reduce;
+                switch (reduce.operation) {
+                    .And, .Or, .Xor => unreachable,
+                    .Min, .Max => break :fallback try cg.airReduce(inst),
+                    .Add => {},
+                    .Mul => break :fallback try cg.airReduce(inst),
+                }
+                var ops = try cg.tempsFromOperands(inst, .{reduce.operand});
+                var res: [1]Temp = undefined;
+                cg.select(&res, &.{cg.typeOfIndex(inst)}, &ops, switch (reduce.operation) {
+                    .And, .Or, .Xor => unreachable,
+                    .Min, .Max => unreachable,
+                    .Add => comptime &.{ .{
+                        .required_features = .{ .f16c, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .dword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .mem, .none, .none } },
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_ps, .cvtph2, .dst0x, .src0q, ._, ._ },
+                            .{ ._, .vh_ps, .add, .dst0x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .dword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .mem, .none, .none } },
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_ps, .cvtph2, .dst0x, .src0q, ._, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .qword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .mem, .none, .none } },
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_ps, .cvtph2, .dst0x, .src0q, ._, ._ },
+                            .{ ._, .vh_ps, .add, .tmp0x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_ps, .shuf, .dst0x, .dst0x, .dst0x, .ui(0b11_10_11_10) },
+                            .{ ._, .v_ss, .add, .dst0x, .tmp0x, .dst0x, ._ },
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .qword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .mem, .none, .none } },
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_ps, .cvtph2, .dst0x, .src0q, ._, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b11_10_11_10) },
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .qword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .mem, .none, .none } },
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_ps, .cvtph2, .dst0x, .src0q, ._, ._ },
+                            .{ ._, .vh_ps, .add, .dst0x, .dst0x, .dst0x, ._ },
+                            .{ ._, .vh_ps, .add, .dst0x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .qword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .mem, .none, .none } },
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_ps, .cvtph2, .dst0x, .src0q, ._, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b11_10_11_10) },
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .xword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_16_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_ps, .@"and", .dst0x, .src0x, .lea(.tmp0x), ._ },
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ },
+                            .{ ._, .vh_ps, .add, .dst0y, .dst0y, .dst0y, ._ },
+                            .{ ._, .vh_ps, .add, .dst0y, .dst0y, .dst0y, ._ },
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, null, null, null }, // NOTE(review): presumably one lowering of the optimized float `@reduce(.Add)` named in the commit message, for f16 elements; needs only F16C, so it reduces with shuffles instead of horizontal adds
+                        .dst_constraints = .{ .{ .float = .word }, .any }, // result is one 16-bit float
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .xword, .is = .word } }, .any, .any }, // 16-bit float elements in a 128-bit vector; NOTE(review): "exclusive" presumably means some lanes are padding - confirm
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp0: receives the address of tmp1
+                            .{ .type = .vector_16_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: 16-byte AND mask in memory, derived from src0; presumably clears padding lanes - confirm
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, // tmp2: scratch for extracted/shuffled partial sums
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, // NOTE(review): presumably lets the result reuse src0's SSE register - confirm
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of the mask
+                            .{ ._, .v_ps, .@"and", .dst0x, .src0x, .lea(.tmp0x), ._ }, // dst0 = src0 & mask
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ }, // widen the 8 halves in dst0's low 128 bits to 8 f32 lanes
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, // tmp2 = upper 128 bits of dst0
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ }, // 8 partial sums -> 4
+                            .{ ._, .v_ps, .shuf, .tmp2x, .dst0x, .dst0x, .ui(0b11_10_11_10) }, // tmp2 = lanes {2,3,2,3} of dst0
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ }, // 4 partial sums -> 2 (lanes 0 and 1)
+                            .{ ._, .v_ps, .shuf, .tmp2x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, // tmp2 = lane 1 of dst0 in every lane
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp2x, ._ }, // lane 0 = lane 0 + lane 1: the full sum
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, // convert back to f16 (lane 0 is the result); rounding operand comes from .rm(.{})
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, .fast_hops, null, null }, // NOTE(review): presumably one lowering of the optimized float `@reduce(.Add)` named in the commit message, for f16 elements; gated on F16C plus fast horizontal ops
+                        .dst_constraints = .{ .{ .float = .word }, .any }, // result is one 16-bit float
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .xword, .is = .word } }, .any, .any }, // 16-bit float elements in a 128-bit vector; NOTE(review): "exact" presumably means every lane is a real element, which would explain why no mask is applied - confirm
+                        .patterns = &.{
+                            .{ .src = .{ .mem, .none, .none } }, // source may be used directly from memory
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, // tmp0: scratch for the upper half of the widened sums
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, // NOTE(review): presumably lets the result reuse src0's SSE register - confirm
+                        .each = .{ .once = &.{
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .src0x, ._, ._ }, // widen the 8 source halves to 8 f32 lanes
+                            .{ ._, .vh_ps, .add, .dst0y, .dst0y, .dst0y, ._ }, // horizontal add: pairwise sums within each 128-bit half
+                            .{ ._, .vh_ps, .add, .dst0y, .dst0y, .dst0y, ._ }, // horizontal add again: each 128-bit half now holds its own 4-lane total
+                            .{ ._, .v_f128, .extract, .tmp0x, .dst0y, .ui(1), ._ }, // tmp0 = upper 128 bits of dst0
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp0x, ._ }, // lane 0 = low-half total + high-half total
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, // convert back to f16 (lane 0 is the result); rounding operand comes from .rm(.{})
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, null, null, null }, // NOTE(review): presumably one lowering of the optimized float `@reduce(.Add)` named in the commit message, for f16 elements; needs only F16C, so it reduces with shuffles instead of horizontal adds
+                        .dst_constraints = .{ .{ .float = .word }, .any }, // result is one 16-bit float
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .xword, .is = .word } }, .any, .any }, // 16-bit float elements in a 128-bit vector; NOTE(review): "exact" presumably means every lane is a real element, which would explain why no mask is applied - confirm
+                        .patterns = &.{
+                            .{ .src = .{ .mem, .none, .none } }, // source may be used directly from memory
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, // tmp0: scratch for extracted/shuffled partial sums
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, // NOTE(review): presumably lets the result reuse src0's SSE register - confirm
+                        .each = .{ .once = &.{
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .src0x, ._, ._ }, // widen the 8 source halves to 8 f32 lanes
+                            .{ ._, .v_f128, .extract, .tmp0x, .dst0y, .ui(1), ._ }, // tmp0 = upper 128 bits of dst0
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp0x, ._ }, // 8 partial sums -> 4
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b11_10_11_10) }, // tmp0 = lanes {2,3,2,3} of dst0
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp0x, ._ }, // 4 partial sums -> 2 (lanes 0 and 1)
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, // tmp0 = lane 1 of dst0 in every lane
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp0x, ._ }, // lane 0 = lane 0 + lane 1: the full sum
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, // convert back to f16 (lane 0 is the result); rounding operand comes from .rm(.{})
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, .fast_hops, null, null }, // NOTE(review): presumably one lowering of the optimized float `@reduce(.Add)` named in the commit message, for f16 elements; gated on F16C plus fast horizontal ops
+                        .dst_constraints = .{ .{ .float = .word }, .any }, // result is one 16-bit float
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .yword, .is = .word } }, .any, .any }, // 16-bit float elements in a 256-bit vector; NOTE(review): "exclusive" presumably means some lanes are padding - confirm
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp0: receives the address of tmp1
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: 32-byte AND mask in memory, derived from src0; presumably clears padding lanes - confirm
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, // tmp2: holds the upper 8 elements, later scratch
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, // NOTE(review): presumably lets the result reuse src0's SSE register - confirm
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of the mask
+                            .{ ._, .v_ps, .@"and", .dst0y, .src0y, .lea(.tmp0y), ._ }, // dst0 = src0 & mask (all 256 bits)
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, // tmp2 = upper 8 halves of the masked source
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ }, // widen the lower 8 halves to f32
+                            .{ ._, .v_ps, .cvtph2, .tmp2y, .tmp2x, ._, ._ }, // widen the upper 8 halves to f32
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp2y, ._ }, // 16 values -> 8 partial sums
+                            .{ ._, .vh_ps, .add, .dst0y, .dst0y, .dst0y, ._ }, // horizontal add: pairwise sums within each 128-bit half
+                            .{ ._, .vh_ps, .add, .dst0y, .dst0y, .dst0y, ._ }, // horizontal add again: each 128-bit half now holds its own 4-lane total
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, // tmp2 = upper 128 bits of dst0
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp2x, ._ }, // lane 0 = low-half total + high-half total
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, // convert back to f16 (lane 0 is the result); rounding operand comes from .rm(.{})
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, null, null, null }, // NOTE(review): presumably one lowering of the optimized float `@reduce(.Add)` named in the commit message, for f16 elements; needs only F16C, so it reduces with shuffles instead of horizontal adds
+                        .dst_constraints = .{ .{ .float = .word }, .any }, // result is one 16-bit float
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .yword, .is = .word } }, .any, .any }, // 16-bit float elements in a 256-bit vector; NOTE(review): "exclusive" presumably means some lanes are padding - confirm
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp0: receives the address of tmp1
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: 32-byte AND mask in memory, derived from src0; presumably clears padding lanes - confirm
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, // tmp2: holds the upper 8 elements, later scratch
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, // NOTE(review): presumably lets the result reuse src0's SSE register - confirm
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of the mask
+                            .{ ._, .v_ps, .@"and", .dst0y, .src0y, .lea(.tmp0y), ._ }, // dst0 = src0 & mask (all 256 bits)
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, // tmp2 = upper 8 halves of the masked source
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ }, // widen the lower 8 halves to f32
+                            .{ ._, .v_ps, .cvtph2, .tmp2y, .tmp2x, ._, ._ }, // widen the upper 8 halves to f32
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp2y, ._ }, // 16 values -> 8 partial sums
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, // tmp2 = upper 128 bits of dst0
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ }, // 8 partial sums -> 4
+                            .{ ._, .v_ps, .shuf, .tmp2x, .dst0x, .dst0x, .ui(0b11_10_11_10) }, // tmp2 = lanes {2,3,2,3} of dst0
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ }, // 4 partial sums -> 2 (lanes 0 and 1)
+                            .{ ._, .v_ps, .shuf, .tmp2x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, // tmp2 = lane 1 of dst0 in every lane
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp2x, ._ }, // lane 0 = lane 0 + lane 1: the full sum
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, // convert back to f16 (lane 0 is the result); rounding operand comes from .rm(.{})
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, .fast_hops, null, null }, // NOTE(review): presumably one lowering of the optimized float `@reduce(.Add)` named in the commit message, for f16 elements; gated on F16C plus fast horizontal ops
+                        .dst_constraints = .{ .{ .float = .word }, .any }, // result is one 16-bit float
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .yword, .is = .word } }, .any, .any }, // 16-bit float elements in a 256-bit vector; NOTE(review): "exact" presumably means every lane is a real element, which would explain why no mask is applied - confirm
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, // tmp0: holds the upper 8 elements, later scratch
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, // NOTE(review): presumably lets the result reuse src0's SSE register - confirm
+                        .each = .{ .once = &.{
+                            .{ ._, .v_f128, .extract, .tmp0x, .src0y, .ui(1), ._ }, // tmp0 = upper 8 halves of the source (taken before dst0 is written)
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .src0x, ._, ._ }, // widen the lower 8 halves to f32
+                            .{ ._, .v_ps, .cvtph2, .tmp0y, .tmp0x, ._, ._ }, // widen the upper 8 halves to f32
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp0y, ._ }, // 16 values -> 8 partial sums
+                            .{ ._, .vh_ps, .add, .dst0y, .dst0y, .dst0y, ._ }, // horizontal add: pairwise sums within each 128-bit half
+                            .{ ._, .vh_ps, .add, .dst0y, .dst0y, .dst0y, ._ }, // horizontal add again: each 128-bit half now holds its own 4-lane total
+                            .{ ._, .v_f128, .extract, .tmp0x, .dst0y, .ui(1), ._ }, // tmp0 = upper 128 bits of dst0
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp0x, ._ }, // lane 0 = low-half total + high-half total
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, // convert back to f16 (lane 0 is the result); rounding operand comes from .rm(.{})
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, null, null, null }, // NOTE(review): presumably one lowering of the optimized float `@reduce(.Add)` named in the commit message, for f16 elements; needs only F16C, so it reduces with shuffles instead of horizontal adds
+                        .dst_constraints = .{ .{ .float = .word }, .any }, // result is one 16-bit float
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .yword, .is = .word } }, .any, .any }, // 16-bit float elements in a 256-bit vector; NOTE(review): "exact" presumably means every lane is a real element, which would explain why no mask is applied - confirm
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, // tmp0: holds the upper 8 elements, later scratch
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, // NOTE(review): presumably lets the result reuse src0's SSE register - confirm
+                        .each = .{ .once = &.{
+                            .{ ._, .v_f128, .extract, .tmp0x, .src0y, .ui(1), ._ }, // tmp0 = upper 8 halves of the source (taken before dst0 is written)
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .src0x, ._, ._ }, // widen the lower 8 halves to f32
+                            .{ ._, .v_ps, .cvtph2, .tmp0y, .tmp0x, ._, ._ }, // widen the upper 8 halves to f32
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp0y, ._ }, // 16 values -> 8 partial sums
+                            .{ ._, .v_f128, .extract, .tmp0x, .dst0y, .ui(1), ._ }, // tmp0 = upper 128 bits of dst0
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp0x, ._ }, // 8 partial sums -> 4
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b11_10_11_10) }, // tmp0 = lanes {2,3,2,3} of dst0
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp0x, ._ }, // 4 partial sums -> 2 (lanes 0 and 1)
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, // tmp0 = lane 1 of dst0 in every lane
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp0x, ._ }, // lane 0 = lane 0 + lane 1: the full sum
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, // convert back to f16 (lane 0 is the result); rounding operand comes from .rm(.{})
+                        } },
+                    }, .{
+                        .required_features = .{ .avx512f, .f16c, null, null }, // NOTE(review): presumably one lowering of the optimized float `@reduce(.Add)` named in the commit message, for f16 elements; every instruction below operates on at most 256 bits at a time
+                        .dst_constraints = .{ .{ .float = .word }, .any }, // result is one 16-bit float
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .zword, .is = .word } }, .any, .any }, // 16-bit float elements in a 512-bit vector; NOTE(review): "exclusive" presumably means some lanes are padding - confirm
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // source is read from memory in two 32-byte halves
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp0: receives the address of tmp1
+                            .{ .type = .vector_64_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: 64-byte AND mask in memory, derived from src0; presumably clears padding lanes - confirm
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, // tmp2: upper 32 source bytes / their partial sums, later scratch
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, // tmp3: scratch for extracted upper 128-bit halves
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused }, // result is produced in an SSE-class register
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of the mask
+                            .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ }, // dst0 = mask bytes 0..31
+                            .{ ._, .v_ps, .mova, .tmp2y, .lead(.tmp0y, 32), ._, ._ }, // tmp2 = mask bytes 32..63
+                            .{ ._, .v_ps, .@"and", .tmp2y, .tmp2y, .memd(.src0y, 32), ._ }, // tmp2 = masked source bytes 32..63
+                            .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .mem(.src0y), ._ }, // dst0 = masked source bytes 0..31
+                            .{ ._, .v_f128, .extract, .tmp3x, .tmp2y, .ui(1), ._ }, // tmp3 = upper 8 halves of tmp2
+                            .{ ._, .v_ps, .cvtph2, .tmp2y, .tmp2x, ._, ._ }, // widen tmp2's lower 8 halves to f32
+                            .{ ._, .v_ps, .cvtph2, .tmp3y, .tmp3x, ._, ._ }, // widen tmp2's former upper 8 halves to f32
+                            .{ ._, .v_ps, .add, .tmp2y, .tmp2y, .tmp3y, ._ }, // tmp2 = 8 partial sums of source bytes 32..63
+                            .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ }, // tmp3 = upper 8 halves of dst0
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ }, // widen dst0's lower 8 halves to f32
+                            .{ ._, .v_ps, .cvtph2, .tmp3y, .tmp3x, ._, ._ }, // widen dst0's former upper 8 halves to f32
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp3y, ._ }, // dst0 = 8 partial sums of source bytes 0..31
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp2y, ._ }, // combine both halves: 8 partial sums of all 32 elements
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, // tmp2 = upper 128 bits of dst0
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ }, // 8 partial sums -> 4
+                            .{ ._, .v_ps, .shuf, .tmp2x, .dst0x, .dst0x, .ui(0b11_10_11_10) }, // tmp2 = lanes {2,3,2,3} of dst0
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ }, // 4 partial sums -> 2 (lanes 0 and 1)
+                            .{ ._, .v_ps, .shuf, .tmp2x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, // tmp2 = lane 1 of dst0 in every lane
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp2x, ._ }, // lane 0 = lane 0 + lane 1: the full sum
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, // convert back to f16 (lane 0 is the result); rounding operand comes from .rm(.{})
+                        } },
+                    }, .{
+                        .required_features = .{ .avx512f, .f16c, null, null }, // NOTE(review): presumably one lowering of the optimized float `@reduce(.Add)` named in the commit message, for f16 elements; every instruction below operates on at most 256 bits at a time
+                        .dst_constraints = .{ .{ .float = .word }, .any }, // result is one 16-bit float
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .zword, .is = .word } }, .any, .any }, // 16-bit float elements in a 512-bit vector; NOTE(review): "exact" presumably means every lane is a real element, which would explain why no mask is applied - confirm
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // source is read from memory in two 32-byte halves
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, // tmp0: upper 32 source bytes / their partial sums, later scratch
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, // tmp1: scratch for extracted upper 128-bit halves
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused }, // result is produced in an SSE-class register
+                        .each = .{ .once = &.{
+                            .{ ._, .v_ps, .mova, .tmp0y, .memd(.src0y, 32), ._, ._ }, // tmp0 = source bytes 32..63
+                            .{ ._, .v_ps, .mova, .dst0y, .mem(.src0y), ._, ._ }, // dst0 = source bytes 0..31
+                            .{ ._, .v_f128, .extract, .tmp1x, .tmp0y, .ui(1), ._ }, // tmp1 = upper 8 halves of tmp0
+                            .{ ._, .v_ps, .cvtph2, .tmp0y, .tmp0x, ._, ._ }, // widen tmp0's lower 8 halves to f32
+                            .{ ._, .v_ps, .cvtph2, .tmp1y, .tmp1x, ._, ._ }, // widen tmp0's former upper 8 halves to f32
+                            .{ ._, .v_ps, .add, .tmp0y, .tmp0y, .tmp1y, ._ }, // tmp0 = 8 partial sums of source bytes 32..63
+                            .{ ._, .v_f128, .extract, .tmp1x, .dst0y, .ui(1), ._ }, // tmp1 = upper 8 halves of dst0
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ }, // widen dst0's lower 8 halves to f32
+                            .{ ._, .v_ps, .cvtph2, .tmp1y, .tmp1x, ._, ._ }, // widen dst0's former upper 8 halves to f32
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp1y, ._ }, // dst0 = 8 partial sums of source bytes 0..31
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp0y, ._ }, // combine both halves: 8 partial sums of all 32 elements
+                            .{ ._, .v_f128, .extract, .tmp0x, .dst0y, .ui(1), ._ }, // tmp0 = upper 128 bits of dst0
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp0x, ._ }, // 8 partial sums -> 4
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b11_10_11_10) }, // tmp0 = lanes {2,3,2,3} of dst0
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp0x, ._ }, // 4 partial sums -> 2 (lanes 0 and 1)
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, // tmp0 = lane 1 of dst0 in every lane
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp0x, ._ }, // lane 0 = lane 0 + lane 1: the full sum
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, // convert back to f16 (lane 0 is the result); rounding operand comes from .rm(.{})
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, .fast_hops, null, null }, // NOTE(review): presumably one lowering of the optimized float `@reduce(.Add)` named in the commit message, for f16 vectors spanning several 16-byte chunks; gated on F16C plus fast horizontal ops
+                        .dst_constraints = .{ .{ .float = .word }, .any }, // result is one 16-bit float
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .xword, .is = .word } }, .any, .any }, // 16-bit float elements; NOTE(review): presumably the unpadded size is a whole number of 128-bit chunks - confirm
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // source is walked chunk by chunk in memory
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: byte offset of the chunk being added
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, // tmp1: widened chunk inside the loop, scratch afterwards
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused }, // result is produced in an SSE-class register
+                        .clobbers = .{ .eflags = true }, // the loop's sub writes the flags
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ }, // NOTE(review): assumes this evaluates to src0's unpadded byte size minus 32, i.e. the offset of the second-to-last chunk - confirm
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .memad(.src0x, .add_unaligned_size, -16), ._, ._ }, // accumulator starts as the last 16-byte chunk widened to 8 f32 (same assumption about the size term)
+                            .{ .@"0:", .v_ps, .cvtph2, .tmp1y, .memi(.src0x, .tmp0), ._, ._ }, // loop: widen the 16-byte chunk at offset tmp0
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp1y, ._ }, // accumulate it lane-wise
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, // step back one chunk
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // repeat until the subtraction borrows (offset dropped below 0)
+                            .{ ._, .vh_ps, .add, .dst0y, .dst0y, .dst0y, ._ }, // horizontal add: pairwise sums within each 128-bit half
+                            .{ ._, .vh_ps, .add, .dst0y, .dst0y, .dst0y, ._ }, // horizontal add again: each 128-bit half now holds its own 4-lane total
+                            .{ ._, .v_f128, .extract, .tmp1x, .dst0y, .ui(1), ._ }, // tmp1 = upper 128 bits of dst0
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp1x, ._ }, // lane 0 = low-half total + high-half total
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, // convert back to f16 (lane 0 is the result); rounding operand comes from .rm(.{})
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, null, null, null }, // NOTE(review): presumably one lowering of the optimized float `@reduce(.Add)` named in the commit message, for f16 vectors spanning several 16-byte chunks; needs only F16C, so the final reduction uses shuffles
+                        .dst_constraints = .{ .{ .float = .word }, .any }, // result is one 16-bit float
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .xword, .is = .word } }, .any, .any }, // 16-bit float elements; NOTE(review): presumably the unpadded size is a whole number of 128-bit chunks - confirm
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // source is walked chunk by chunk in memory
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: byte offset of the chunk being added
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, // tmp1: widened chunk inside the loop, scratch afterwards
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused }, // result is produced in an SSE-class register
+                        .clobbers = .{ .eflags = true }, // the loop's sub writes the flags
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ }, // NOTE(review): assumes this evaluates to src0's unpadded byte size minus 32, i.e. the offset of the second-to-last chunk - confirm
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .memad(.src0x, .add_unaligned_size, -16), ._, ._ }, // accumulator starts as the last 16-byte chunk widened to 8 f32 (same assumption about the size term)
+                            .{ .@"0:", .v_ps, .cvtph2, .tmp1y, .memi(.src0x, .tmp0), ._, ._ }, // loop: widen the 16-byte chunk at offset tmp0
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp1y, ._ }, // accumulate it lane-wise
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, // step back one chunk
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // repeat until the subtraction borrows (offset dropped below 0)
+                            .{ ._, .v_f128, .extract, .tmp1x, .dst0y, .ui(1), ._ }, // tmp1 = upper 128 bits of dst0
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp1x, ._ }, // 8 partial sums -> 4
+                            .{ ._, .v_ps, .shuf, .tmp1x, .dst0x, .dst0x, .ui(0b11_10_11_10) }, // tmp1 = lanes {2,3,2,3} of dst0
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp1x, ._ }, // 4 partial sums -> 2 (lanes 0 and 1)
+                            .{ ._, .v_ps, .shuf, .tmp1x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, // tmp1 = lane 1 of dst0 in every lane
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp1x, ._ }, // lane 0 = lane 0 + lane 1: the full sum
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, // convert back to f16 (lane 0 is the result); rounding operand comes from .rm(.{})
+                        } },
+                    }, .{
+                        .required_features = .{ .avx512f, .f16c, null, null }, // NOTE(review): presumably one lowering of the optimized float `@reduce(.Add)` named in the commit message, for f16 vectors spanning several 64-byte units; every instruction below operates on at most 256 bits at a time
+                        .dst_constraints = .{ .{ .float = .word }, .any }, // result is one 16-bit float
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .zword, .is = .word } }, .any, .any }, // 16-bit float elements; NOTE(review): presumably the size is a whole number of 512-bit units - confirm
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // source is walked chunk by chunk in memory
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: first holds the mask address (as tmp0p), then the loop's byte offset (as tmp0d)
+                            .{ .type = .vector_64_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: 64-byte AND mask in memory, derived from src0; presumably clears padding lanes in the final 64 bytes - confirm
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, // tmp2: masked tail half / widened loop chunk, later scratch
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, // tmp3: scratch for extracted upper 128-bit halves
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused }, // result is produced in an SSE-class register
+                        .clobbers = .{ .eflags = true }, // the loop's sub writes the flags
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of the mask
+                            .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ }, // dst0 = mask bytes 0..31
+                            .{ ._, .v_ps, .mova, .tmp2y, .lead(.tmp0y, 32), ._, ._ }, // tmp2 = mask bytes 32..63
+                            .{ ._, ._, .mov, .tmp0d, .sia(-80, .src0, .add_size), ._, ._ }, // NOTE(review): assumes this evaluates to src0's byte size minus 80, i.e. the 16-byte chunk just below the final 64 bytes - confirm
+                            .{ ._, .v_ps, .@"and", .tmp2y, .tmp2y, .memad(.src0y, .add_size, -32), ._ }, // tmp2 = masked final 32 source bytes (same assumption about the size term)
+                            .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -64), ._ }, // dst0 = masked 32 source bytes before those
+                            .{ ._, .v_f128, .extract, .tmp3x, .tmp2y, .ui(1), ._ }, // tmp3 = upper 8 halves of tmp2
+                            .{ ._, .v_ps, .cvtph2, .tmp2y, .tmp2x, ._, ._ }, // widen tmp2's lower 8 halves to f32
+                            .{ ._, .v_ps, .cvtph2, .tmp3y, .tmp3x, ._, ._ }, // widen tmp2's former upper 8 halves to f32
+                            .{ ._, .v_ps, .add, .tmp2y, .tmp2y, .tmp3y, ._ }, // tmp2 = 8 partial sums of the final 32 bytes
+                            .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ }, // tmp3 = upper 8 halves of dst0
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ }, // widen dst0's lower 8 halves to f32
+                            .{ ._, .v_ps, .cvtph2, .tmp3y, .tmp3x, ._, ._ }, // widen dst0's former upper 8 halves to f32
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp3y, ._ }, // dst0 = 8 partial sums of the preceding 32 bytes
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp2y, ._ }, // accumulator = 8 partial sums of the masked final 64 bytes
+                            .{ .@"0:", .v_ps, .cvtph2, .tmp2y, .memi(.src0x, .tmp0), ._, ._ }, // loop: widen the 16-byte chunk at offset tmp0
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp2y, ._ }, // accumulate it lane-wise
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, // step back one chunk
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // repeat until the subtraction borrows (offset dropped below 0)
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, // tmp2 = upper 128 bits of dst0
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ }, // 8 partial sums -> 4
+                            .{ ._, .v_ps, .shuf, .tmp2x, .dst0x, .dst0x, .ui(0b11_10_11_10) }, // tmp2 = lanes {2,3,2,3} of dst0
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ }, // 4 partial sums -> 2 (lanes 0 and 1)
+                            .{ ._, .v_ps, .shuf, .tmp2x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, // tmp2 = lane 1 of dst0 in every lane
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp2x, ._ }, // lane 0 = lane 0 + lane 1: the full sum
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, // convert back to f16 (lane 0 is the result); rounding operand comes from .rm(.{})
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, .fast_hops, null, null }, // NOTE(review): presumably one lowering of the optimized float `@reduce(.Add)` named in the commit message, for f16 vectors spanning several 32-byte units; gated on F16C plus fast horizontal ops
+                        .dst_constraints = .{ .{ .float = .word }, .any }, // result is one 16-bit float
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .yword, .is = .word } }, .any, .any }, // 16-bit float elements; NOTE(review): presumably the size is a whole number of 256-bit units - confirm
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // source is walked chunk by chunk in memory
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: first holds the mask address (as tmp0p), then the loop's byte offset (as tmp0d)
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: 32-byte AND mask in memory, derived from src0; presumably clears padding lanes in the final 32 bytes - confirm
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, // tmp2: upper tail half / widened loop chunk, later scratch
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused }, // result is produced in an SSE-class register
+                        .clobbers = .{ .eflags = true }, // the loop's sub writes the flags
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of the mask
+                            .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ }, // dst0 = the 32-byte mask
+                            .{ ._, ._, .mov, .tmp0d, .sia(-48, .src0, .add_size), ._, ._ }, // NOTE(review): assumes this evaluates to src0's byte size minus 48, i.e. the 16-byte chunk just below the final 32 bytes - confirm
+                            .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -32), ._ }, // dst0 = masked final 32 source bytes (same assumption about the size term)
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, // tmp2 = upper 8 halves of dst0
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ }, // widen the lower 8 halves to f32
+                            .{ ._, .v_ps, .cvtph2, .tmp2y, .tmp2x, ._, ._ }, // widen the upper 8 halves to f32
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp2y, ._ }, // accumulator = 8 partial sums of the masked final 32 bytes
+                            .{ .@"0:", .v_ps, .cvtph2, .tmp2y, .memi(.src0x, .tmp0), ._, ._ }, // loop: widen the 16-byte chunk at offset tmp0
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp2y, ._ }, // accumulate it lane-wise
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, // step back one chunk
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // repeat until the subtraction borrows (offset dropped below 0)
+                            .{ ._, .vh_ps, .add, .dst0y, .dst0y, .dst0y, ._ }, // horizontal add: pairwise sums within each 128-bit half
+                            .{ ._, .vh_ps, .add, .dst0y, .dst0y, .dst0y, ._ }, // horizontal add again: each 128-bit half now holds its own 4-lane total
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, // tmp2 = upper 128 bits of dst0
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp2x, ._ }, // lane 0 = low-half total + high-half total
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, // convert back to f16 (lane 0 is the result); rounding operand comes from .rm(.{})
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, null, null, null }, // NOTE(review): presumably one lowering of the optimized float `@reduce(.Add)` named in the commit message, for f16 vectors spanning several 32-byte units; needs only F16C, so the final reduction uses shuffles
+                        .dst_constraints = .{ .{ .float = .word }, .any }, // result is one 16-bit float
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .yword, .is = .word } }, .any, .any }, // 16-bit float elements; NOTE(review): presumably the size is a whole number of 256-bit units - confirm
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // source is walked chunk by chunk in memory
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: first holds the mask address (as tmp0p), then the loop's byte offset (as tmp0d)
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: 32-byte AND mask in memory, derived from src0; presumably clears padding lanes in the final 32 bytes - confirm
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, // tmp2: upper tail half / widened loop chunk, later scratch
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused }, // result is produced in an SSE-class register
+                        .clobbers = .{ .eflags = true }, // the loop's sub writes the flags
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of the mask
+                            .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ }, // dst0 = the 32-byte mask
+                            .{ ._, ._, .mov, .tmp0d, .sia(-48, .src0, .add_size), ._, ._ }, // NOTE(review): assumes this evaluates to src0's byte size minus 48, i.e. the 16-byte chunk just below the final 32 bytes - confirm
+                            .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -32), ._ }, // dst0 = masked final 32 source bytes (same assumption about the size term)
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, // tmp2 = upper 8 halves of dst0
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ }, // widen the lower 8 halves to f32
+                            .{ ._, .v_ps, .cvtph2, .tmp2y, .tmp2x, ._, ._ }, // widen the upper 8 halves to f32
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp2y, ._ }, // accumulator = 8 partial sums of the masked final 32 bytes
+                            .{ .@"0:", .v_ps, .cvtph2, .tmp2y, .memi(.src0x, .tmp0), ._, ._ }, // loop: widen the 16-byte chunk at offset tmp0
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp2y, ._ }, // accumulate it lane-wise
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, // step back one chunk
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // repeat until the subtraction borrows (offset dropped below 0)
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, // tmp2 = upper 128 bits of dst0
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ }, // 8 partial sums -> 4
+                            .{ ._, .v_ps, .shuf, .tmp2x, .dst0x, .dst0x, .ui(0b11_10_11_10) }, // tmp2 = lanes {2,3,2,3} of dst0
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ }, // 4 partial sums -> 2 (lanes 0 and 1)
+                            .{ ._, .v_ps, .shuf, .tmp2x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, // tmp2 = lane 1 of dst0 in every lane
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp2x, ._ }, // lane 0 = lane 0 + lane 1: the full sum
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, // convert back to f16 (lane 0 is the result); rounding operand comes from .rm(.{})
+                        } },
+                    }, .{
+                        // AVX without F16C: add-reduce f16 elements one at a time by calling `__addhf3`.
+                        // The running sum lives in xmm0 (dst0) and each element is placed in xmm1 (tmp1);
+                        // presumably these are the routine's argument/return registers — TODO confirm.
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .word, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .call_frame = .{ .alignment = .@"16" },
+                        .extra_temps = .{
+                            // tmp0: byte offset of the next element to add.
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            // tmp1: the element passed alongside the running sum.
+                            .{ .type = .f16, .kind = .{ .reg = .xmm1 } },
+                            // tmp2: the `__addhf3` symbol that is called.
+                            .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__addhf3" } } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .reg = .xmm0 }, .unused },
+                        .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                        .each = .{ .once = &.{
+                            // Seed the sum with the last element (offset size - 2), zero-extended in xmm0.
+                            .{ ._, .vp_, .xor, .dst0x, .dst0x, .dst0x, ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-4, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, .vp_w, .insr, .dst0x, .dst0x, .memad(.src0w, .add_unaligned_size, -2), .ui(0) },
+                            // Loop: load the element at src0 + tmp0 into xmm1, call, step down by 2 bytes
+                            // until the subtraction borrows.
+                            .{ .@"0:", .vp_, .xor, .tmp1x, .tmp1x, .tmp1x, ._ },
+                            .{ ._, .vp_w, .insr, .tmp1x, .tmp1x, .memi(.src0w, .tmp0), .ui(0) },
+                            .{ ._, ._, .call, .tmp2d, ._, ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(2), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                        } },
+                    }, .{
+                        // SSE2: same element-by-element `__addhf3` loop as the AVX entry, written with the
+                        // two-operand (destructive) instruction forms.
+                        .required_features = .{ .sse2, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .word, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .call_frame = .{ .alignment = .@"16" },
+                        .extra_temps = .{
+                            // tmp0: byte offset of the next element to add.
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            // tmp1: the element passed alongside the running sum in xmm0.
+                            .{ .type = .f16, .kind = .{ .reg = .xmm1 } },
+                            // tmp2: the `__addhf3` symbol that is called.
+                            .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__addhf3" } } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .reg = .xmm0 }, .unused },
+                        .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                        .each = .{ .once = &.{
+                            // Seed the sum with the last element (offset size - 2), zero-extended in xmm0.
+                            .{ ._, .p_, .xor, .dst0x, .dst0x, ._, ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-4, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, .p_w, .insr, .dst0x, .memad(.src0w, .add_unaligned_size, -2), .ui(0), ._ },
+                            // Loop: load the element at src0 + tmp0 into xmm1, call, step down by 2 bytes
+                            // until the subtraction borrows.
+                            .{ .@"0:", .p_, .xor, .tmp1x, .tmp1x, ._, ._ },
+                            .{ ._, .p_w, .insr, .tmp1x, .memi(.src0w, .tmp0), .ui(0), ._ },
+                            .{ ._, ._, .call, .tmp2d, ._, ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(2), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                        } },
+                    }, .{
+                        // SSE only: `__addhf3` loop where the first element is bounced through a
+                        // general-purpose register (ax) and a memory temporary before reaching xmm0.
+                        .required_features = .{ .sse, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .word, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .call_frame = .{ .alignment = .@"16" },
+                        .extra_temps = .{
+                            // tmp0: byte offset of the next element to add.
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            // tmp1: GPR staging for the zero-extended last element.
+                            .{ .type = .f16, .kind = .{ .reg = .ax } },
+                            // tmp2: 4-byte memory slot used to move tmp1 into an SSE register.
+                            .{ .type = .f32, .kind = .mem },
+                            // tmp3: the element passed alongside the running sum in xmm0.
+                            .{ .type = .f16, .kind = .{ .reg = .xmm1 } },
+                            // tmp4: the `__addhf3` symbol that is called.
+                            .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__addhf3" } } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .reg = .xmm0 }, .unused },
+                        .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                        .each = .{ .once = &.{
+                            // Seed xmm0 with the last element: zero-extend into tmp1, spill to tmp2, reload.
+                            .{ ._, ._ps, .xor, .dst0x, .dst0x, ._, ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-4, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, ._, .movzx, .tmp1d, .memad(.src0w, .add_unaligned_size, -2), ._, ._ },
+                            .{ ._, ._, .mov, .mem(.tmp2d), .tmp1d, ._, ._ },
+                            .{ ._, ._ss, .mov, .dst0x, .mem(.tmp2d), ._, ._ },
+                            // Loop: load four bytes at src0 + tmp0 into xmm1, call, step down by 2 bytes.
+                            // NOTE(review): the dword load also brings in the following element in bits
+                            // 16..31; presumably the callee only reads the low 16 bits — confirm.
+                            .{ .@"0:", ._ps, .xor, .tmp3x, .tmp3x, ._, ._ },
+                            .{ ._, ._ss, .mov, .tmp3x, .memi(.src0d, .tmp0), ._, ._ },
+                            .{ ._, ._, .call, .tmp4d, ._, ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(2), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                        } },
+                    }, .{
+                        // AVX with fast horizontal ops: f32 source of exactly qword size (two lanes).
+                        // A single horizontal add leaves s0 + s1 in lane 0 of dst0.
+                        .required_features = .{ .avx, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .qword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .vh_ps, .add, .dst0x, .src0x, .src0x, ._ },
+                        } },
+                    }, .{
+                        // AVX fallback for an f32 source of exactly qword size (two lanes):
+                        // broadcast lane 1 into tmp0, then scalar-add it to lane 0.
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .qword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            // tmp0: holds the shuffled copy of src0.
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            // tmp0[0] = s1
+                            .{ ._, .v_ps, .shuf, .tmp0x, .src0x, .src0x, .ui(0b01_01_01_01) },
+                            // dst0[0] = s0 + s1
+                            .{ ._, .v_ss, .add, .dst0x, .src0x, .tmp0x, ._ },
+                        } },
+                    }, .{
+                        // SSE3 with fast horizontal ops: f32 source of exactly qword size (two lanes).
+                        // dst0 reuses src0's register, so the two-operand horizontal add works in place.
+                        .required_features = .{ .sse3, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .qword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mut_sse, .none, .none } },
+                        },
+                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .h_ps, .add, .dst0x, .src0x, ._, ._ },
+                        } },
+                    }, .{
+                        // SSE fallback for an f32 source of exactly qword size (two lanes).
+                        // dst0 reuses src0's register; lane 1 is copied out via tmp0 and added to lane 0.
+                        .required_features = .{ .sse, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .qword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mut_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            // tmp0: copy of src0 that gets shuffled.
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                        .each = .{ .once = &.{
+                            // tmp0 = src0, then tmp0[0] = s1
+                            .{ ._, ._ps, .mova, .tmp0x, .src0x, ._, ._ },
+                            .{ ._, ._ps, .shuf, .tmp0x, .tmp0x, .ui(0b01_01_01_01), ._ },
+                            // dst0[0] = s0 + s1
+                            .{ ._, ._ss, .add, .dst0x, .tmp0x, ._, ._ },
+                        } },
+                    }, .{
+                        // AVX with fast horizontal ops: f32 source matching `exclusive_scalar_float` of
+                        // xword. The sequence sums lanes 0, 1 and 2 only, i.e. a three-element vector.
+                        .required_features = .{ .avx, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            // tmp0: holds the horizontal-add result.
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            // tmp0[0] = s0 + s1
+                            .{ ._, .vh_ps, .add, .tmp0x, .src0x, .src0x, ._ },
+                            // dst0[0] = s2
+                            .{ ._, .v_ps, .movhl, .dst0x, .src0x, .src0x, ._ },
+                            // dst0[0] = (s0 + s1) + s2
+                            .{ ._, .v_ss, .add, .dst0x, .tmp0x, .dst0x, ._ },
+                        } },
+                    }, .{
+                        // AVX fallback for an f32 source matching `exclusive_scalar_float` of xword.
+                        // Only lanes 0, 1 and 2 contribute to the value left in lane 0 of dst0.
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            // tmp0: holds the s0 + s2 partial sum.
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            // tmp0[0] = s2
+                            .{ ._, .v_ps, .movhl, .tmp0x, .src0x, .src0x, ._ },
+                            // tmp0[0] = s0 + s2 (packed add; the other lanes are not used afterwards)
+                            .{ ._, .v_ps, .add, .tmp0x, .src0x, .tmp0x, ._ },
+                            // dst0[0] = s1
+                            .{ ._, .v_ps, .shuf, .dst0x, .src0x, .src0x, .ui(0b01_01_01_01) },
+                            // dst0[0] = (s0 + s2) + s1
+                            .{ ._, .v_ss, .add, .dst0x, .tmp0x, .dst0x, ._ },
+                        } },
+                    }, .{
+                        // SSE3 with fast horizontal ops: f32 source matching `exclusive_scalar_float` of
+                        // xword. dst0 reuses src0's register; lanes 0, 1 and 2 are summed into lane 0.
+                        .required_features = .{ .sse3, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mut_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            // tmp0: receives the high half of src0 before src0 is overwritten.
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                        .each = .{ .once = &.{
+                            // Zero tmp0 first: the two-operand movhl only writes its low half.
+                            .{ ._, ._ps, .xor, .tmp0x, .tmp0x, ._, ._ },
+                            // tmp0[0] = s2
+                            .{ ._, ._ps, .movhl, .tmp0x, .src0x, ._, ._ },
+                            // dst0[0] = s0 + s1
+                            .{ ._, .h_ps, .add, .dst0x, .src0x, ._, ._ },
+                            // dst0[0] = (s0 + s1) + s2
+                            .{ ._, ._ss, .add, .dst0x, .tmp0x, ._, ._ },
+                        } },
+                    }, .{
+                        // SSE fallback for an f32 source matching `exclusive_scalar_float` of xword.
+                        // dst0 reuses src0's register; lanes 0, 1 and 2 are summed into lane 0.
+                        .required_features = .{ .sse, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mut_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            // tmp0: accumulates s2 + s0 while src0 is still intact.
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                        .each = .{ .once = &.{
+                            // Zero tmp0 first: the two-operand movhl only writes its low half.
+                            .{ ._, ._ps, .xor, .tmp0x, .tmp0x, ._, ._ },
+                            // tmp0[0] = s2
+                            .{ ._, ._ps, .movhl, .tmp0x, .src0x, ._, ._ },
+                            // tmp0[0] = s2 + s0
+                            .{ ._, ._ss, .add, .tmp0x, .src0x, ._, ._ },
+                            // dst0[0] = s1 (this overwrites src0, so it must come after the reads above)
+                            .{ ._, ._ps, .shuf, .dst0x, .src0x, .ui(0b01_01_01_01), ._ },
+                            // dst0[0] = s1 + (s2 + s0)
+                            .{ ._, ._ss, .add, .dst0x, .tmp0x, ._, ._ },
+                        } },
+                    }, .{
+                        // AVX with fast horizontal ops: f32 source of exactly xword size (four lanes).
+                        // Two horizontal adds leave s0 + s1 + s2 + s3 in lane 0 of dst0.
+                        .required_features = .{ .avx, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            // dst0 = { s0+s1, s2+s3, s0+s1, s2+s3 }
+                            .{ ._, .vh_ps, .add, .dst0x, .src0x, .src0x, ._ },
+                            // dst0[0] = (s0+s1) + (s2+s3)
+                            .{ ._, .vh_ps, .add, .dst0x, .dst0x, .dst0x, ._ },
+                        } },
+                    }, .{
+                        // AVX fallback for an f32 source of exactly xword size (four lanes):
+                        // fold the high half onto the low half, then fold lane 1 onto lane 0.
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            // tmp0: scratch for the shuffled halves/lanes.
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            // tmp0[0..1] = { s2, s3 }
+                            .{ ._, .v_ps, .movhl, .tmp0x, .src0x, .src0x, ._ },
+                            // dst0[0..1] = { s0+s2, s1+s3 }
+                            .{ ._, .v_ps, .add, .dst0x, .src0x, .tmp0x, ._ },
+                            // From here on read the partial sums in dst0, not src0: dst0 is a `mut_rc`
+                            // temp and need not share src0's register, in which case reading src0 would
+                            // discard the first add and yield only s0 + s1.
+                            // tmp0[0] = s1+s3
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            // dst0[0] = (s0+s2) + (s1+s3)
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp0x, ._ },
+                        } },
+                    }, .{
+                        // SSE3 with fast horizontal ops: f32 source of exactly xword size (four lanes).
+                        // dst0 reuses src0's register; two in-place horizontal adds sum all four lanes.
+                        .required_features = .{ .sse3, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mut_sse, .none, .none } },
+                        },
+                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                        .each = .{ .once = &.{
+                            // dst0 = { s0+s1, s2+s3, s0+s1, s2+s3 }
+                            .{ ._, .h_ps, .add, .dst0x, .src0x, ._, ._ },
+                            // dst0[0] = (s0+s1) + (s2+s3)
+                            .{ ._, .h_ps, .add, .dst0x, .dst0x, ._, ._ },
+                        } },
+                    }, .{
+                        // SSE fallback for an f32 source of exactly xword size (four lanes).
+                        // dst0 reuses src0's register; the high half is folded onto the low half, then
+                        // lane 1 onto lane 0.
+                        .required_features = .{ .sse, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mut_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            // tmp0: scratch for the shuffled halves/lanes.
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                        .each = .{ .once = &.{
+                            // Zero tmp0 first: the two-operand movhl only writes its low half.
+                            .{ ._, ._ps, .xor, .tmp0x, .tmp0x, ._, ._ },
+                            // tmp0[0..1] = { s2, s3 }
+                            .{ ._, ._ps, .movhl, .tmp0x, .src0x, ._, ._ },
+                            // dst0[0..1] = { s0+s2, s1+s3 }
+                            .{ ._, ._ps, .add, .dst0x, .tmp0x, ._, ._ },
+                            // tmp0[0] = s1+s3
+                            .{ ._, ._ps, .mova, .tmp0x, .dst0x, ._, ._ },
+                            .{ ._, ._ps, .shuf, .tmp0x, .tmp0x, .ui(0b01_01_01_01), ._ },
+                            // dst0[0] = (s0+s2) + (s1+s3)
+                            .{ ._, ._ss, .add, .dst0x, .tmp0x, ._, ._ },
+                        } },
+                    }, .{
+                        // AVX with fast horizontal ops: f32 source matching `exclusive_scalar_float` of
+                        // yword. The source is ANDed with a mask before all eight lanes are summed
+                        // (presumably the mask zeroes the lanes past the vector's length — TODO confirm).
+                        .required_features = .{ .avx, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .yword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            // tmp0: address of the mask constant.
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            // tmp1: mask constant tied to src0.
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            // tmp2: receives the upper 128 bits for the final add.
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            // dst0 = src0 & mask
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_ps, .@"and", .dst0y, .src0y, .lea(.tmp0y), ._ },
+                            // Two horizontal adds sum the four lanes within each 128-bit half.
+                            .{ ._, .vh_ps, .add, .dst0y, .dst0y, .dst0y, ._ },
+                            .{ ._, .vh_ps, .add, .dst0y, .dst0y, .dst0y, ._ },
+                            // Add the upper half's sum to the lower half's sum.
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                        } },
+                    }, .{
+                        // AVX fallback for an f32 source matching `exclusive_scalar_float` of yword.
+                        // The source is ANDed with a mask, then folded 8 -> 4 -> 2 -> 1 lanes
+                        // (presumably the mask zeroes the lanes past the vector's length — TODO confirm).
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .yword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            // tmp0: address of the mask constant.
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            // tmp1: mask constant tied to src0.
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            // tmp2: scratch for each fold step.
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            // dst0 = src0 & mask
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_ps, .@"and", .dst0y, .src0y, .lea(.tmp0y), ._ },
+                            // 8 -> 4: add the upper 128 bits onto the lower 128 bits.
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                            // 4 -> 2: add lanes 2..3 onto lanes 0..1.
+                            .{ ._, .v_ps, .movhl, .tmp2x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                            // 2 -> 1: add lane 1 onto lane 0.
+                            .{ ._, .v_ps, .shuf, .tmp2x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                        } },
+                    }, .{
+                        // AVX with fast horizontal ops: f32 source of exactly yword size (eight lanes).
+                        // Two horizontal adds sum the four lanes within each 128-bit half; the two half
+                        // sums are then added together into lane 0 of dst0.
+                        .required_features = .{ .avx, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .yword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            // tmp0: NOTE(review) not referenced by the instructions below; kept so the
+                            // temp numbering is unchanged.
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            // tmp1: receives the upper 128 bits for the final add.
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            // The first add must read src0: dst0 is a `mut_rc` temp and need not share
+                            // src0's register, so it may hold no meaningful value yet.
+                            .{ ._, .vh_ps, .add, .dst0y, .src0y, .src0y, ._ },
+                            .{ ._, .vh_ps, .add, .dst0y, .dst0y, .dst0y, ._ },
+                            // Combine the per-half sums held in dst0 (not the original lanes of src0).
+                            .{ ._, .v_f128, .extract, .tmp1x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp1x, ._ },
+                        } },
+                    }, .{
+                        // AVX fallback for an f32 source of exactly yword size (eight lanes):
+                        // fold 8 -> 4 -> 2 -> 1 lanes, leaving the sum in lane 0 of dst0.
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .yword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            // tmp0: NOTE(review) not referenced by the instructions below.
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            // tmp1: scratch for each fold step.
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            // 8 -> 4: add the upper 128 bits of src0 onto its lower 128 bits.
+                            .{ ._, .v_f128, .extract, .tmp1x, .src0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .add, .dst0x, .src0x, .tmp1x, ._ },
+                            // 4 -> 2: add lanes 2..3 onto lanes 0..1.
+                            .{ ._, .v_ps, .movhl, .tmp1x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp1x, ._ },
+                            // 2 -> 1: add lane 1 onto lane 0.
+                            .{ ._, .v_ps, .shuf, .tmp1x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp1x, ._ },
+                        } },
+                    }, .{
+                        // AVX-512F: f32 source in memory matching `exclusive_scalar_float` of zword.
+                        // Both 32-byte halves are loaded and ANDed with the matching half of a 64-byte
+                        // mask (presumably zeroing lanes past the vector's length — TODO confirm), added
+                        // together, then folded 8 -> 4 -> 2 -> 1 lanes.
+                        .required_features = .{ .avx512f, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .zword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            // tmp0: address of the mask constant.
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            // tmp1: 64-byte mask constant tied to src0.
+                            .{ .type = .vector_64_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            // tmp2: second 32-byte half, then scratch for each fold step.
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .each = .{ .once = &.{
+                            // Load both halves of src0 and mask each with its half of the mask.
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_ps, .mova, .dst0y, .mem(.src0y), ._, ._ },
+                            .{ ._, .v_ps, .mova, .tmp2y, .memd(.src0y, 32), ._, ._ },
+                            .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .lea(.tmp0y), ._ },
+                            .{ ._, .v_ps, .@"and", .tmp2y, .tmp2y, .lead(.tmp0y, 32), ._ },
+                            // 16 -> 8 lanes.
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp2y, ._ },
+                            // 8 -> 4 lanes.
+                            .{ ._, .v_i128, .extract, .tmp2x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                            // 4 -> 2 lanes.
+                            .{ ._, .v_ps, .movhl, .tmp2x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                            // 2 -> 1 lane.
+                            .{ ._, .v_ps, .shuf, .tmp2x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                        } },
+                    }, .{
+                        // AVX-512F: f32 source in memory of exactly zword size (sixteen lanes).
+                        // The two 32-byte halves are added, then folded 8 -> 4 -> 2 -> 1 lanes.
+                        .required_features = .{ .avx512f, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .zword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            // tmp0: second 32-byte half, then scratch for each fold step.
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .each = .{ .once = &.{
+                            // 16 -> 8 lanes: add the two 32-byte halves of src0.
+                            .{ ._, .v_ps, .mova, .dst0y, .mem(.src0y), ._, ._ },
+                            .{ ._, .v_ps, .mova, .tmp0y, .memd(.src0y, 32), ._, ._ },
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp0y, ._ },
+                            // 8 -> 4 lanes.
+                            .{ ._, .v_i128, .extract, .tmp0x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp0x, ._ },
+                            // 4 -> 2 lanes.
+                            .{ ._, .v_ps, .movhl, .tmp0x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp0x, ._ },
+                            // 2 -> 1 lane.
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp0x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null }, // AVX entry used when horizontal adds are flagged fast. NOTE(review): presumably a float @reduce(.Add) lowering per the commit message; the enclosing arm is outside this view
+                        .dst_constraints = .{ .{ .float = .dword }, .any }, // result: one dword float
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .yword, .is = .dword } }, .any, .any }, // NOTE(review): presumably a whole number of ywords of dword floats, judged by unaligned size — confirm against the constraint definition
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // the source is addressed through memory
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: byte offset driving the loop
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, // tmp1: SSE-class scratch for the final fold
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true }, // the sub in the loop writes EFLAGS
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ }, // NOTE(review): presumably tmp0 = (unaligned size of src0) - 64, the offset of the second-to-last yword — confirm .sia semantics
+                            .{ ._, .v_ps, .mova, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ }, // seed the accumulator; presumably with the last yword of src0
+                            .{ .@"0:", .v_ps, .add, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ }, // loop head 0: accumulate the yword at src0 indexed by tmp0
+                            .{ ._, ._, .sub, .tmp0d, .si(32), ._, ._ }, // step the offset down by one yword (32 bytes)
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // back to 0: while the subtraction did not borrow
+                            .{ ._, .vh_ps, .add, .dst0y, .dst0y, .dst0y, ._ }, // horizontal add within each 128-bit half
+                            .{ ._, .vh_ps, .add, .dst0y, .dst0y, .dst0y, ._ }, // second horizontal add: each half now holds its own sum in every element
+                            .{ ._, .v_f128, .extract, .tmp1x, .dst0y, .ui(1), ._ }, // tmp1x = upper 128 bits of dst0y
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp1x, ._ }, // scalar add of the two half sums: element 0 = total
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null }, // plain AVX entry (no horizontal adds used). NOTE(review): presumably a float @reduce(.Add) lowering per the commit message; the enclosing arm is outside this view
+                        .dst_constraints = .{ .{ .float = .dword }, .any }, // result: one dword float
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .yword, .is = .dword } }, .any, .any }, // NOTE(review): presumably a whole number of ywords of dword floats, judged by unaligned size — confirm against the constraint definition
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // the source is addressed through memory
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: byte offset driving the loop
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, // tmp1: SSE-class scratch for the final fold
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true }, // the sub in the loop writes EFLAGS
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ }, // NOTE(review): presumably tmp0 = (unaligned size of src0) - 64 — confirm .sia semantics
+                            .{ ._, .v_ps, .mova, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ }, // seed the accumulator; presumably with the last yword of src0
+                            .{ .@"0:", .v_ps, .add, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ }, // loop head 0: accumulate the yword at src0 indexed by tmp0
+                            .{ ._, ._, .sub, .tmp0d, .si(32), ._, ._ }, // step the offset down by one yword (32 bytes)
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // back to 0: while the subtraction did not borrow
+                            .{ ._, .v_f128, .extract, .tmp1x, .dst0y, .ui(1), ._ }, // tmp1x = upper 128 bits of dst0y
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp1x, ._ }, // fold upper half into lower half: 4 partial sums remain
+                            .{ ._, .v_ps, .movhl, .tmp1x, .dst0x, .dst0x, ._ }, // tmp1x low qword = high qword of dst0x (elements 2,3)
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp1x, ._ }, // elements 0,1 now hold the 2 remaining partial sums
+                            .{ ._, .v_ps, .shuf, .tmp1x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, // every element of tmp1x = element 1 of dst0x
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp1x, ._ }, // scalar add: element 0 of dst0x = total
+                        } },
+                    }, .{
+                        .required_features = .{ .sse3, .fast_hops, null, null }, // SSE3 entry used when horizontal adds are flagged fast. NOTE(review): presumably a float @reduce(.Add) lowering per the commit message; the enclosing arm is outside this view
+                        .dst_constraints = .{ .{ .float = .dword }, .any }, // result: one dword float
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any }, // NOTE(review): presumably a whole number of xwords of dword floats, judged by unaligned size — confirm against the constraint definition
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // the source is addressed through memory
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: byte offset driving the loop
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true }, // the sub in the loop writes EFLAGS
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ }, // NOTE(review): presumably tmp0 = (unaligned size of src0) - 32, the offset of the second-to-last xword — confirm .sia semantics
+                            .{ ._, ._ps, .mova, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ }, // seed the accumulator; presumably with the last xword of src0
+                            .{ .@"0:", ._ps, .add, .dst0x, .memi(.src0x, .tmp0), ._, ._ }, // loop head 0: accumulate the xword at src0 indexed by tmp0
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, // step the offset down by one xword (16 bytes)
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // back to 0: while the subtraction did not borrow
+                            .{ ._, .h_ps, .add, .dst0x, .dst0x, ._, ._ }, // horizontal add: pairwise sums
+                            .{ ._, .h_ps, .add, .dst0x, .dst0x, ._, ._ }, // second horizontal add: element 0 = total
+                        } },
+                    }, .{
+                        .required_features = .{ .sse, null, null, null }, // baseline SSE entry (no horizontal adds used). NOTE(review): presumably a float @reduce(.Add) lowering per the commit message; the enclosing arm is outside this view
+                        .dst_constraints = .{ .{ .float = .dword }, .any }, // result: one dword float
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any }, // NOTE(review): presumably a whole number of xwords of dword floats, judged by unaligned size — confirm against the constraint definition
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // the source is addressed through memory
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: byte offset driving the loop
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, // tmp1: SSE-class scratch for the final fold
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true }, // the sub in the loop writes EFLAGS
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ }, // NOTE(review): presumably tmp0 = (unaligned size of src0) - 32 — confirm .sia semantics
+                            .{ ._, ._ps, .mova, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ }, // seed the accumulator; presumably with the last xword of src0
+                            .{ .@"0:", ._ps, .add, .dst0x, .memi(.src0x, .tmp0), ._, ._ }, // loop head 0: accumulate the xword at src0 indexed by tmp0
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, // step the offset down by one xword (16 bytes)
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // back to 0: while the subtraction did not borrow
+                            .{ ._, ._ps, .xor, .tmp1x, .tmp1x, ._, ._ }, // zero tmp1x; the two-operand movhl below only writes its low qword
+                            .{ ._, ._ps, .movhl, .tmp1x, .dst0x, ._, ._ }, // tmp1x low qword = high qword of dst0x (elements 2,3)
+                            .{ ._, ._ps, .add, .dst0x, .tmp1x, ._, ._ }, // elements 0,1 now hold the 2 remaining partial sums
+                            .{ ._, ._ps, .mova, .tmp1x, .dst0x, ._, ._ }, // copy so the shuffle does not disturb dst0x
+                            .{ ._, ._ps, .shuf, .tmp1x, .tmp1x, .ui(0b01_01_01_01), ._ }, // every element of tmp1x = its former element 1
+                            .{ ._, ._ss, .add, .dst0x, .tmp1x, ._, ._ }, // scalar add: element 0 of dst0x = total
+                        } },
+                    }, .{
+                        .required_features = .{ .avx512f, null, null, null }, // AVX-512F entry processing 64 bytes per iteration as two yword accumulators. NOTE(review): presumably a float @reduce(.Add) lowering per the commit message; the enclosing arm is outside this view
+                        .dst_constraints = .{ .{ .float = .dword }, .any }, // result: one dword float
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .zword, .is = .dword } }, .any, .any }, // NOTE(review): presumably a source spanning several zwords whose last zword may be partly padding — confirm against the constraint definition
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // the source is addressed through memory
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: first the mask address, later the loop byte offset
+                            .{ .type = .vector_64_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: 64-byte mask in memory. NOTE(review): presumably zeroes the lanes past the end of src0 — confirm pand_mask_mem semantics
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, // tmp2: second accumulator / scratch
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true }, // the sub in the loop writes EFLAGS
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of the mask
+                            .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ }, // dst0y = first 32 mask bytes
+                            .{ ._, .v_ps, .mova, .tmp2y, .lead(.tmp0y, 32), ._, ._ }, // tmp2y = mask bytes at displacement 32
+                            .{ ._, ._, .mov, .tmp0d, .sia(-128, .src0, .add_size), ._, ._ }, // tmp0 is reused as the loop offset. NOTE(review): presumably (size of src0) - 128 — confirm .sia semantics
+                            .{ ._, .v_ps, .@"and", .tmp2y, .tmp2y, .memad(.src0y, .add_size, -32), ._ }, // masked load; presumably the last yword of src0
+                            .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -64), ._ }, // masked load; presumably the second-to-last yword of src0
+                            .{ .@"0:", .v_ps, .add, .tmp2y, .tmp2y, .memid(.src0y, .tmp0, 32), ._ }, // loop head 0: accumulate the yword at index tmp0, displacement 32
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ }, // accumulate the yword at index tmp0
+                            .{ ._, ._, .sub, .tmp0d, .si(64), ._, ._ }, // step the offset down by 64 bytes
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // back to 0: while the subtraction did not borrow
+                            .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp2y, ._ }, // merge the two accumulators
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, // tmp2x = upper 128 bits of dst0y
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ }, // fold upper half into lower half: 4 partial sums remain
+                            .{ ._, .v_ps, .movhl, .tmp2x, .dst0x, .dst0x, ._ }, // tmp2x low qword = high qword of dst0x (elements 2,3)
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ }, // elements 0,1 now hold the 2 remaining partial sums
+                            .{ ._, .v_ps, .shuf, .tmp2x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, // every element of tmp2x = element 1 of dst0x
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp2x, ._ }, // scalar add: element 0 of dst0x = total
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null }, // AVX entry used when horizontal adds are flagged fast. NOTE(review): presumably a float @reduce(.Add) lowering per the commit message; the enclosing arm is outside this view
+                        .dst_constraints = .{ .{ .float = .dword }, .any }, // result: one dword float
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .yword, .is = .dword } }, .any, .any }, // NOTE(review): presumably a source spanning several ywords whose last yword may be partly padding — confirm against the constraint definition
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // the source is addressed through memory
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: first the mask address, later the loop byte offset
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: 32-byte mask in memory. NOTE(review): presumably zeroes the lanes past the end of src0 — confirm pand_mask_mem semantics
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, // tmp2: SSE-class scratch
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true }, // the sub in the loop writes EFLAGS
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of the mask
+                            .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ }, // dst0y = the 32 mask bytes
+                            .{ ._, ._, .mov, .tmp0d, .sia(-48, .src0, .add_size), ._, ._ }, // tmp0 is reused as the loop offset. NOTE(review): presumably (size of src0) - 48 — confirm .sia semantics
+                            .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -32), ._ }, // masked load; presumably the last yword of src0
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, // tmp2x = upper 128 bits of dst0y
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ }, // fold to a single xword accumulator before the loop
+                            .{ .@"0:", .v_ps, .add, .dst0x, .dst0x, .memi(.src0x, .tmp0), ._ }, // loop head 0: accumulate the xword at src0 indexed by tmp0
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, // step the offset down by one xword (16 bytes)
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // back to 0: while the subtraction did not borrow
+                            .{ ._, .vh_ps, .add, .dst0x, .dst0x, .dst0x, ._ }, // horizontal add: pairwise sums
+                            .{ ._, .vh_ps, .add, .dst0x, .dst0x, .dst0x, ._ }, // second horizontal add: element 0 = total
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null }, // plain AVX entry (no horizontal adds used). NOTE(review): presumably a float @reduce(.Add) lowering per the commit message; the enclosing arm is outside this view
+                        .dst_constraints = .{ .{ .float = .dword }, .any }, // result: one dword float
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .yword, .is = .dword } }, .any, .any }, // NOTE(review): presumably a source spanning several ywords whose last yword may be partly padding — confirm against the constraint definition
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // the source is addressed through memory
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: first the mask address, later the loop byte offset
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: 32-byte mask in memory. NOTE(review): presumably zeroes the lanes past the end of src0 — confirm pand_mask_mem semantics
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, // tmp2: SSE-class scratch
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true }, // the sub in the loop writes EFLAGS
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of the mask
+                            .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ }, // dst0y = the 32 mask bytes
+                            .{ ._, ._, .mov, .tmp0d, .sia(-48, .src0, .add_size), ._, ._ }, // tmp0 is reused as the loop offset. NOTE(review): presumably (size of src0) - 48 — confirm .sia semantics
+                            .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -32), ._ }, // masked load; presumably the last yword of src0
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, // tmp2x = upper 128 bits of dst0y
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ }, // fold to a single xword accumulator before the loop
+                            .{ .@"0:", .v_ps, .add, .dst0x, .dst0x, .memi(.src0x, .tmp0), ._ }, // loop head 0: accumulate the xword at src0 indexed by tmp0
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, // step the offset down by one xword (16 bytes)
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // back to 0: while the subtraction did not borrow
+                            .{ ._, .v_ps, .movhl, .tmp2x, .dst0x, .dst0x, ._ }, // tmp2x low qword = high qword of dst0x (elements 2,3)
+                            .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ }, // elements 0,1 now hold the 2 remaining partial sums
+                            .{ ._, .v_ps, .shuf, .tmp2x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, // every element of tmp2x = element 1 of dst0x
+                            .{ ._, .v_ss, .add, .dst0x, .dst0x, .tmp2x, ._ }, // scalar add: element 0 of dst0x = total
+                        } },
+                    }, .{
+                        .required_features = .{ .sse3, .fast_hops, null, null }, // SSE3 entry used when horizontal adds are flagged fast. NOTE(review): presumably a float @reduce(.Add) lowering per the commit message; the enclosing arm is outside this view
+                        .dst_constraints = .{ .{ .float = .dword }, .any }, // result: one dword float
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any }, // NOTE(review): presumably a source spanning several xwords whose last xword may be partly padding — confirm against the constraint definition
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // the source is addressed through memory
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: first the mask address, later the loop byte offset
+                            .{ .type = .vector_16_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: 16-byte mask in memory. NOTE(review): presumably zeroes the lanes past the end of src0 — confirm pand_mask_mem semantics
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true }, // the sub in the loop writes EFLAGS
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of the mask
+                            .{ ._, ._ps, .mova, .dst0x, .lea(.tmp0x), ._, ._ }, // dst0x = the 16 mask bytes
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_size), ._, ._ }, // tmp0 is reused as the loop offset. NOTE(review): presumably (size of src0) - 32 — confirm .sia semantics
+                            .{ ._, ._ps, .@"and", .dst0x, .memad(.src0x, .add_size, -16), ._, ._ }, // masked load; presumably the last xword of src0
+                            .{ .@"0:", ._ps, .add, .dst0x, .memi(.src0x, .tmp0), ._, ._ }, // loop head 0: accumulate the xword at src0 indexed by tmp0
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, // step the offset down by one xword (16 bytes)
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // back to 0: while the subtraction did not borrow
+                            .{ ._, .h_ps, .add, .dst0x, .dst0x, ._, ._ }, // horizontal add: pairwise sums
+                            .{ ._, .h_ps, .add, .dst0x, .dst0x, ._, ._ }, // second horizontal add: element 0 = total
+                        } },
+                    }, .{
+                        .required_features = .{ .sse, null, null, null }, // baseline SSE entry (no horizontal adds used). NOTE(review): presumably a float @reduce(.Add) lowering per the commit message; the enclosing arm is outside this view
+                        .dst_constraints = .{ .{ .float = .dword }, .any }, // result: one dword float
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any }, // NOTE(review): presumably a source spanning several xwords whose last xword may be partly padding — confirm against the constraint definition
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // the source is addressed through memory
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: first the mask address, later the loop byte offset
+                            .{ .type = .vector_16_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: 16-byte mask in memory. NOTE(review): presumably zeroes the lanes past the end of src0 — confirm pand_mask_mem semantics
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, // tmp2: SSE-class scratch for the final fold
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true }, // the sub in the loop writes EFLAGS
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of the mask
+                            .{ ._, ._ps, .mova, .dst0x, .lea(.tmp0x), ._, ._ }, // dst0x = the 16 mask bytes
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_size), ._, ._ }, // tmp0 is reused as the loop offset. NOTE(review): presumably (size of src0) - 32 — confirm .sia semantics
+                            .{ ._, ._ps, .@"and", .dst0x, .memad(.src0x, .add_size, -16), ._, ._ }, // masked load; presumably the last xword of src0
+                            .{ .@"0:", ._ps, .add, .dst0x, .memi(.src0x, .tmp0), ._, ._ }, // loop head 0: accumulate the xword at src0 indexed by tmp0
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, // step the offset down by one xword (16 bytes)
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, // back to 0: while the subtraction did not borrow
+                            .{ ._, ._ps, .xor, .tmp2x, .tmp2x, ._, ._ }, // zero tmp2x; the two-operand movhl below only writes its low qword
+                            .{ ._, ._ps, .movhl, .tmp2x, .dst0x, ._, ._ }, // tmp2x low qword = high qword of dst0x (elements 2,3)
+                            .{ ._, ._ps, .add, .dst0x, .tmp2x, ._, ._ }, // elements 0,1 now hold the 2 remaining partial sums
+                            .{ ._, ._ps, .mova, .tmp2x, .dst0x, ._, ._ }, // copy so the shuffle does not disturb dst0x
+                            .{ ._, ._ps, .shuf, .tmp2x, .tmp2x, .ui(0b01_01_01_01), ._ }, // every element of tmp2x = its former element 1
+                            .{ ._, ._ss, .add, .dst0x, .tmp2x, ._, ._ }, // scalar add: element 0 of dst0x = total
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null }, // AVX entry used when horizontal adds are flagged fast. NOTE(review): presumably a float @reduce(.Add) lowering per the commit message; the enclosing arm is outside this view
+                        .dst_constraints = .{ .{ .float = .qword }, .any }, // result: one qword float
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .xword, .is = .qword } }, .any, .any }, // source: xword of qword floats (two elements are summed below)
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } }, // the source is held in an SSE register
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, // NOTE(review): presumably dst0 may reuse src0's register — confirm mut_rc semantics
+                        .each = .{ .once = &.{
+                            .{ ._, .vh_pd, .add, .dst0x, .src0x, .src0x, ._ }, // single horizontal add: element 0 of dst0x = src0[0] + src0[1]
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null }, // plain AVX entry (no horizontal adds used). NOTE(review): presumably a float @reduce(.Add) lowering per the commit message; the enclosing arm is outside this view
+                        .dst_constraints = .{ .{ .float = .qword }, .any }, // result: one qword float
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .xword, .is = .qword } }, .any, .any }, // source: xword of qword floats (two elements are summed below)
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } }, // the source is held in an SSE register
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } }, // tmp0: SSE-class scratch holding the high element
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, // NOTE(review): presumably dst0 may reuse src0's register — confirm mut_rc semantics
+                        .each = .{ .once = &.{
+                            .{ ._, .v_ps, .movhl, .tmp0x, .src0x, .src0x, ._ }, // tmp0x low qword = high qword of src0x (element 1)
+                            .{ ._, .v_sd, .add, .dst0x, .src0x, .tmp0x, ._ }, // scalar add: element 0 of dst0x = src0[0] + src0[1]
+                        } },
+                    }, .{
+                        .required_features = .{ .sse3, .fast_hops, null, null }, // SSE3 entry used when horizontal adds are flagged fast. NOTE(review): presumably a float @reduce(.Add) lowering per the commit message; the enclosing arm is outside this view
+                        .dst_constraints = .{ .{ .float = .qword }, .any }, // result: one qword float
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .xword, .is = .qword } }, .any, .any }, // source: xword of qword floats (two elements are summed below)
+                        .patterns = &.{
+                            .{ .src = .{ .to_mut_sse, .none, .none } }, // NOTE(review): presumably an SSE register the entry is allowed to overwrite — confirm to_mut_sse semantics
+                        },
+                        .dst_temps = .{ .{ .ref = .src0 }, .unused }, // dst0 refers to src0, so the instruction below works in place
+                        .each = .{ .once = &.{
+                            .{ ._, .h_pd, .add, .dst0x, .src0x, ._, ._ }, // single horizontal add: element 0 = src0[0] + src0[1]
+                        } },
+                    }, .{
+                        .required_features = .{ .sse2, null, null, null }, // baseline SSE2 entry (no horizontal adds used). NOTE(review): presumably a float @reduce(.Add) lowering per the commit message; the enclosing arm is outside this view
+                        .dst_constraints = .{ .{ .float = .qword }, .any }, // result: one qword float
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .xword, .is = .qword } }, .any, .any }, // source: xword of qword floats (two elements are summed below)
+                        .patterns = &.{
+                            .{ .src = .{ .to_mut_sse, .none, .none } }, // NOTE(review): presumably an SSE register the entry is allowed to overwrite — confirm to_mut_sse semantics
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } }, // tmp0: SSE-class scratch holding the high element
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .ref = .src0 }, .unused }, // dst0 refers to src0, so the final add works in place
+                        .each = .{ .once = &.{
+                            .{ ._, ._ps, .xor, .tmp0x, .tmp0x, ._, ._ }, // zero tmp0x; the two-operand movhl below only writes its low qword
+                            .{ ._, ._ps, .movhl, .tmp0x, .src0x, ._, ._ }, // tmp0x low qword = high qword of src0x (element 1)
+                            .{ ._, ._sd, .add, .dst0x, .tmp0x, ._, ._ }, // scalar add: element 0 of dst0x += element 1
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null }, // AVX entry used when horizontal adds are flagged fast. NOTE(review): presumably a float @reduce(.Add) lowering per the commit message; the enclosing arm is outside this view
+                        .dst_constraints = .{ .{ .float = .qword }, .any }, // result: one qword float
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any }, // NOTE(review): presumably a vector that does not fill the whole yword; the code below adds elements 0, 1 and 2 only — confirm against the constraint definition
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } }, // the source is held in an SSE register
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } }, // tmp0: SSE-class scratch holding the low-half sum
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, // NOTE(review): presumably dst0 may reuse src0's register — confirm mut_rc semantics; src0 is not read after dst0 is first written
+                        .each = .{ .once = &.{
+                            .{ ._, .vh_pd, .add, .tmp0x, .src0x, .src0x, ._ }, // tmp0x element 0 = src0[0] + src0[1]
+                            .{ ._, .v_f128, .extract, .dst0x, .src0y, .ui(1), ._ }, // dst0x = upper 128 bits of src0y, so element 0 = src0[2]
+                            .{ ._, .v_sd, .add, .dst0x, .tmp0x, .dst0x, ._ }, // scalar add: element 0 = src0[0] + src0[1] + src0[2]; src0[3] is never added
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null }, // plain AVX entry (no horizontal adds used). NOTE(review): presumably a float @reduce(.Add) lowering per the commit message; the enclosing arm is outside this view
+                        .dst_constraints = .{ .{ .float = .qword }, .any }, // result: one qword float
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any }, // NOTE(review): presumably a vector that does not fill the whole yword; element 3 never reaches the scalar result below — confirm against the constraint definition
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } }, // the source is held in an SSE register
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } }, // tmp0: SSE-class scratch holding a partial sum
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, // NOTE(review): presumably dst0 may reuse src0's register — confirm mut_rc semantics; src0 is not read after dst0 is first written
+                        .each = .{ .once = &.{
+                            .{ ._, .v_f128, .extract, .tmp0x, .src0y, .ui(1), ._ }, // tmp0x = upper 128 bits of src0y (elements 2,3)
+                            .{ ._, .v_pd, .add, .tmp0x, .src0x, .tmp0x, ._ }, // tmp0x element 0 = src0[0] + src0[2]; element 1 is not used afterwards
+                            .{ ._, .v_ps, .movhl, .dst0x, .src0x, .src0x, ._ }, // dst0x low qword = high qword of src0x (src0[1])
+                            .{ ._, .v_sd, .add, .dst0x, .tmp0x, .dst0x, ._ }, // scalar add: element 0 = src0[0] + src0[2] + src0[1]
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null }, // AVX entry used when horizontal adds are flagged fast. NOTE(review): presumably a float @reduce(.Add) lowering per the commit message; the enclosing arm is outside this view
+                        .dst_constraints = .{ .{ .float = .qword }, .any }, // result: one qword float
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any }, // source: yword of qword floats (four elements are summed below)
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } }, // the source is held in an SSE register
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } }, // tmp0: SSE-class scratch holding the upper-half sum
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, // NOTE(review): presumably dst0 may reuse src0's register — confirm mut_rc semantics
+                        .each = .{ .once = &.{
+                            .{ ._, .vh_pd, .add, .dst0y, .src0y, .src0y, ._ }, // horizontal add within each 128-bit half: src0[0]+src0[1] and src0[2]+src0[3]
+                            .{ ._, .v_f128, .extract, .tmp0x, .dst0y, .ui(1), ._ }, // tmp0x = upper 128 bits of dst0y
+                            .{ ._, .v_sd, .add, .dst0x, .dst0x, .tmp0x, ._ }, // scalar add of the two half sums: element 0 = total
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null }, // plain AVX entry (no horizontal adds used). NOTE(review): presumably a float @reduce(.Add) lowering per the commit message; the enclosing arm is outside this view
+                        .dst_constraints = .{ .{ .float = .qword }, .any }, // result: one qword float
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any }, // source: yword of qword floats (four elements are summed below)
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } }, // the source is held in an SSE register
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } }, // tmp0: SSE-class scratch
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, // NOTE(review): presumably dst0 may reuse src0's register — confirm mut_rc semantics; src0 is not read after dst0 is first written
+                        .each = .{ .once = &.{
+                            .{ ._, .v_f128, .extract, .tmp0x, .src0y, .ui(1), ._ }, // tmp0x = upper 128 bits of src0y (elements 2,3)
+                            .{ ._, .v_pd, .add, .dst0x, .src0x, .tmp0x, ._ }, // dst0x = [src0[0]+src0[2], src0[1]+src0[3]]
+                            .{ ._, .v_ps, .movhl, .tmp0x, .dst0x, .dst0x, ._ }, // tmp0x low qword = high qword of dst0x
+                            .{ ._, .v_sd, .add, .dst0x, .dst0x, .tmp0x, ._ }, // scalar add: element 0 = total
+                        } },
+                    }, .{
+                        .required_features = .{ .avx512f, null, null, null }, // AVX-512F entry handling the source as two masked ywords. NOTE(review): presumably a float @reduce(.Add) lowering per the commit message; the enclosing arm is outside this view
+                        .dst_constraints = .{ .{ .float = .qword }, .any }, // result: one qword float
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .zword, .is = .qword } }, .any, .any }, // NOTE(review): presumably a vector that does not fill the whole zword, which is why a mask is applied below — confirm against the constraint definition
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } }, // the source is addressed through memory
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp0: address of the mask
+                            .{ .type = .vector_64_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, // tmp1: 64-byte mask in memory. NOTE(review): presumably zeroes the lanes past the end of src0 — confirm pand_mask_mem semantics
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, // tmp2: SSE-class scratch (declared with an f32 element type although the data here is f64)
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, // tmp0 = address of the mask
+                            .{ ._, .v_pd, .mova, .dst0y, .mem(.src0y), ._, ._ }, // dst0y = first yword of src0 (no displacement)
+                            .{ ._, .v_pd, .mova, .tmp2y, .memd(.src0y, 32), ._, ._ }, // tmp2y = yword at displacement 32
+                            .{ ._, .v_pd, .@"and", .dst0y, .dst0y, .lea(.tmp0y), ._ }, // apply the first 32 mask bytes
+                            .{ ._, .v_pd, .@"and", .tmp2y, .tmp2y, .lead(.tmp0y, 32), ._ }, // apply the mask bytes at displacement 32
+                            .{ ._, .v_pd, .add, .dst0y, .dst0y, .tmp2y, ._ }, // lane-wise add of the two masked ywords
+                            .{ ._, .v_i128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, // tmp2x = upper 128 bits of dst0y
+                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp2x, ._ }, // fold upper half into lower half: 2 partial sums remain
+                            .{ ._, .v_ps, .movhl, .tmp2x, .dst0x, .dst0x, ._ }, // tmp2x low qword = high qword of dst0x
+                            .{ ._, .v_sd, .add, .dst0x, .dst0x, .tmp2x, ._ }, // scalar add: element 0 of dst0x = total
+                        } },
+                    }, .{
+                        .required_features = .{ .avx512f, null, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .zword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_pd, .mova, .dst0y, .mem(.src0y), ._, ._ },
+                            .{ ._, .v_pd, .mova, .tmp0y, .memd(.src0y, 32), ._, ._ },
+                            .{ ._, .v_pd, .add, .dst0y, .dst0y, .tmp0y, ._ },
+                            .{ ._, .v_i128, .extract, .tmp0x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_ps, .movhl, .tmp0x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_sd, .add, .dst0x, .dst0x, .tmp0x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, .v_pd, .mova, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
+                            .{ .@"0:", .v_pd, .add, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(32), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .vh_pd, .add, .dst0y, .dst0y, .dst0y, ._ },
+                            .{ ._, .v_f128, .extract, .tmp1x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_sd, .add, .dst0x, .dst0x, .tmp1x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, .v_pd, .mova, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
+                            .{ .@"0:", .v_pd, .add, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(32), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .v_f128, .extract, .tmp1x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp1x, ._ },
+                            .{ ._, .v_ps, .movhl, .tmp1x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp1x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse3, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .xword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, ._pd, .mova, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
+                            .{ .@"0:", ._pd, .add, .dst0x, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .h_pd, .add, .dst0x, .dst0x, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse2, null, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .xword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, ._pd, .mova, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
+                            .{ .@"0:", ._pd, .add, .dst0x, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, ._ps, .xor, .tmp1x, .tmp1x, ._, ._ },
+                            .{ ._, ._ps, .movhl, .tmp1x, .dst0x, ._, ._ },
+                            .{ ._, ._sd, .add, .dst0x, .tmp1x, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx512f, null, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .zword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_64_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_4_f64, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_pd, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
+                            .{ ._, .v_pd, .mova, .tmp2y, .lead(.tmp0y, 32), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-128, .src0, .add_size), ._, ._ },
+                            .{ ._, .v_pd, .@"and", .tmp2y, .tmp2y, .memad(.src0y, .add_size, -32), ._ },
+                            .{ ._, .v_pd, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -64), ._ },
+                            .{ .@"0:", .v_pd, .add, .tmp2y, .tmp2y, .memid(.src0y, .tmp0, 32), ._ },
+                            .{ ._, .v_pd, .add, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(64), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .v_pd, .add, .dst0y, .dst0y, .tmp2y, ._ },
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                            .{ ._, .v_ps, .movhl, .tmp2x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_sd, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_pd, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-48, .src0, .add_size), ._, ._ },
+                            .{ ._, .v_pd, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -32), ._ },
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                            .{ .@"0:", .v_pd, .add, .dst0x, .dst0x, .memi(.src0x, .tmp0), ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .vh_pd, .add, .dst0x, .dst0x, .dst0x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_pd, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-48, .src0, .add_size), ._, ._ },
+                            .{ ._, .v_pd, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -32), ._ },
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                            .{ .@"0:", .v_pd, .add, .dst0x, .dst0x, .memi(.src0x, .tmp0), ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .v_ps, .movhl, .tmp2x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse3, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .xword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_16_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, ._pd, .mova, .dst0x, .lea(.tmp0x), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_size), ._, ._ },
+                            .{ ._, ._pd, .@"and", .dst0x, .memad(.src0x, .add_size, -16), ._, ._ },
+                            .{ .@"0:", ._pd, .add, .dst0x, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .h_pd, .add, .dst0x, .dst0x, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse2, null, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .xword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_16_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, ._pd, .mova, .dst0x, .lea(.tmp0x), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_size), ._, ._ },
+                            .{ ._, ._pd, .@"and", .dst0x, .memad(.src0x, .add_size, -16), ._, ._ },
+                            .{ .@"0:", ._pd, .add, .dst0x, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, ._ps, .xor, .tmp2x, .tmp2x, ._, ._ },
+                            .{ ._, ._ps, .movhl, .tmp2x, .dst0x, ._, ._ },
+                            .{ ._, ._pd, .add, .dst0x, .tmp2x, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .x87, null, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .qword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .f64, .kind = .{ .reg = .st7 } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .mem, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0p, .sia(-16, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, .f_, .ld, .memad(.src0q, .add_unaligned_size, -8), ._, ._, ._ },
+                            .{ .@"0:", .f_, .add, .memi(.src0q, .tmp0), ._, ._, ._ },
+                            .{ ._, ._, .sub, .tmp0p, .si(8), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .f_p, .st, .dst0q, ._, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .x87, null, null, null },
+                        .dst_constraints = .{ .{ .float = .tbyte }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .qword, .is = .tbyte } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .f80, .kind = .{ .reg = .st6 } },
+                            .{ .type = .f80, .kind = .{ .reg = .st7 } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .mem, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0p, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, .f_, .ld, .memad(.src0t, .add_unaligned_size, -16), ._, ._, ._ },
+                            .{ .@"0:", .f_, .ld, .memi(.src0t, .tmp0), ._, ._, ._ },
+                            .{ ._, .f_p, .add, ._, ._, ._, ._ },
+                            .{ ._, ._, .sub, .tmp0p, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .f_p, .st, .dst0t, ._, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .xword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .call_frame = .{ .alignment = .@"16" },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .f128, .kind = .{ .reg = .xmm1 } },
+                            .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__addtf3" } } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .reg = .xmm0 }, .unused },
+                        .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, .v_dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
+                            .{ .@"0:", .v_dqa, .mov, .tmp1x, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, ._, .call, .tmp2d, ._, ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse2, null, null, null },
+                        .dst_constraints = .{ .{ .float = .xword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .call_frame = .{ .alignment = .@"16" },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .f128, .kind = .{ .reg = .xmm1 } },
+                            .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__addtf3" } } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .reg = .xmm0 }, .unused },
+                        .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, ._dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
+                            .{ .@"0:", ._dqa, .mov, .tmp1x, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, ._, .call, .tmp2d, ._, ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse, null, null, null },
+                        .dst_constraints = .{ .{ .float = .xword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .call_frame = .{ .alignment = .@"16" },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .f128, .kind = .{ .reg = .xmm1 } },
+                            .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__addtf3" } } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .reg = .xmm0 }, .unused },
+                        .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, ._ps, .mova, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
+                            .{ .@"0:", ._ps, .mova, .tmp1x, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, ._, .call, .tmp2d, ._, ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                        } },
+                    } },
+                    .Mul => unreachable,
+                }) catch |err| switch (err) {
+                    error.SelectFailed => return cg.fail("failed to select {s} {} {}", .{
+                        @tagName(air_tag),
+                        cg.typeOf(reduce.operand).fmt(pt),
+                        ops[0].tracking(cg),
+                    }),
+                    else => |e| return e,
+                };
+                try res[0].finish(inst, &.{reduce.operand}, &ops, cg);
+            },
             .splat => |air_tag| if (use_old) try cg.airSplat(inst) else fallback: {
                 const ty_op = air_datas[@intFromEnum(inst)].ty_op;
                 if (cg.typeOf(ty_op.operand).toIntern() == .bool_type) break :fallback try cg.airSplat(inst);
@@ -137624,7 +140442,7 @@ fn genByteSwap(
             return src_mcv;
         },
         3...8 => if (src_mcv.isRegister() and self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) {
-            try self.genUnOpMir(.{ ._, .bswap }, src_ty, src_mcv);
+            try self.genUnOpMir(.{ .b_, .swap }, src_ty, src_mcv);
             return src_mcv;
         },
         9...16 => {
@@ -137635,7 +140453,7 @@ fn genByteSwap(
                     break :mat_src_mcv .{ .load_frame = .{ .index = frame_index } };
                 },
                 .register_pair => |src_regs| if (self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) {
-                    for (src_regs) |src_reg| try self.asmRegister(.{ ._, .bswap }, src_reg.to64());
+                    for (src_regs) |src_reg| try self.asmRegister(.{ .b_, .swap }, src_reg.to64());
                     return .{ .register_pair = .{ src_regs[1], src_regs[0] } };
                 } else src_mcv,
                 else => src_mcv,
@@ -137649,18 +140467,18 @@ fn genByteSwap(
             for (dst_regs, 0..) |dst_reg, limb_index| {
                 if (mat_src_mcv.isBase()) {
                     try self.asmRegisterMemory(
-                        .{ ._, if (has_movbe) .movbe else .mov },
+                        .{ if (has_movbe) ._be else ._, .mov },
                         dst_reg.to64(),
                         try mat_src_mcv.address().offset(@intCast(limb_index * 8)).deref().mem(self, .{ .size = .qword }),
                     );
-                    if (!has_movbe) try self.asmRegister(.{ ._, .bswap }, dst_reg.to64());
+                    if (!has_movbe) try self.asmRegister(.{ .b_, .swap }, dst_reg.to64());
                 } else {
                     try self.asmRegisterRegister(
                         .{ ._, .mov },
                         dst_reg.to64(),
                         mat_src_mcv.register_pair[limb_index].to64(),
                     );
-                    try self.asmRegister(.{ ._, .bswap }, dst_reg.to64());
+                    try self.asmRegister(.{ .b_, .swap }, dst_reg.to64());
                 }
             }
             return .{ .register_pair = .{ dst_regs[1], dst_regs[0] } };
@@ -137679,7 +140497,7 @@ fn genByteSwap(
 
             const loop: Mir.Inst.Index = @intCast(self.mir_instructions.len);
             try self.asmRegisterMemory(
-                .{ ._, if (has_movbe) .movbe else .mov },
+                .{ if (has_movbe) ._be else ._, .mov },
                 temp_regs[2].to64(),
                 .{
                     .base = .{ .frame = dst_mcv.load_frame.index },
@@ -137692,7 +140510,7 @@ fn genByteSwap(
                 },
             );
             try self.asmRegisterMemory(
-                .{ ._, if (has_movbe) .movbe else .mov },
+                .{ if (has_movbe) ._be else ._, .mov },
                 temp_regs[3].to64(),
                 .{
                     .base = .{ .frame = dst_mcv.load_frame.index },
@@ -137705,8 +140523,8 @@ fn genByteSwap(
                 },
             );
             if (!has_movbe) {
-                try self.asmRegister(.{ ._, .bswap }, temp_regs[2].to64());
-                try self.asmRegister(.{ ._, .bswap }, temp_regs[3].to64());
+                try self.asmRegister(.{ .b_, .swap }, temp_regs[2].to64());
+                try self.asmRegister(.{ .b_, .swap }, temp_regs[3].to64());
             }
             try self.asmMemoryRegister(.{ ._, .mov }, .{
                 .base = .{ .frame = dst_mcv.load_frame.index },
@@ -137751,9 +140569,9 @@ fn genByteSwap(
         switch (abi_size) {
             else => unreachable,
             2 => try self.genBinOpMir(.{ ._l, .ro }, src_ty, dst_mcv, .{ .immediate = 8 }),
-            3...8 => try self.genUnOpMir(.{ ._, .bswap }, src_ty, dst_mcv),
+            3...8 => try self.genUnOpMir(.{ .b_, .swap }, src_ty, dst_mcv),
         }
-    } else try self.genBinOpMir(.{ ._, .movbe }, src_ty, dst_mcv, src_mcv);
+    } else try self.genBinOpMir(.{ ._be, .mov }, src_ty, dst_mcv, src_mcv);
     return dst_mcv;
 }
 
@@ -145626,16 +148444,16 @@ fn airAsm(self: *CodeGen, inst: Air.Inst.Index) !void {
             .{ ._, .pseudo }
         else for (std.enums.values(Mir.Inst.Fixes)) |fixes| {
             const fixes_name = @tagName(fixes);
-            const space_i = std.mem.indexOfScalar(u8, fixes_name, ' ');
-            const fixes_prefix = if (space_i) |i|
-                std.meta.stringToEnum(encoder.Instruction.Prefix, fixes_name[0..i]).?
+            const space_index = std.mem.indexOfScalar(u8, fixes_name, ' ');
+            const fixes_prefix = if (space_index) |index|
+                std.meta.stringToEnum(encoder.Instruction.Prefix, fixes_name[0..index]).?
             else
                 .none;
             if (fixes_prefix != prefix) continue;
-            const pattern = fixes_name[if (space_i) |i| i + " ".len else 0..];
-            const wildcard_i = std.mem.indexOfScalar(u8, pattern, '_').?;
-            const mnem_prefix = pattern[0..wildcard_i];
-            const mnem_suffix = pattern[wildcard_i + "_".len ..];
+            const pattern = fixes_name[if (space_index) |index| index + " ".len else 0..];
+            const wildcard_index = std.mem.indexOfScalar(u8, pattern, '_').?;
+            const mnem_prefix = pattern[0..wildcard_index];
+            const mnem_suffix = pattern[wildcard_index + "_".len ..];
             if (!std.mem.startsWith(u8, mnem_name, mnem_prefix)) continue;
             if (!std.mem.endsWith(u8, mnem_name, mnem_suffix)) continue;
             break .{ fixes, std.meta.stringToEnum(
@@ -157348,7 +160166,6 @@ const Select = struct {
         bool,
         bool_vec: Memory.Size,
         exact_bool_vec: u16,
-        vec_len: u32,
         ptr_any_bool_vec,
         ptr_bool_vec: Memory.Size,
         remainder_bool_vec: OfIsSizes,
@@ -157390,8 +160207,11 @@ const Select = struct {
         float: Memory.Size,
         scalar_any_float: Memory.Size,
         scalar_float: OfIsSizes,
+        exclusive_scalar_float: OfIsSizes,
+        exact_scalar_float: OfIsSizes,
         multiple_scalar_any_float: Memory.Size,
         multiple_scalar_float: OfIsSizes,
+        unaligned_multiple_scalar_float: OfIsSizes,
         exact_int: u16,
         exact_signed_int: u16,
         exact_unsigned_int: u16,
@@ -157435,7 +160255,6 @@ const Select = struct {
                     size.bitSize(cg.target) >= ty.vectorLen(zcu),
                 .exact_bool_vec => |size| ty.isVector(zcu) and ty.scalarType(zcu).toIntern() == .bool_type and
                     size == ty.vectorLen(zcu),
-                .vec_len => |len| ty.isVector(zcu) and ty.vectorLen(zcu) == len,
                 .ptr_any_bool_vec => switch (zcu.intern_pool.indexToKey(ty.childType(zcu).toIntern())) {
                     .vector_type => |vector_type| vector_type.child == .bool_type,
                     else => false,
@@ -157559,10 +160378,16 @@ const Select = struct {
                     cg.floatBits(ty.scalarType(zcu)) != null,
                 .scalar_float => |of_is| @divExact(of_is.of.bitSize(cg.target), 8) >= cg.unalignedSize(ty) and
                     if (cg.floatBits(ty.scalarType(zcu))) |float_bits| of_is.is.bitSize(cg.target) == float_bits else false,
+                .exclusive_scalar_float => |of_is| @divExact(of_is.of.bitSize(cg.target), 8) > cg.unalignedSize(ty) and
+                    if (cg.floatBits(ty.scalarType(zcu))) |float_bits| of_is.is.bitSize(cg.target) == float_bits else false,
+                .exact_scalar_float => |of_is| @divExact(of_is.of.bitSize(cg.target), 8) == cg.unalignedSize(ty) and
+                    if (cg.floatBits(ty.scalarType(zcu))) |float_bits| of_is.is.bitSize(cg.target) == float_bits else false,
                 .multiple_scalar_any_float => |size| ty.abiSize(zcu) % @divExact(size.bitSize(cg.target), 8) == 0 and
                     cg.floatBits(ty.scalarType(zcu)) != null,
                 .multiple_scalar_float => |of_is| ty.abiSize(zcu) % @divExact(of_is.of.bitSize(cg.target), 8) == 0 and
                     if (cg.floatBits(ty.scalarType(zcu))) |float_bits| of_is.is.bitSize(cg.target) == float_bits else false,
+                .unaligned_multiple_scalar_float => |of_is| cg.unalignedSize(ty) % @divExact(of_is.of.bitSize(cg.target), 8) == 0 and
+                    if (cg.floatBits(ty.scalarType(zcu))) |float_bits| of_is.is.bitSize(cg.target) == float_bits else false,
                 .exact_int => |bit_size| if (cg.intInfo(ty)) |int_info| bit_size == int_info.bits else false,
                 .exact_signed_int => |bit_size| if (cg.intInfo(ty)) |int_info| switch (int_info.signedness) {
                     .signed => bit_size == int_info.bits,
src/arch/x86_64/Encoding.zig
@@ -313,7 +313,7 @@ pub const Mnemonic = enum {
     @"or", out, outs, outsb, outsd, outsw,
     pause, pop, popf, popfd, popfq, push, pushfq,
     rcl, rcr,
-    rdfsbase, rdgsbase, rdmsr, rdpid, rdpkru, rdpmc, rdrand, rdseed, rdssd, rdssq, rdtsc, rdtscp,
+    rdfsbase, rdgsbase, rdmsr, rdpid, rdpkru, rdpmc, rdrand, rdseed, rdsspd, rdsspq, rdtsc, rdtscp,
     ret, rol, ror, rsm,
     sahf, sal, sar, sbb,
     scas, scasb, scasd, scasq, scasw,
@@ -436,6 +436,7 @@ pub const Mnemonic = enum {
     pblendvb, pblendw,
     pcmpeqq,
     pextrb, pextrd, pextrq,
+    phminposuw,
     pinsrb, pinsrd, pinsrq,
     pmaxsb, pmaxsd, pmaxud, pmaxuw, pminsb, pminsd, pminud, pminuw,
     pmovsxbd, pmovsxbq, pmovsxbw, pmovsxdq, pmovsxwd, pmovsxwq,
@@ -494,19 +495,19 @@ pub const Mnemonic = enum {
     vpblendvb, vpblendw, vpclmulqdq,
     vpcmpeqb, vpcmpeqd, vpcmpeqq, vpcmpeqw,
     vpcmpgtb, vpcmpgtd, vpcmpgtq, vpcmpgtw,
-    vphaddw, vphaddsw, vphaddd, vphsubw, vphsubsw, vphsubd,
     vperm2f128, vpermilpd, vpermilps,
     vpextrb, vpextrd, vpextrq, vpextrw,
+    vphaddw, vphaddsw, vphaddd, vphminposuw, vphsubw, vphsubsw, vphsubd,
     vpinsrb, vpinsrd, vpinsrq, vpinsrw,
+    vpmaddubsw, vpmaddwd,
     vpmaxsb, vpmaxsd, vpmaxsw, vpmaxub, vpmaxud, vpmaxuw,
     vpminsb, vpminsd, vpminsw, vpminub, vpminud, vpminuw,
-    vpmaddubsw,
     vpmovmskb,
     vpmovsxbd, vpmovsxbq, vpmovsxbw, vpmovsxdq, vpmovsxwd, vpmovsxwq,
     vpmovzxbd, vpmovzxbq, vpmovzxbw, vpmovzxdq, vpmovzxwd, vpmovzxwq,
-    vpmuldq, vpmulhrsw, vpmulhw, vpmulld, vpmullw, vpmuludq,
+    vpmuldq, vpmulhrsw, vpmulhuw, vpmulhw, vpmulld, vpmullw, vpmuludq,
     vpor,
-    vpshufb, vpshufd, vpshufhw, vpshuflw,
+    vpsadbw, vpshufb, vpshufd, vpshufhw, vpshuflw,
     vpsignb, vpsignd, vpsignw,
     vpslld, vpslldq, vpsllq, vpsllw,
     vpsrad, vpsraq, vpsraw,
@@ -1029,7 +1030,7 @@ fn estimateInstructionLength(prefix: Prefix, encoding: Encoding, ops: []const Op
 }
 
 const mnemonic_to_encodings_map = init: {
-    @setEvalBranchQuota(5_800);
+    @setEvalBranchQuota(5_900);
     const ModrmExt = u3;
     const Entry = struct { Mnemonic, OpEn, []const Op, []const u8, ModrmExt, Mode, Feature };
     const encodings: []const Entry = @import("encodings.zon");
@@ -1038,17 +1039,17 @@ const mnemonic_to_encodings_map = init: {
     var mnemonic_map: [mnemonic_count][]Data = @splat(&.{});
     for (encodings) |entry| mnemonic_map[@intFromEnum(entry[0])].len += 1;
     var data_storage: [encodings.len]Data = undefined;
-    var storage_i: usize = 0;
+    var storage_index: usize = 0;
     for (&mnemonic_map) |*value| {
-        value.ptr = data_storage[storage_i..].ptr;
-        storage_i += value.len;
+        value.ptr = data_storage[storage_index..].ptr;
+        storage_index += value.len;
     }
-    var mnemonic_i: [mnemonic_count]usize = @splat(0);
+    var mnemonic_index: [mnemonic_count]usize = @splat(0);
     const ops_len = @typeInfo(@FieldType(Data, "ops")).array.len;
     const opc_len = @typeInfo(@FieldType(Data, "opc")).array.len;
     for (encodings) |entry| {
-        const i = &mnemonic_i[@intFromEnum(entry[0])];
-        mnemonic_map[@intFromEnum(entry[0])][i.*] = .{
+        const index = &mnemonic_index[@intFromEnum(entry[0])];
+        mnemonic_map[@intFromEnum(entry[0])][index.*] = .{
             .op_en = entry[1],
             .ops = (entry[2] ++ .{.none} ** (ops_len - entry[2].len)).*,
             .opc_len = entry[3].len,
@@ -1057,14 +1058,14 @@ const mnemonic_to_encodings_map = init: {
             .mode = entry[5],
             .feature = entry[6],
         };
-        i.* += 1;
+        index.* += 1;
     }
     const final_storage = data_storage;
     var final_map: [mnemonic_count][]const Data = @splat(&.{});
-    storage_i = 0;
+    storage_index = 0;
     for (&final_map, mnemonic_map) |*final_value, value| {
-        final_value.* = final_storage[storage_i..][0..value.len];
-        storage_i += value.len;
+        final_value.* = final_storage[storage_index..][0..value.len];
+        storage_index += value.len;
     }
     break :init final_map;
 };
src/arch/x86_64/encodings.zon
@@ -684,8 +684,8 @@
     .{ .rdseed, .m, .{ .r32 }, .{ 0x0f, 0xc7 }, 7, .none,  .rdseed },
     .{ .rdseed, .m, .{ .r64 }, .{ 0x0f, 0xc7 }, 7, .long,  .rdseed },
 
-    .{ .rdssd, .m, .{ .r32 }, .{ 0xf3, 0x0f, 0x1e }, 1, .none, .shstk },
-    .{ .rdssq, .m, .{ .r64 }, .{ 0xf3, 0x0f, 0x1e }, 1, .long, .shstk },
+    .{ .rdsspd, .m, .{ .r32 }, .{ 0xf3, 0x0f, 0x1e }, 1, .none, .shstk },
+    .{ .rdsspq, .m, .{ .r64 }, .{ 0xf3, 0x0f, 0x1e }, 1, .long, .shstk },
 
     .{ .rdtsc, .z, .{}, .{ 0x0f, 0x31 }, 0, .none, .none },
 
@@ -1524,6 +1524,8 @@
 
     .{ .pinsrw, .rmi, .{ .xmm, .r32_m16, .imm8 }, .{ 0x66, 0x0f, 0xc4 }, 0, .none, .sse2 },
 
+    .{ .pmaddwd, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf5 }, 0, .none, .sse2 },
+
     .{ .pmaxsw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xee }, 0, .none, .sse2 },
 
     .{ .pmaxub, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xde }, 0, .none, .sse2 },
@@ -1532,6 +1534,8 @@
 
     .{ .pminub, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xda }, 0, .none, .sse2 },
 
+    .{ .pmulhuw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xe4 }, 0, .none, .sse2 },
+
     .{ .pmulhw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xe5 }, 0, .none, .sse2 },
 
     .{ .pmullw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xd5 }, 0, .none, .sse2 },
@@ -1540,6 +1544,8 @@
 
     .{ .por, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xeb }, 0, .none, .sse2 },
 
+    .{ .psadbw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf6 }, 0, .none, .sse2 },
+
     .{ .pshufd, .rmi, .{ .xmm, .xmm_m128, .imm8 }, .{ 0x66, 0x0f, 0x70 }, 0, .none, .sse2 },
 
     .{ .pshufhw, .rmi, .{ .xmm, .xmm_m128, .imm8 }, .{ 0xf3, 0x0f, 0x70 }, 0, .none, .sse2 },
@@ -1642,8 +1648,26 @@
 
     .{ .palignr, .rmi, .{ .xmm, .xmm_m128, .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x0f }, 0, .none, .ssse3 },
 
+    .{ .phaddw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x01 }, 0, .none, .ssse3 },
+    .{ .phaddd, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x02 }, 0, .none, .ssse3 },
+
+    .{ .phaddsw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x03 }, 0, .none, .ssse3 },
+
+    .{ .phsubw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x05 }, 0, .none, .ssse3 },
+    .{ .phsubd, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x06 }, 0, .none, .ssse3 },
+
+    .{ .phsubsw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x07 }, 0, .none, .ssse3 },
+
+    .{ .pmaddubsw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x04 }, 0, .none, .ssse3 },
+
+    .{ .pmulhrsw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x0b }, 0, .none, .ssse3 },
+
     .{ .pshufb, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x00 }, 0, .none, .ssse3 },
 
+    .{ .psignb, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x08 }, 0, .none, .ssse3 },
+    .{ .psignw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x09 }, 0, .none, .ssse3 },
+    .{ .psignd, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x0a }, 0, .none, .ssse3 },
+
     // SSE4.1
     .{ .blendpd, .rmi, .{ .xmm, .xmm_m128, .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x0d }, 0, .none, .sse4_1 },
 
@@ -1678,6 +1702,8 @@
 
     .{ .pextrw, .mri, .{ .r32_m16, .xmm, .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .none, .sse4_1 },
 
+    .{ .phminposuw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x41 }, 0, .none, .sse4_1 },
+
     .{ .pinsrb, .rmi, .{ .xmm, .r32_m8, .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x20 }, 0, .none, .sse4_1 },
     .{ .pinsrd, .rmi, .{ .xmm, .rm32,   .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .none, .sse4_1 },
     .{ .pinsrq, .rmi, .{ .xmm, .rm64,   .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .long, .sse4_1 },
@@ -2129,12 +2155,28 @@
     .{ .vpextrw, .rmi, .{ .r32,     .xmm, .imm8 }, .{ 0x66, 0x0f,       0xc5 }, 0, .vex_128_w0, .avx },
     .{ .vpextrw, .mri, .{ .r32_m16, .xmm, .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .vex_128_w0, .avx },
 
+    .{ .vphaddw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x01 }, 0, .vex_128_wig, .avx },
+    .{ .vphaddd, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x02 }, 0, .vex_128_wig, .avx },
+
+    .{ .vphaddsw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x03 }, 0, .vex_128_wig, .avx },
+
+    .{ .vphminposuw, .rm, .{ .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x41 }, 0, .vex_128_wig, .avx },
+
+    .{ .vphsubw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x05 }, 0, .vex_128_wig, .avx },
+    .{ .vphsubd, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x06 }, 0, .vex_128_wig, .avx },
+
+    .{ .vphsubsw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x07 }, 0, .vex_128_wig, .avx },
+
     .{ .vpinsrb, .rvmi, .{ .xmm, .xmm, .r32_m8, .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x20 }, 0, .vex_128_w0, .avx },
     .{ .vpinsrd, .rvmi, .{ .xmm, .xmm, .rm32,   .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .vex_128_w0, .avx },
     .{ .vpinsrq, .rvmi, .{ .xmm, .xmm, .rm64,   .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .vex_128_w1, .avx },
 
     .{ .vpinsrw, .rvmi, .{ .xmm, .xmm, .r32_m16, .imm8 }, .{ 0x66, 0x0f, 0xc4 }, 0, .vex_128_w0, .avx },
 
+    .{ .vpmaddubsw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x04 }, 0, .vex_128_wig, .avx },
+
+    .{ .vpmaddwd, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf5 }, 0, .vex_128_wig, .avx },
+
     .{ .vpmaxsb, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x3c }, 0, .vex_128_wig, .avx },
     .{ .vpmaxsw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f,       0xee }, 0, .vex_128_wig, .avx },
     .{ .vpmaxsd, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x3d }, 0, .vex_128_wig, .avx },
@@ -2172,6 +2214,10 @@
 
     .{ .vpmuldq, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x28 }, 0, .vex_128_wig, .avx },
 
+    .{ .vpmulhrsw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x0b }, 0, .vex_128_wig, .avx },
+
+    .{ .vpmulhuw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xe4 }, 0, .vex_128_wig, .avx },
+
     .{ .vpmulhw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xe5 }, 0, .vex_128_wig, .avx },
 
     .{ .vpmulld, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x40 }, 0, .vex_128_wig, .avx },
@@ -2182,6 +2228,8 @@
 
     .{ .vpor, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xeb }, 0, .vex_128_wig, .avx },
 
+    .{ .vpsadbw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf6 }, 0, .vex_128_wig, .avx },
+
     .{ .vpshufb, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x00 }, 0, .vex_128_wig, .avx },
 
     .{ .vpshufd, .rmi, .{ .xmm, .xmm_m128, .imm8 }, .{ 0x66, 0x0f, 0x70 }, 0, .vex_128_wig, .avx },
@@ -2190,6 +2238,10 @@
 
     .{ .vpshuflw, .rmi, .{ .xmm, .xmm_m128, .imm8 }, .{ 0xf2, 0x0f, 0x70 }, 0, .vex_128_wig, .avx },
 
+    .{ .vpsignb, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x08 }, 0, .vex_128_wig, .avx },
+    .{ .vpsignw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x09 }, 0, .vex_128_wig, .avx },
+    .{ .vpsignd, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0x38, 0x0a }, 0, .vex_128_wig, .avx },
+
     .{ .vpsllw, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf1 }, 0, .vex_128_wig, .avx },
     .{ .vpsllw, .vmi, .{ .xmm, .xmm, .imm8     }, .{ 0x66, 0x0f, 0x71 }, 6, .vex_128_wig, .avx },
     .{ .vpslld, .rvm, .{ .xmm, .xmm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf2 }, 0, .vex_128_wig, .avx },
@@ -2447,6 +2499,16 @@
 
     .{ .vpcmpgtq, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x37 }, 0, .vex_256_wig, .avx2 },
 
+    .{ .vphaddw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x01 }, 0, .vex_256_wig, .avx2 },
+    .{ .vphaddd, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x02 }, 0, .vex_256_wig, .avx2 },
+
+    .{ .vphaddsw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x03 }, 0, .vex_256_wig, .avx2 },
+
+    .{ .vphsubw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x05 }, 0, .vex_256_wig, .avx2 },
+    .{ .vphsubd, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x06 }, 0, .vex_256_wig, .avx2 },
+
+    .{ .vphsubsw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x07 }, 0, .vex_256_wig, .avx2 },
+
     .{ .vperm2i128, .rvmi, .{ .ymm, .ymm, .ymm_m256, .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x46 }, 0, .vex_256_w0, .avx2 },
 
     .{ .vpermd, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x36 }, 0, .vex_256_w0, .avx2 },
@@ -2457,6 +2519,10 @@
 
     .{ .vpermq, .rmi, .{ .ymm, .ymm_m256, .imm8 }, .{ 0x66, 0x0f, 0x3a, 0x00 }, 0, .vex_256_w1, .avx2 },
 
+    .{ .vpmaddubsw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x04 }, 0, .vex_256_wig, .avx2 },
+
+    .{ .vpmaddwd, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0xf5 }, 0, .vex_256_wig, .avx2 },
+
     .{ .vpmaskmovd, .rvm, .{ .xmm,  .xmm, .m128 }, .{ 0x66, 0x0f, 0x38, 0x8c }, 0, .vex_128_w0, .avx2 },
     .{ .vpmaskmovd, .rvm, .{ .ymm,  .ymm, .m256 }, .{ 0x66, 0x0f, 0x38, 0x8c }, 0, .vex_256_w0, .avx2 },
     .{ .vpmaskmovq, .rvm, .{ .xmm,  .xmm, .m128 }, .{ 0x66, 0x0f, 0x38, 0x8c }, 0, .vex_128_w1, .avx2 },
@@ -2503,6 +2569,10 @@
 
     .{ .vpmuldq, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x28 }, 0, .vex_256_wig, .avx2 },
 
+    .{ .vpmulhrsw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x0b }, 0, .vex_256_wig, .avx2 },
+
+    .{ .vpmulhuw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0xe4 }, 0, .vex_256_wig, .avx2 },
+
     .{ .vpmulhw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0xe5 }, 0, .vex_256_wig, .avx2 },
 
     .{ .vpmulld, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x40 }, 0, .vex_256_wig, .avx2 },
@@ -2513,6 +2583,8 @@
 
     .{ .vpor, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0xeb }, 0, .vex_256_wig, .avx2 },
 
+    .{ .vpsadbw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0xf6 }, 0, .vex_256_wig, .avx2 },
+
     .{ .vpshufb, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x00 }, 0, .vex_256_wig, .avx2 },
     .{ .vpshufd, .rmi, .{ .ymm, .ymm_m256, .imm8 }, .{ 0x66, 0x0f, 0x70 }, 0, .vex_256_wig, .avx2 },
 
@@ -2520,6 +2592,10 @@
 
     .{ .vpshuflw, .rmi, .{ .ymm, .ymm_m256, .imm8 }, .{ 0xf2, 0x0f, 0x70 }, 0, .vex_256_wig, .avx2 },
 
+    .{ .vpsignb, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x08 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpsignw, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x09 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpsignd, .rvm, .{ .ymm, .ymm, .ymm_m256 }, .{ 0x66, 0x0f, 0x38, 0x0a }, 0, .vex_256_wig, .avx2 },
+
     .{ .vpsllw, .rvm, .{ .ymm, .ymm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf1 }, 0, .vex_256_wig, .avx2 },
     .{ .vpsllw, .vmi, .{ .ymm, .ymm, .imm8     }, .{ 0x66, 0x0f, 0x71 }, 6, .vex_256_wig, .avx2 },
     .{ .vpslld, .rvm, .{ .ymm, .ymm, .xmm_m128 }, .{ 0x66, 0x0f, 0xf2 }, 0, .vex_256_wig, .avx2 },
src/arch/x86_64/Lower.zig
@@ -567,7 +567,7 @@ fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand)
 }
 
 fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
-    @setEvalBranchQuota(2_600);
+    @setEvalBranchQuota(2_800);
     const fixes = switch (inst.ops) {
         .none => inst.data.none.fixes,
         .inst => inst.data.inst.fixes,
@@ -601,9 +601,9 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
         var buf: [max_len]u8 = undefined;
 
         const fixes_name = @tagName(fixes);
-        const pattern = fixes_name[if (std.mem.indexOfScalar(u8, fixes_name, ' ')) |i| i + 1 else 0..];
-        const wildcard_i = std.mem.indexOfScalar(u8, pattern, '_').?;
-        const parts = .{ pattern[0..wildcard_i], @tagName(inst.tag), pattern[wildcard_i + 1 ..] };
+        const pattern = fixes_name[if (std.mem.indexOfScalar(u8, fixes_name, ' ')) |i| i + " ".len else 0..];
+        const wildcard_index = std.mem.indexOfScalar(u8, pattern, '_').?;
+        const parts = .{ pattern[0..wildcard_index], @tagName(inst.tag), pattern[wildcard_index + "_".len ..] };
         const err_msg = "unsupported mnemonic: ";
         const mnemonic = std.fmt.bufPrint(&buf, "{s}{s}{s}", parts) catch
             return lower.fail(err_msg ++ "'{s}{s}{s}'", parts);
src/arch/x86_64/Mir.zig
@@ -34,6 +34,16 @@ pub const Inst = struct {
         /// ___ 4
         _4,
 
+        /// ___ Demote
+        _demote,
+        /// ___ Flush
+        _flush,
+        /// ___ Flush Optimized
+        _flushopt,
+        /// ___ Instructions With T0 Hint
+        _it0,
+        /// ___ Instructions With T1 Hint
+        _it1,
         /// ___ With NTA Hint
         _nta,
         /// System Call ___
@@ -44,6 +54,8 @@ pub const Inst = struct {
         _t1,
         /// ___ With T2 Hint
         _t2,
+        /// ___ Write Back
+        _wb,
         /// ___ With Intent to Write and T1 Hint
         _wt1,
 
@@ -53,6 +65,8 @@ pub const Inst = struct {
         _csspq,
         /// ___ FS Segment Base
         _fsbase,
+        /// ___ GS
+        _gs,
         /// ___ GS Segment Base
         _gsbase,
         /// ___ Model Specific Register
@@ -67,8 +81,14 @@ pub const Inst = struct {
         _pmc,
         /// ___ Random Number
         _rand,
+        /// ___ r Busy Flag in a Supervisor Shadow Stack token
+        _rssbsy,
         /// ___ Random Seed
         _seed,
+        /// ___ Shadow Stack Doubleword
+        _ssd,
+        /// ___ Shadow Stack Quadword
+        _ssq,
         /// ___ Shadow Stack Pointer Doubleword
         _sspd,
         /// ___ Shadow Stack Pointer Quadword
@@ -77,9 +97,15 @@ pub const Inst = struct {
         _tsc,
         /// ___ Time-Stamp Counter And Processor ID
         _tscp,
+        /// ___ User Shadow Stack Doubleword
+        _ussd,
+        /// ___ User Shadow Stack Quadword
+        _ussq,
         /// VEX-Encoded ___ MXCSR
         v_mxcsr,
 
+        /// Byte ___
+        b_,
         /// Interrupt ___
         /// Integer ___
         i_,
@@ -118,6 +144,8 @@ pub const Inst = struct {
         _ld,
         /// ___ Left Without Affecting Flags
         _lx,
+        /// ___ Mask
+        _msk,
         /// ___ Right
         /// ___ For Reading
         /// ___ Register
@@ -139,6 +167,7 @@ pub const Inst = struct {
         /// ___ Below
         _b,
         /// ___ Below Or Equal
+        /// ___ Big Endian
         _be,
         /// ___ Carry
         /// ___ Carry Flag
@@ -212,8 +241,12 @@ pub const Inst = struct {
         _w,
         /// ___ Doubleword
         //_d,
+        /// ___ Double Quadword to Quadword
+        _dq2q,
         /// ___ QuadWord
         _q,
+        /// ___ Quadword to Double Quadword
+        _q2dq,
 
         /// ___ String
         //_s,
@@ -369,6 +402,8 @@ pub const Inst = struct {
         fn_sw,
         /// Float Extended ___
         fx_,
+        /// Float Extended ___ 64
+        fx_64,
 
         /// ___ in 32-bit and Compatibility Mode
         _32,
@@ -390,6 +425,14 @@ pub const Inst = struct {
         p_dq,
         /// Packed ___ Unsigned Doubleword to Quadword
         p_udq,
+        /// Packed Carry-Less ___ Quadword to Double Quadword
+        pcl_qdq,
+        /// Packed Horizontal ___ Doubleword
+        ph_d,
+        /// Packed Horizontal ___ Saturate Word
+        ph_sw,
+        /// Packed Horizontal ___ Word
+        ph_w,
         /// ___ Aligned Packed Integer Values
         _dqa,
         /// ___ Unaligned Packed Integer Values
@@ -403,6 +446,10 @@ pub const Inst = struct {
         //_sd,
         /// ___ Packed Double-Precision Values
         _pd,
+        /// Horizontal ___ Packed Single-Precision Values
+        h_ps,
+        /// Horizontal ___ Packed Double-Precision Values
+        h_pd,
 
         /// ___ Internal Caches
         //_d,
@@ -430,7 +477,7 @@ pub const Inst = struct {
         v_w,
         /// VEX-Encoded ___ Doubleword
         v_d,
-        /// VEX-Encoded ___ QuadWord
+        /// VEX-Encoded ___ Quadword
         v_q,
         /// VEX-Encoded ___ Aligned Packed Integer Values
         v_dqa,
@@ -453,6 +500,14 @@ pub const Inst = struct {
         vp_dq,
         /// VEX-Encoded Packed ___ Unsigned Doubleword to Quadword
         vp_udq,
+        /// VEX-Encoded Packed Carry-Less ___ Quadword to Double Quadword
+        vpcl_qdq,
+        /// VEX-Encoded Packed Horizontal ___ Doubleword
+        vph_d,
+        /// VEX-Encoded Packed Horizontal ___ Saturate Word
+        vph_sw,
+        /// VEX-Encoded Packed Horizontal ___ Word
+        vph_w,
         /// VEX-Encoded ___ Scalar Single-Precision Values
         v_ss,
         /// VEX-Encoded ___ Packed Single-Precision Values
@@ -463,6 +518,10 @@ pub const Inst = struct {
         v_pd,
         /// VEX-Encoded ___ 128-Bits Of Floating-Point Data
         v_f128,
+        /// VEX-Encoded Horizontal ___ Packed Single-Precision Values
+        vh_ps,
+        /// VEX-Encoded Horizontal ___ Packed Double-Precision Values
+        vh_pd,
 
         /// ___ 128-bit key with key locker
         _128,
@@ -510,6 +569,10 @@ pub const Inst = struct {
         /// Add scalar single-precision floating-point values
         /// Add packed double-precision floating-point values
         /// Add scalar double-precision floating-point values
+        /// Packed single-precision floating-point horizontal add
+        /// Packed double-precision floating-point horizontal add
+        /// Packed horizontal add
+        /// Packed horizontal add and saturate
         add,
         /// Logical and
         /// Bitwise logical and of packed single-precision floating-point values
@@ -521,12 +584,15 @@ pub const Inst = struct {
         /// Bit scan reverse
         bs,
         /// Byte swap
-        bswap,
+        /// Swap GS base register
+        swap,
         /// Bit test
         /// Bit test and complement
         /// Bit test and reset
         /// Bit test and set
         bt,
+        /// Check array index against bounds
+        bound,
         /// Call
         /// Fast system call
         call,
@@ -542,17 +608,12 @@ pub const Inst = struct {
         /// Clear interrupt flag
         /// Clear task-switched flag in CR0
         /// Clear user interrupt flag
-        cl,
         /// Cache line demote
-        cldemote,
         /// Flush cache line
-        clflush,
         /// Flush cache line optimized
-        clflushopt,
         /// Clear busy flag in a supervisor shadow stack token
-        clrssbsy,
         /// Cache line write back
-        clwb,
+        cl,
         /// Complement carry flag
         cmc,
         /// Conditional move
@@ -650,15 +711,16 @@ pub const Inst = struct {
         lzcnt,
         /// Move
         /// Move data from string to string
+        /// Move data after swapping bytes
         /// Move scalar single-precision floating-point value
         /// Move scalar double-precision floating-point value
         /// Move doubleword
         /// Move quadword
         /// Move aligned packed integer values
         /// Move unaligned packed integer values
+        /// Move quadword from XMM to MMX technology register
+        /// Move quadword from MMX technology to XMM register
         mov,
-        /// Move data after swapping bytes
-        movbe,
         /// Move with sign extension
         movsx,
         /// Move with zero extension
@@ -671,6 +733,7 @@ pub const Inst = struct {
         /// Multiply scalar double-precision floating-point values
         /// Multiply packed unsigned doubleword integers
         /// Multiply packed doubleword integers
+        /// Carry-less multiplication quadword
         mul,
         /// Two's complement negation
         neg,
@@ -737,6 +800,8 @@ pub const Inst = struct {
         sca,
         /// Send user interprocessor interrupt
         senduipi,
+        /// Serialize instruction execution
+        serialize,
         /// Set byte on condition
         set,
         /// Logical shift left
@@ -758,6 +823,10 @@ pub const Inst = struct {
         /// Subtract scalar single-precision floating-point values
         /// Subtract packed double-precision floating-point values
         /// Subtract scalar double-precision floating-point values
+        /// Packed single-precision floating-point horizontal subtract
+        /// Packed double-precision floating-point horizontal subtract
+        /// Packed horizontal subtract
+        /// Packed horizontal subtract and saturate
         sub,
         /// Set carry flag
         /// Set direction flag
@@ -772,8 +841,6 @@ pub const Inst = struct {
         st,
         /// Store string
         sto,
-        /// Swap GS base register
-        swapgs,
         /// Test condition
         /// Logical compare
         /// Packed bit test
@@ -788,6 +855,8 @@ pub const Inst = struct {
         /// Write to model specific register
         /// Write to model specific register
         /// Write to model specific register
+        /// Write to shadow stack
+        /// Write to user shadow stack
         wr,
         /// Exchange and add
         xadd,
@@ -904,6 +973,10 @@ pub const Inst = struct {
         cmpgt,
         /// Empty MMX technology state
         emms,
+        /// Multiply and add packed signed and unsigned bytes
+        maddubs,
+        /// Multiply and add packed integers
+        maddw,
         /// Multiply packed signed integers and store low result
         mull,
         /// Multiply packed signed integers and store high result
@@ -932,6 +1005,8 @@ pub const Inst = struct {
         unpcklwd,
 
         // SSE
+        /// Average packed integers
+        avg,
         /// Convert packed doubleword integers to packed single-precision floating-point values
         /// Convert packed doubleword integers to packed double-precision floating-point values
         cvtpi2,
@@ -994,9 +1069,13 @@ pub const Inst = struct {
         /// Move unaligned packed single-precision floating-point values
         /// Move unaligned packed double-precision floating-point values
         movu,
+        /// Multiply packed unsigned integers and store high result
+        mulhu,
         /// Prefetch data into caches
         /// Prefetch data into caches with intent to write
         prefetch,
+        /// Compute sum of absolute differences
+        sadb,
         /// Packed interleave shuffle of quadruplets of single-precision floating-point values
         /// Packed interleave shuffle of pairs of double-precision floating-point values
         /// Shuffle packed doublewords
@@ -1056,9 +1135,6 @@ pub const Inst = struct {
         /// Packed single-precision floating-point add/subtract
         /// Packed double-precision floating-point add/subtract
         addsub,
-        /// Packed single-precision floating-point horizontal add
-        /// Packed double-precision floating-point horizontal add
-        hadd,
         /// Replicate double floating-point values
         movddup,
         /// Replicate single floating-point values
@@ -1069,6 +1145,10 @@ pub const Inst = struct {
         // SSSE3
         /// Packed align right
         alignr,
+        /// Packed multiply high with round and scale
+        mulhrs,
+        /// Packed sign
+        sign,
 
         // SSE4.1
         /// Pack with unsigned saturation
@@ -1090,6 +1170,8 @@ pub const Inst = struct {
         /// Extract packed floating-point values
         /// Extract packed integer values
         extract,
+        /// Packed horizontal word minimum
+        hminposu,
         /// Insert scalar single-precision floating-point value
         /// Insert packed floating-point values
         insert,
@@ -1111,10 +1193,6 @@ pub const Inst = struct {
         /// Accumulate CRC32 value
         crc32,
 
-        // PCLMUL
-        /// Carry-less multiplication quadword
-        clmulq,
-
         // AES
         /// Perform one round of an AES decryption flow
         /// Perform ten rounds of AES decryption flow with key locker using 128-bit key
@@ -1634,12 +1712,51 @@ pub const Inst = struct {
         reg_list: RegisterList,
     };
 
-    // Make sure we don't accidentally make instructions bigger than expected.
-    // Note that in safety builds, Zig is allowed to insert a secret field for safety checks.
     comptime {
         if (!std.debug.runtime_safety) {
+            // Make sure we don't accidentally make instructions bigger than expected.
+            // Note that in safety builds, Zig is allowed to insert a secret field for safety checks.
             assert(@sizeOf(Data) == 8);
         }
+        const Mnemonic = @import("Encoding.zig").Mnemonic;
+        if (@typeInfo(Mnemonic).@"enum".fields.len != 977 or // field counts recorded the last time the check below passed
+            @typeInfo(Fixes).@"enum".fields.len != 231 or
+            @typeInfo(Tag).@"enum".fields.len != 251)
+        { // entered only when one of the three enums changed size; every path through this body ends in a compile error
+            const cond_src = (struct { // captures this source location for the message printed at the end
+                fn src() std.builtin.SourceLocation {
+                    return @src();
+                }
+            }).src();
+            @setEvalBranchQuota(1_750_000);
+            for (@typeInfo(Mnemonic).@"enum".fields) |mnemonic| { // every mnemonic must be spellable as one Fixes pattern plus one Tag name
+                if (mnemonic.name[0] == '.') continue; // dot-prefixed names are exempt (presumably directives rather than instructions; confirm in Encoding.zig)
+                for (@typeInfo(Fixes).@"enum".fields) |fixes| { // the else branch of this loop runs only when no Fixes entry breaks out
+                    const pattern = fixes.name[if (std.mem.indexOfScalar(u8, fixes.name, ' ')) |index| index + " ".len else 0..]; // drop everything up to and including the first space, if there is one
+                    const wildcard_index = std.mem.indexOfScalar(u8, pattern, '_').?; // the underscore marks where the Tag name goes; each pattern is required to contain one
+                    const mnem_prefix = pattern[0..wildcard_index];
+                    const mnem_suffix = pattern[wildcard_index + "_".len ..];
+                    if (!std.mem.startsWith(u8, mnemonic.name, mnem_prefix)) continue;
+                    if (!std.mem.endsWith(u8, mnemonic.name, mnem_suffix)) continue;
+                    if (@hasField(
+                        Tag,
+                        mnemonic.name[mnem_prefix.len .. mnemonic.name.len - mnem_suffix.len], // NOTE(review): assumes prefix and suffix never overlap inside a name; confirm
+                    )) break; // the text between prefix and suffix names a Tag field, so this mnemonic is encodable
+                } else @compileError("'" ++ mnemonic.name ++ "' is not encodable in Mir");
+            }
+            @compileError(std.fmt.comptimePrint( // reached when every mnemonic passed: still an error, so the constants in the condition get updated
+                \\All mnemonics are encodable in Mir! You may now change the condition at {s}:{d} to:
+                \\if (@typeInfo(Mnemonic).@"enum".fields.len != {d} or
+                \\    @typeInfo(Fixes).@"enum".fields.len != {d} or
+                \\    @typeInfo(Tag).@"enum".fields.len != {d})
+            , .{
+                cond_src.file,
+                cond_src.line - 6, // the @src() call sits 6 lines below the if; keep this offset in sync when adding or removing lines in between
+                @typeInfo(Mnemonic).@"enum".fields.len,
+                @typeInfo(Fixes).@"enum".fields.len,
+                @typeInfo(Tag).@"enum".fields.len,
+            }));
+        }
     }
 };
 
test/behavior/x86_64/build.zig
@@ -87,7 +87,7 @@ pub fn build(b: *std.Build) void {
         .{
             .cpu_arch = .x86_64,
             .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64_v2 },
-            .cpu_features_add = std.Target.x86.featureSet(&.{ .adx, .gfni, .pclmul }),
+            .cpu_features_add = std.Target.x86.featureSet(&.{ .adx, .fast_hops, .gfni, .pclmul, .slow_incdec }),
         },
         .{
             .cpu_arch = .x86_64,
@@ -97,6 +97,7 @@ pub fn build(b: *std.Build) void {
         .{
             .cpu_arch = .x86_64,
             .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64_v3 },
+            .cpu_features_add = std.Target.x86.featureSet(&.{ .adx, .fast_hops, .gfni, .pclmul, .slow_incdec }),
             .cpu_features_sub = std.Target.x86.featureSet(&.{.avx2}),
         },
         .{
@@ -106,12 +107,11 @@ pub fn build(b: *std.Build) void {
         .{
             .cpu_arch = .x86_64,
             .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64_v3 },
-            .cpu_features_add = std.Target.x86.featureSet(&.{ .adx, .gfni, .pclmul }),
+            .cpu_features_add = std.Target.x86.featureSet(&.{ .adx, .fast_hops, .gfni, .slow_incdec, .vpclmulqdq }),
         },
         .{
             .cpu_arch = .x86_64,
             .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64_v4 },
-            .cpu_features_add = std.Target.x86.featureSet(&.{.vpclmulqdq}),
         },
     }) |query| {
         const target = b.resolveTargetQuery(query);
test/behavior/x86_64/unary.zig
@@ -4451,141 +4451,354 @@ fn unary(comptime op: anytype, comptime opts: struct {
         }
         fn testFloatVectors() !void {
             try testArgs(@Vector(1, f16), .{
-                -0x1.17cp-12,
+                0x1.7d8p12,
             });
             try testArgs(@Vector(2, f16), .{
-                0x1.47cp9, 0x1.3acp9,
+                -0x0.054p-14, -0x1.c6cp10,
+            });
+            try testArgs(@Vector(3, f16), .{
+                -0x1.39cp-3, -0x1.088p4, -0x0.644p-14,
             });
             try testArgs(@Vector(4, f16), .{
-                0x1.ab4p0, -0x1.7fcp-7, -0x1.1cp0, -0x1.f14p12,
+                -0x1.108p11, 0x1.364p-3, 0x1.8f4p-2, -0x0.8acp-14,
+            });
+            try testArgs(@Vector(5, f16), .{
+                0x1.e1p8, 0x1.ddp11, 0x0.388p-14, 0x1.7p-7, -0x0.a08p-14,
+            });
+            try testArgs(@Vector(7, f16), .{
+                0x1.988p-14, -0x1.f7p-14, 0x1.38cp12, 0x0.0fp-14, -0x1.774p2, -0x1.de4p11, -0x1.9bp-10,
             });
             try testArgs(@Vector(8, f16), .{
-                -0x1.8d8p8, 0x1.83p10, -0x1.5ap-1, -0x1.d78p13, -0x1.608p12, 0x1.e8p-9, -0x1.688p-10, -0x1.738p9,
+                0x1.6ecp12, -0x1.834p9, -0x1.2c8p13, 0x1.e7cp3, -0x1.418p3, 0x1.15cp-1, 0x1.fecp-2, 0x1.1dp-3,
+            });
+            try testArgs(@Vector(9, f16), .{
+                0x1.da8p-1, 0x1.d44p-11, 0x1.884p-10, -0x1.898p1, 0x1.5ccp-5, 0x1.68p0, 0x1.618p14, -0x1.c34p2,
+                -0x1.318p6,
+            });
+            try testArgs(@Vector(15, f16), .{
+                0x1.41cp11, 0x1.edp-1,   0x1.1c8p-12, -0x0.0ecp-14, -0x1.abp8,   0x1.34p0,  -0x1.24cp-4, -0x1.214p1,
+                -0x1.604p9, -0x1.364p-1, 0x1.adp0,    0x0.63p-14,   0x0.60cp-14, 0x1.6ep-6, 0x0.84cp-14,
             });
             try testArgs(@Vector(16, f16), .{
-                0x1.da8p-1, -0x1.ed4p-10, -0x1.dc8p1,  0x1.b78p-14, nan(f16),    0x1.9d8p8,   nan(f16),     0x1.d5p13,
-                -0x1.2dp13, 0x1.6c4p12,   0x1.a9cp-11, -0x1.0ecp8,  0x0.4ccp-14, -0x1.0a8p-6, -0x1.5bcp-14, 0x1.6d8p-9,
+                0x1.308p6,  -0x1.078p-1, 0x0.81p-14, 0x1.1b4p-14, 0x1.4ep-7,   0x1.75p12,  0x1.264p-8,   0x1.a6p2,
+                0x1.9a4p-3, 0x1.e9p4,    -0x1.a4p-6, 0x1.6acp-1,  0x1.7e8p-12, -0x1.02cp6, -0x1.0ccp-14, 0x1.edp-12,
+            });
+            try testArgs(@Vector(17, f16), .{
+                0x1.2c4p-1,  0x1.91cp-3,   0x1.bf8p10,  -0x0.25p-14, 0x1.45p-9,   0x1.cap-2,   0x1.e9cp8,  0x1.b7p8,
+                0x1.21cp9,   -0x0.ba4p-14, -0x1.ddcp-4, -0x1.bcp9,   -0x1.7dcp-3, 0x1.6a4p-12, 0x1.ca8p-8, -0x1.558p11,
+                0x0.26cp-14,
+            });
+            try testArgs(@Vector(31, f16), .{
+                -0x1.f94p7,   0x1.55cp9,   -0x1.f78p11,  -0x0.f48p-14, -0x1.b6p-2,  0x1.85cp1,    -0x1.114p4,  -0x1.97cp-5,
+                -0x1.6f8p2,   0x1.79cp-3,  0x1.e58p-9,   -0x1.f5cp-10, 0x1.a74p5,   -0x0.1e8p-14, 0x1.15cp-14, 0x1.814p-7,
+                -0x0.318p-14, -0x1.b5p-5,  -0x1.058p-10, 0x1.124p0,    -0x1.20cp-1, 0x1.978p10,   -0x1.808p-8, 0x1.528p-6,
+                -0x1.ba8p9,   0x0.294p-14, 0x1.11cp0,    0x1.e5p5,     0x1.904p-11, 0x1.d78p11,   -0x1.c1p5,
             });
             try testArgs(@Vector(32, f16), .{
-                0x1.d5cp-6,  -0x1.a98p5,  0x1.49cp5,   -0x1.e4p-1,  -0x1.21p-13, -0x1.c94p-1, -0x1.adcp-5, -0x1.524p-1,
-                -0x1.0d8p-3, -0x1.5c4p-2, 0x1.f84p-2,  0x1.664p1,   -0x1.f64p13, -0x1.bf4p4,  -0x1.4b8p0,  -0x0.f64p-14,
-                -0x1.3f8p1,  0x1.098p2,   -0x1.a44p8,  0x1.048p13,  0x1.fd4p-11, 0x1.18p-9,   -0x1.504p2,  0x1.d04p7,
-                -nan(f16),   0x1.a94p2,   0x0.5e8p-14, -0x1.7acp-7, 0x1.4c8p-3,  0x1.518p-4,  nan(f16),    0x1.8f8p10,
+                -0x0.11p-14, 0x0.29cp-14, 0x1.7a8p5,    0x1.49cp-11,  0x1.6c4p-3,   -0x1.85cp-11, 0x1.ap-8,   -0x0.49cp-14,
+                0x1.dfp2,    -0x1.4cp1,   0x1.138p-5,   -0x1.45p-9,   0x0.88cp-14,  0x1.6acp10,   0x1.594p3,  0x1.704p6,
+                -0x1.c34p13, 0x1.44cp0,   -0x1.cfcp-10, 0x1.5c8p-4,   -0x1.b2cp-10, -0x1.178p1,   -0x1.b74p7, -0x1.d18p0,
+                0x1.0fcp-9,  0x1.b6p-11,  -0x1.ff4p-2,  -0x0.0b8p-14, 0x1.4dcp-10,  -0x1.af4p-5,  -0x1.eap2,  -0x1.79cp-4,
+            });
+            try testArgs(@Vector(33, f16), .{
+                -0x1.6e8p0,  -0x1.304p-12, 0x1.558p11,  0x1.cf4p13,  0x1.cc4p-9,   0x1.d88p-11,  0x1.838p8,   -0x1.2ecp-10,
+                -0x1.65cp-1, -0x1.644p8,   -0x1.048p10, 0x0.114p-14, 0x1.8a4p13,   0x1.c9p-3,    0x1.dfp-6,   -0x1.774p12,
+                -0x0.4dp-14, 0x1.2ccp-12,  0x0.98p-14,  -0x1.b18p-6, 0x0.1ecp-14,  0x0.86cp-14,  0x0.6e8p-14, -0x1.6dp14,
+                0x1.9e8p-3,  0x1.1ep10,    -0x1.6cp13,  -0x1.d44p1,  -0x1.f54p-12, -0x1.fe8p-14, 0x1.968p-1,  -0x1.ab4p-9,
+                0x1.f0cp0,
+            });
+            try testArgs(@Vector(63, f16), .{
+                -0x1.3ecp-1,  0x0.04p-14,  -0x1.1cp-2,   0x1.0dp10,    0x1.ddcp-12,  -0x1.57cp-11, -0x1.84p-9,  0x1.dfp4,
+                0x1.6e4p-9,   0x0.5d4p-14, -0x0.51cp-14, -0x1.bp2,     -0x1.8ecp-14, 0x1.268p-2,   -0x0.69p-14, -0x1.b98p7,
+                -0x0.cb4p-14, -0x1.accp-3, 0x1.cdcp6,    -0x1.e6p7,    0x1.4ep-14,   0x1.5fp5,     -0x1.95p8,   0x1.044p8,
+                -0x1.e14p9,   0x1.e84p14,  0x1.ee8p-10,  -0x1.0a4p8,   0x1.b14p-8,   -0x1.5dp9,    0x0.e68p-14, -0x0.1acp-14,
+                -0x1.7ccp-11, 0x1.45p-10,  0x0.044p-14,  0x1.078p4,    0x1.c8p-1,    -0x1.8fp11,   -0x1.cbp0,   -0x1.208p-10,
+                -0x1.a5p-1,   -0x1.164p-8, -0x1.304p-3,  -0x1.038p-10, -0x1.4dp11,   0x0.248p-14,  0x1.09cp-4,  -0x1.a7cp14,
+                -0x1.a38p-6,  -0x1.0bp-9,  -0x1.fecp-14, -0x1.c78p-10, -0x1.e38p-11, 0x1.47p-5,    -0x1.3bcp5,  0x1.6a4p9,
+                0x0.728p-14,  0x1.9c8p9,   0x1.88p12,    -0x1.e6p0,    0x1.5dcp-2,   -0x1.7f4p-4,  -0x1.a6p3,
             });
             try testArgs(@Vector(64, f16), .{
-                -0x1.c2p2,   0x0.2fcp-14,  0x1.de8p0,    -0x1.714p2,   0x1.f9p-7,    -0x1.11cp-13, -0x1.558p10, -0x1.2acp-7,
-                0x1.348p14,  0x1.2dcp7,    -0x1.8acp-12, -0x1.2cp2,    0x1.868p1,    -0x1.1f8p-14, 0x1.638p7,   -0x1.734p-5,
-                0x0.b98p-14, -0x1.7f4p-12, -0x1.38cp15,  0x1.50cp15,   0x1.91cp8,    0x1.cb4p-1,   0x1.fc4p-13, 0x1.9a4p0,
-                0x1.18p-4,   0x1.60cp10,   0x1.6fp-12,   0x1.b48p6,    0x1.37cp-11,  0x1.424p7,    0x1.44cp13,  0x1.aep5,
-                0x1.968p14,  0x1.e8p13,    -0x1.bp2,     -0x1.644p5,   0x1.de4p-8,   -0x1.5b4p-14, -0x1.4ap1,   -0x1.868p9,
-                -0x1.d14p0,  0x1.d7cp15,   0x1.3c8p14,   0x1.2ccp-14,  -0x1.ee4p8,   0x1.49p-3,    0x1.35cp12,  0x1.d34p6,
-                0x1.7acp3,   -0x1.fa4p2,   0x1.7b4p13,   -0x1.cf4p-12, -0x1.ebcp-10, -0x1.5p-3,    0x1.4bp-6,   0x1.83p12,
-                -0x1.f9cp-8, -0x1.43p-8,   -0x1.99p-1,   -0x1.dacp3,   -0x1.728p-4,  -0x1.03cp4,   0x1.604p-2,  -0x1.0ep13,
+                -0x1.67cp-13, 0x1.f2cp-10, 0x1.69cp11,  -0x1.0dp-2,  0x1.a8p9,     0x1.7dp-11,   0x1.908p-5,  -0x1.37cp0,
+                0x1.8f8p5,    0x1.38p11,   0x1.d2p8,    0x1.b74p-10, -0x1.188p-7,  0x1.578p5,    0x1.68p-11,  -0x1.b9cp8,
+                -0x1.ba4p2,   0x0.b78p-14, 0x1.458p-8,  0x0.054p-14, -0x0.63p-14,  0x1.83p10,    0x1.94cp-2,  -0x1.d7p2,
+                -0x1.62p4,    0x1.b34p4,   -0x1.4cp-11, -0x1.714p9,  -0x1.ce4p1,   0x1.75p-3,    -0x1.cbp-13, 0x1.714p6,
+                -0x1.cb8p7,   -0x1.b98p-4, 0x1.facp-13, -0x1.1f4p8,  -0x1.92p-3,   0x0.144p-14,  0x1.504p-4,  0x1.a9p-10,
+                0x1.a94p3,    0x1.708p-2,  0x1.c84p-14, 0x1.77cp9,   -0x0.1e4p-14, -0x0.3d8p-14, -0x1.f8p4,   -0x1.2bp5,
+                0x1.5b8p-14,  0x1.898p14,  -0x1.e2p3,   -0x1.0e8p-5, 0x1.4dcp-12,  0x1.368p8,    0x1.968p-7,  -0x1.98cp-5,
+                0x1.39cp-13,  0x1.23p2,    0x1.8e8p6,   0x1.344p7,   0x1.70cp-5,   -0x1.f24p11,  -0x1.54p-7,  -0x1.904p3,
+            });
+            try testArgs(@Vector(65, f16), .{
+                -0x1.d78p-4, 0x1.ea8p-8,   -0x1.b4cp6,   -0x1.c7cp4,   0x1.dfcp7,    0x1.a8cp6,   -0x1.768p11,  0x0.0fp-14,
+                -0x1.a3p-4,  -0x1.868p-9,  0x1.23p-1,    -0x1.2e8p3,   -0x1.9e8p-12, 0x1.8a8p3,   0x1.168p-5,   -0x1.608p8,
+                -0x1.9d4p-4, -0x1.17cp-1,  -0x1.f2p1,    -0x1.d38p-11, 0x1.f38p-12,  -0x1.92p-11, 0x1.c44p6,    0x1.4fp-3,
+                0x0.18p-14,  0x1.3dp11,    -0x1.ce4p9,   -0x1.bf8p-12, 0x0.88cp-14,  -0x1.998p-9, 0x1.788p-2,   -0x1.5c4p2,
+                0x0.08cp-14, -0x0.6f8p-14, 0x1.c7cp-10,  -0x0.1p-14,   -0x1.0fcp-9,  -0x1.5a4p6,  -0x1.8c8p-12, 0x0.57p-14,
+                -0x1.96cp-9, 0x1.6ecp10,   -0x1.c18p1,   -0x1.0ap5,    -0x0.768p-14, -0x1.f8cp-6, 0x0.44p-14,   -0x1.2b4p-2,
+                0x1.efcp-13, -0x1.434p-13, 0x1.434p-3,   0x1.a6p-2,    0x1.bc4p7,    -0x0.e1p-14, -0x1.d9cp-7,  -0x1.f94p-9,
+                0x1.448p-6,  0x1.0d8p3,    -0x0.4a4p-14, -0x1.25cp-10, 0x1.c18p12,   0x0.1ccp-14, -0x1.ep14,    -0x1.42cp6,
+                0x1.14p8,
             });
 
             try testArgs(@Vector(1, f32), .{
-                -0x1.17cp-12,
+                0x1.12e082p8,
             });
             try testArgs(@Vector(2, f32), .{
-                -0x1.a3123ap90, -0x1.4a2ec6p-54,
+                -0x1.f04666p17, 0x1.27d624p4,
+            });
+            try testArgs(@Vector(3, f32), .{
+                -0x1.c3168cp-85, -0x1.169cdcp9, -0x1.4bdb2ap13,
             });
             try testArgs(@Vector(4, f32), .{
-                -0x1.8a41p77, -0x1.7c54e2p-61, -0x1.498556p-41, 0x1.d77c22p-20,
+                -0x1.a8b1d6p29, -0x1.b94e32p-76, 0x1.f4d9aap-43, 0x1.e6c654p44,
+            });
+            try testArgs(@Vector(5, f32), .{
+                0x1.37c57ep-53,  -0x1.832c84p49, -0x1.04256ep-110, -0x1.de4454p-37,
+                -0x1.a36832p-34,
+            });
+            try testArgs(@Vector(7, f32), .{
+                -0x1.35df86p87, -0x1.d96a52p62, 0x1.f9d3ecp-12, 0x1.5f4cc6p112,
+                0x1.176cfap94,  0x1.bb86fcp69,  0x1.015e56p0,
             });
             try testArgs(@Vector(8, f32), .{
-                0x1.943da4p-86, 0x1.528792p95,  -0x1.9c9bfap-26, -0x1.8df936p-90,
-                -0x1.6a70cep56, 0x1.626638p-48, 0x1.7bb2bap-57,  -0x1.ac5104p94,
+                -0x1.9dd6cap3,   0x1.726066p-42, 0x1.5b1f5ep-20, -0x1.347ed6p29,
+                0x1.bfb5d4p-126, -0x1.b0e8dp45,  0x1.5577bep45,  -0x1.9d1608p2,
+            });
+            try testArgs(@Vector(9, f32), .{
+                -0x1.4159b2p76,  0x1.bea7b8p-107, -0x1.b47036p-82, -0x1.4635ap-26,
+                -0x1.27bc98p-47, 0x1.1e0ap-116,   0x1.0f628p-118,  0x1.2e63bcp-62,
+                0x1.d0e45ep-57,
+            });
+            try testArgs(@Vector(15, f32), .{
+                0x1.65e0bcp-12, 0x1.d947c6p-42, -0x1.4596acp64,   -0x1.2a897cp75,
+                0x1.cb074ap-8,  0x1.e44a98p-62, -0x1.3edb2p74,    0x1.07aecep-2,
+                -0x1.fda1f8p14, 0x1.2f2c7ap-95, 0x1.9814e6p-33,   0x1.6d6a58p3,
+                0x1.6a1478p-3,  -0x1.85886ap64, -0x1.e2b9bcp-114,
             });
             try testArgs(@Vector(16, f32), .{
-                0x1.157044p115, -0x1.416c04p-111, 0x1.a8f164p-104, 0x1.9b6678p84,
-                -0x1.9d065cp9,  -0x1.e8c4b4p126,  -0x1.ddb968p84,  -0x1.fec8c8p74,
-                0x1.64ffb2p59,  0x1.548922p20,    0x1.7270fcp22,   -0x1.abac68p33,
-                0x1.faabfp33,   -0x1.8aee82p55,   0x1.1bf8fp75,    0x1.33c46ap-66,
+                0x1.348b38p103,  0x1.bbc8e4p8,   -0x1.03f48ap-119, -0x1.90f87cp115,
+                -0x1.88aaaep28,  -0x1.21ec4p-94, 0x1.e1f21cp-57,   0x1.0e7dd2p-37,
+                -0x1.5963a2p-24, 0x1.4c314cp-61, -0x1.753d5ap113,  -0x1.65705p-12,
+                -0x1.e34902p-54, -0x1.ab8022p87, -0x1.5cc252p-99,  0x1.4f4fe6p41,
+            });
+            try testArgs(@Vector(17, f32), .{
+                0x1.6be79ap-19, -0x1.38819p-21,  -0x1.8551dp2,     -0x1.43155ep-126,
+                0x1.96e6p108,   0x1.58abaap41,   0x1.145ffcp124,   -0x1.8e314ep-41,
+                -0x1.63151p42,  0x1.9585e8p124,  0x1.4bdd42p-66,   0x1.858674p-45,
+                -0x1.bccb68p66, -0x1.88e0e8p-14, -0x1.e0461cp-116, 0x1.3c1e2ep120,
+                -0x1.0076dep14,
+            });
+            try testArgs(@Vector(31, f32), .{
+                0x1.8d5b34p-49,   -0x1.bd019cp-83, -0x1.1d06e2p-95, -0x1.d9ac6ap-45,
+                0x1.f942dap10,    -0x1.c23402p121, -0x1.8e5656p-32, 0x1.925222p-53,
+                -0x1.16440ep-117, 0x1.b146cep107,  -0x1.b58cdep-52, 0x1.713f34p8,
+                0x1.3de424p99,    -0x1.3e6d6ep-28, -0x1.8261b4p-69, 0x1.043d66p-91,
+                -0x1.fbcd6ep113,  0x1.7934dcp-47,  0x1.fa8152p99,   0x1.c29968p-58,
+                0x1.77f26ap82,    0x1.4602aap-57,  -0x1.8a4cb4p8,   0x1.d48cdap113,
+                0x1.636a7ep29,    0x1.730262p57,   0x1.29e668p7,    0x1.58592cp20,
+                0x1.d09ebp-107,   0x1.7a85c6p-39,  0x1.38e1d6p44,
             });
             try testArgs(@Vector(32, f32), .{
-                -0x1.039b68p37,   -0x1.34de4ap-74, -0x1.05d78ap-76, -0x1.be0f5ap-47,
-                0x1.032204p-38,   0x1.ef8e2ap-78,  -0x1.b013ecp-80, 0x1.71fe4cp99,
-                0x1.abdadap-14,   0x1.56a9a8p-48,  -0x1.8bbd7ep9,   0x1.edd308p-72,
-                -0x1.92fafcp-121, -0x1.50812p19,   0x1.f4ddc4p28,   -0x1.6f0b12p-50,
-                -0x1.12ab02p127,  0x1.24df48p21,   -0x1.993c3p-14,  -0x1.4cc476p-112,
-                0x1.13d9a8p-40,   0x1.a6e652p-9,   -0x1.9c730cp-21, -0x1.a75aaap-70,
-                -0x1.39e632p-111, 0x1.8e8da8p-45,  0x1.b5652cp31,   0x1.258366p44,
-                0x1.d473aap92,    -0x1.951b64p9,   0x1.542edp15,    -0x0.f6222ap-126,
+                -0x1.95dec4p-65,  0x1.3833cp65,    -0x1.0ef5ap-53,  0x1.86e4c8p101,
+                -0x1.713132p24,   -0x1.c6fd0ep123, -0x1.75aadcp88,  -0x1.b8f0fp18,
+                0x1.0f5b8ep-34,   -0x1.0d0d66p-15, 0x0.842836p-126, -0x1.157782p22,
+                -0x1.025e8ap-100, 0x1.be825ep117,  0x1.d3efc6p-45,  0x1.ed8462p-34,
+                -0x1.b373c8p-118, -0x1.dbfd16p4,   0x1.73ee9p-56,   -0x1.cdff48p-69,
+                0x1.1b806ep-78,   0x1.65a58ap-4,   -0x1.0d851cp77,  0x1.442c12p41,
+                0x1.215116p47,    -0x1.75f266p-48, 0x1.2273d4p89,   0x1.1bab24p-100,
+                -0x1.0300ep-22,   0x1.8c199cp-70,  -0x1.70e08cp-66, 0x1.aa6b3ep-24,
+            });
+            try testArgs(@Vector(33, f32), .{
+                -0x1.4eddccp-116, 0x1.724e18p-94,  -0x1.9d40bep54,  -0x1.0afc5p-14,
+                0x1.576c2p92,     0x1.cf52b6p110,  -0x1.7e67ep117,  -0x1.7db66ep90,
+                0x1.3eac22p-38,   0x1.6ba068p72,   -0x1.72dc2cp97,  -0x1.4193f4p72,
+                0x1.aa81f6p86,    0x1.984268p53,   -0x1.14ba6ep-45, 0x1.15603ep-122,
+                0x1.85e75p-56,    0x1.108a82p-121, 0x1.569ecp62,    -0x1.7f3268p-68,
+                -0x1.d0964ep0,    0x0.f7a596p-126, -0x1.367646p-11, 0x1.2065bp-26,
+                0x1.cc954ap125,   -0x1.956e1cp65,  0x1.774dep112,   0x1.69dfcep-16,
+                -0x1.b0efb2p76,   0x1.14c54p70,    -0x1.7c6b08p25,  0x1.ae20b4p31,
+                -0x1.73c584p-118,
             });
 
             try testArgs(@Vector(1, f64), .{
-                -0x1.0114613df6f97p816,
+                0x1.58849bfb1303cp-254,
             });
             try testArgs(@Vector(2, f64), .{
-                -0x1.8404dad72003cp720, -0x1.6b14b40bcf3b7p-176,
+                -0x1.b4a24030f3facp215, -0x1.c1bdddbc41cdep950,
+            });
+            try testArgs(@Vector(3, f64), .{
+                -0x1.7d154dcee386cp-284, -0x1.2fdda9cbabfap-84,
+                0x1.00c86a9c3de5cp-46,
             });
             try testArgs(@Vector(4, f64), .{
-                -0x1.04e1acbfddd9cp681, -0x1.ed553cc056da7p-749,
-                0x1.3d3f703a0c893p-905, 0x1.0b35633fa78fp691,
+                0x1.70f298f25a9bfp826,   0x1.4b944832c8eecp-319,
+                -0x1.d801afafdbc01p-708, -0x1.65d0b4b097a57p-872,
+            });
+            try testArgs(@Vector(5, f64), .{
+                -0x1.4796bdf4c112bp938, 0x1.3661030c6a2fp-156,
+                -0x1.20d194f89bc7fp-9,  -0x1.f545d17a1d9e8p604,
+                0x1.c786013e7205ep-514,
+            });
+            try testArgs(@Vector(7, f64), .{
+                -0x1.8f6d6e549941fp501, -0x1.56374640d779p-762,
+                -0x1.4ea02d12bd9cfp209, -0x1.ab85b639e78c6p-879,
+                -0x1.fcd56fe4f85abp47,  -0x1.8963745584169p-957,
+                -0x1.581a8a0033e8p915,
             });
             try testArgs(@Vector(8, f64), .{
-                -0x1.901a2a60f0562p-301, -0x1.2516175ad61ecp-447,
-                0x1.e7b12124846bfp564,   0x1.9291384bd7259p209,
-                -0x1.a7bf62f803c98p900,  0x1.4e2e26257bb3p987,
-                -0x1.413ca9a32d894p811,  0x1.61b1dd9432e95p479,
+                -0x1.2a8fb1782b7f2p-126, -0x1.b246d12815c21p606,
+                0x1.6bc24f2a268b9p837,   0x1.1d550478ebd71p1016,
+                0x1.d2ba52815edc2p252,   0x1.a8d87e5eb97ecp-450,
+                -0x1.c8a3d899aa89p601,   -0x1.1fa47083d9a8fp289,
+            });
+            try testArgs(@Vector(9, f64), .{
+                -0x1.312d39a09757p-567,  -0x1.4b0ef2ac9424ep-10,
+                0x1.84302715c6852p930,   -0x1.01565f82fd32p761,
+                -0x1.36ad9c057719ap-351, 0x1.dc4929f2400c8p793,
+                -0x1.e90f3ae855d3dp-474, 0x1.4e65fb145865ep-834,
+                0x1.4236a94937ee3p-987,
+            });
+            try testArgs(@Vector(15, f64), .{
+                0x1.df73a72937309p351,  -0x1.73506ab182b9p-23,
+                0x1.b2c954612187p-997,  0x1.7c5ee7c602989p-93,
+                -0x1.5edba35428d13p762, -0x1.e3bc1f194dc8cp-386,
+                0x1.ca056fb59bdb9p651,  0x1.e59b99b174a0dp-528,
+                0x1.7a995c7651aa7p929,  -0x1.a25d3d5153405p413,
+                0x1.e5579317d4b37p-50,  0x1.f9d5578c67f67p-90,
+                -0x1.5da751d423506p611, 0x1.9a2cba7bf2467p488,
+                0x1.db3d45f662c4ep-619,
             });
             try testArgs(@Vector(16, f64), .{
-                -0x1.8fc7286d95f54p-235,  -0x1.796a7ea8372b6p-837,
-                -0x1.8c0f930539acbp-98,   -0x1.ec80dfbf0b931p-430,
-                -0x1.e3d80c640652fp-1019, 0x1.8241238fb542fp161,
-                -0x1.e1f1a79d50263p137,   -0x1.9ac5cb2771c28p-791,
-                0x1.4d8f00fe881e7p-401,   -0x1.87fbd7bfd99d7p346,
-                -0x1.a8a7cc575335ep1017,  0x1.37bb88dc3fd8bp-355,
-                0x1.9d53d346c0e65p929,    -0x1.bbae3d0229c34p289,
-                -0x1.cb8ef994d5ce5p25,    0x1.ba20af512616ap50,
+                0x1.fd61de463a33cp898,  -0x1.47be52b4f1241p-18,
+                0x1.729aa777312a3p-930, -0x1.2db258cd9984dp895,
+                0x1.a1fbc900c10cbp517,  -0x1.e93dfa8923807p815,
+                -0x1.e8f19fc0aa2a8p191, -0x1.1b084206321d5p861,
+                -0x1.0be3c6310c58ep457, 0x1.816c3bcf4b9f5p-504,
+                0x1.ec4b026b00c91p-831, 0x1.e42d18f5c7e4bp924,
+                -0x1.f1483ecd74646p560, -0x1.cc5aea97d2264p447,
+                -0x1.a0b1e5b69d166p597, 0x1.e9a109fcf1358p694,
+            });
+            try testArgs(@Vector(17, f64), .{
+                -0x1.cd163cf2878e5p-934, -0x1.ce0ad5b67552p196,
+                -0x1.da0fd3a62b298p508,  0x1.1981c99b14943p3,
+                0x1.d2f6461a9d1a9p390,   -0x1.e8e877d3b4e96p-539,
+                -0x1.8ad9d3e185c43p864,  0x1.61786be9783eep-110,
+                -0x1.1f4be91d90cc3p-500, 0x1.71cacdd984837p956,
+                0x1.7b6ae301fd95ep-661,  0x1.24571ba56e32p343,
+                0x1.b1a9454ab9481p648,   -0x1.887873f8044fep842,
+                -0x1.2f4ee57b9de22p-967, -0x1.c931346ced885p-983,
+                0x1.fe31b9923796bp-772,
             });
 
             try testArgs(@Vector(1, f80), .{
-                -0x1.a2e9410a7dfedabp-2324,
+                -0x1.482098130df28b74p12578,
             });
             try testArgs(@Vector(2, f80), .{
-                -0x1.a2e9410a7dfedabp-2324,
-                0x1.2b17da3b9746885p-8665,
+                -0x1.275157565b1eee5ep14003,
+                0x1.a27b82ef4be6132ap3681,
+            });
+            try testArgs(@Vector(3, f80), .{
+                0x1.9825fbd9b22021fep-10432,
+                -0x1.b8c8c4e5e3911ca8p13568,
+                0x1.aa99cc199c8e524p9865,
             });
             try testArgs(@Vector(4, f80), .{
-                -0x1.c488fedb7ab646cep-13007,
-                0x1.e914deaccaa50016p2073,
-                -0x1.d1c7ae8ec3c9df86p10642,
-                -0x1.2da1658f337fa01p9893,
+                -0x1.9d8ab0a36953d0f6p-760,
+                0x1.869b464121ce6576p-13660,
+                0x1.a54b1d1e8ae2b62ap12073,
+                -0x1.2abe41c9a9d89ea4p-13141,
+            });
+            try testArgs(@Vector(5, f80), .{
+                0x1.0fb10e205522f5aep-15041,
+                -0x1.13e0c338580504dap10809,
+                0x1.50e7c6666fd851acp-5508,
+                -0x1.e2231120481fc762p-8351,
+                0x1.4fae86dc45b06fe2p10741,
+            });
+            try testArgs(@Vector(7, f80), .{
+                -0x1.fe8f8caa4e8697ecp-2992,
+                0x1.2623c910a340e286p-14518,
+                0x1.c5524642a438569p-9469,
+                0x1.3d416ca0a47c73cep2981,
+                0x1.a3a1eb1243923114p-6689,
+                -0x1.a55df9ded3010b1cp-5798,
+                -0x1.3d593df395b03e5ap-14382,
             });
             try testArgs(@Vector(8, f80), .{
-                -0x1.bed8a74c43750656p890,
-                -0x1.7bf57f38004ac976p8481,
-                -0x1.9cdc10ac0657d328p7884,
-                0x1.c86f61883da149fp12293,
-                -0x1.528d6957df6bfdd8p14125,
-                -0x1.5ebb4006d0243bfep14530,
-                -0x1.94b9b18636d12402p-1845,
-                -0x1.25439a6d68add188p5962,
+                -0x1.9bb73ea024f4167cp3116,
+                0x1.adf6241753b29ed2p-4428,
+                -0x1.1494fa8680f9f5f4p2008,
+                -0x1.c68a673c59edeb24p2377,
+                0x1.26c7ab4021afb6dcp1376,
+                0x1.c829b0b3935a2ac6p-11758,
+                -0x1.11e39b110c2fb122p-3836,
+                -0x1.6db14745e291d466p1604,
+            });
+            try testArgs(@Vector(9, f80), .{
+                0x1.f6e537676c132cc6p-10213,
+                -0x1.b86eadf24d8c80eep808,
+                -0x1.54bc27c9a9a2348cp-2369,
+                -0x1.99453820b245bc5p-840,
+                -0x1.93c299090fd981e6p-5264,
+                -0x1.c742059979281ec4p-6347,
+                -0x1.e3efe7b892591d3p-1877,
+                -0x1.350c20a2d59c67dap-8972,
+                -0x1.e3879f20ffc62ff2p-2600,
             });
 
             try testArgs(@Vector(1, f128), .{
-                -0x1.d1e6fc3b1e66632e7b79051a47dap14300,
+                -0x1.274ece23c1832bfe66a1bc59cf87p-8354,
             });
             try testArgs(@Vector(2, f128), .{
-                0x1.84b3ac8ffe5893b2c6af8d68de9dp-83,
-                -0x1.438ca2c8a0d8e3ee9062d351c46ep-10235,
+                0x1.838a4e7ba1e2191cebe701eac5d4p6581,
+                0x1.cdfbda51a2adbce757d7c2e0981bp446,
+            });
+            try testArgs(@Vector(3, f128), .{
+                -0x1.ff45938938f76db417c980c368c6p-7215,
+                -0x1.277a316793a0172e49c7227952ccp10618,
+                0x1.d85027eb4f4ed3512c10bff9a199p-8465,
             });
             try testArgs(@Vector(4, f128), .{
-                0x1.04eb03882d4fd1b090e714d3e5ep806,
-                -0x1.4082b29f7c26e701764c915642ffp-6182,
-                -0x1.b6f1e8565e5040415110f18b519ap13383,
-                0x1.1c29f8c162cead9061c5797ea15ap11957,
+                -0x1.43d8ecf283d4ec6fc4993f385386p-12233,
+                -0x1.384424d239aa2ed9719d2c2d1e58p7346,
+                -0x1.d33fd11001f0ab6d0f9a2790b41cp14692,
+                -0x1.40219a635ef4b042cfb9d7bd9781p900,
+            });
+            try testArgs(@Vector(5, f128), .{
+                -0x1.3273c97faf4619baedaebb51148fp9085,
+                -0x1.f381263ad1033a071dff3a143b14p-13649,
+                -0x1.24b24810f9a1f9b5d1542e2b5841p1425,
+                -0x1.df9e062d482c2bbae0b8fcb07efep-5044,
+                -0x1.15cbca8b8384412d7d09ff76bfe4p-2424,
+            });
+            try testArgs(@Vector(7, f128), .{
+                -0x1.0972e6da79fa8bcd49431d813ea5p12192,
+                0x1.568e3e61ac4fb17303e4ead041dcp-2542,
+                0x1.a55c3f0014942187e6d40c72f12p-13437,
+                -0x1.31fb0ec6dbdf7e4ea8ecc307e6f4p13767,
+                -0x1.5dcc12514e3e540fea9dbd257935p-8938,
+                -0x1.32471cd1d5d2a36e9148a8ce879ap-3274,
+                -0x1.3fd3eb6d86a14567e49f358cf029p-4569,
             });
             try testArgs(@Vector(8, f128), .{
-                -0x1.53d7f00cd204d80e5ff5bb665773p11218,
-                -0x1.4daa1c81cffe28e8fa5cd703c287p2362,
-                -0x1.cc6a71c3ad4560871efdbd025cd7p-8116,
-                -0x1.87f8553cf8772fb6b78e7df3e3bap14523,
-                -0x1.14b6880f6678f86dfb543dde1c6ep2105,
-                0x1.9d2d4398414da9d857e76e8fd7ccp-13668,
-                0x1.a37f07af240ded458d103c022064p-1158,
-                0x1.425d53e6bd6070b847e5da1ed593p1394,
+                -0x1.05fe5035b415bdc5f8f9ae4c8815p455,
+                -0x1.fafde904d5cad82413daee7b88b8p-244,
+                0x1.53041230913c654449b12eb4d89bp2214,
+                -0x1.12d9f4b006063e9c0c7bdf19f61ap-2483,
+                0x1.aee9d4ba013f668773e4f0fd9002p5461,
+                0x1.a6776670633403e78a3cc6fcf8fdp8324,
+                -0x1.392aa756df3b993ea9db22def53ep15136,
+                0x1.823ef104549bdd4624961a44736cp-1097,
+            });
+            try testArgs(@Vector(9, f128), .{
+                -0x1.bde12739521a2bff70e510a6aca3p12384,
+                -0x1.0001c77658eb15cd7cb631b4836bp2147,
+                -0x1.f24c72b8cde26d95bd40f689a2aep-1416,
+                -0x1.61957e7946030c0432af0381f64ap-9492,
+                -0x1.631851492fa27fe7adc7441e0d21p16144,
+                -0x1.9dd39ece97e7a70c6d36e7e3026p-15761,
+                0x1.b044e441d7377755389d0bab3256p-1181,
+                0x1.5c11719701b7ff21384fbbf32922p-1671,
+                -0x1.1a2944a4dff2a4f96732bf03e8f7p-10567,
             });
         }
     };
@@ -4897,6 +5110,15 @@ test reduceMul {
     try test_reduce_mul.testIntVectors();
 }
 
+inline fn reduceAddOptimized(comptime Type: type, rhs: Type) @typeInfo(Type).vector.child { // Type must be a vector type; the result has its element type
+    @setFloatMode(.optimized); // switch this scope to optimized float mode before reducing
+    return @reduce(.Add, rhs); // add-reduction over all elements of rhs
+}
+test reduceAddOptimized {
+    const test_reduce_add_optimized = unary(reduceAddOptimized, .{ .compare = .approx }); // approximate comparison, presumably because optimized mode need not sum in a fixed order; confirm against unary
+    try test_reduce_add_optimized.testFloatVectors(); // only the float vector cases are exercised here
+}
+
 inline fn splat(comptime Type: type, rhs: Type) Type {
     return @splat(rhs[0]);
 }
test/behavior/floatop.zig
@@ -290,14 +290,21 @@ test "vector cmp f128" {
 }
 
 test "vector cmp f80/c_longdouble" { // runs testCmpVector for f80 and c_longdouble, each at runtime and at comptime
-    if (true) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_llvm and builtin.cpu.arch == .hexagon) return error.SkipZigTest; // skip list: each guard opts one backend/target combination out of this test
+    if (builtin.zig_backend == .stage2_llvm and builtin.cpu.arch == .powerpc64le) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_riscv64) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest; // stage2_x86_64 runs this only when the object format is ELF or Mach-O
 
     try testCmpVector(f80);
     try comptime testCmpVector(f80); // same checks again, evaluated at compile time
     try testCmpVector(c_longdouble);
     try comptime testCmpVector(c_longdouble);
 }
+
 fn testCmpVector(comptime T: type) !void {
+    @setEvalBranchQuota(2_000);
     var edges = [_]T{
         -math.inf(T),
         -math.floatMax(T),
test/cases/float_mode_optimized_reduce.zig
@@ -8,5 +8,5 @@ pub fn main() void {
 }
 
 // run
-// backend=llvm
-//
+// backend=stage2,llvm
+// target=x86_64-linux