Commit a4a1ebdeed

Jacob Young <jacobly0@users.noreply.github.com>
2025-05-24 17:37:13
x86_64: implement optimized float `@reduce(.Mul)`
1 parent 612f578
lib/std/zig/Zir.zig
@@ -2142,7 +2142,7 @@ pub const Inst = struct {
         ref_start_index = static_len,
         _,
 
-        pub const static_len = 105;
+        pub const static_len = 109;
 
         pub fn toRef(i: Index) Inst.Ref {
             return @enumFromInt(@intFromEnum(Index.ref_start_index) + @intFromEnum(i));
@@ -2255,11 +2255,15 @@ pub const Inst = struct {
         vector_1_u256_type,
         vector_4_f16_type,
         vector_8_f16_type,
+        vector_16_f16_type,
+        vector_32_f16_type,
         vector_2_f32_type,
         vector_4_f32_type,
         vector_8_f32_type,
+        vector_16_f32_type,
         vector_2_f64_type,
         vector_4_f64_type,
+        vector_8_f64_type,
         optional_noreturn_type,
         anyerror_void_error_union_type,
         adhoc_inferred_error_set_type,
src/arch/x86_64/CodeGen.zig
@@ -2389,7 +2389,7 @@ fn genBodyBlock(self: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
 }
 
 fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
-    @setEvalBranchQuota(23_600);
+    @setEvalBranchQuota(23_800);
     const pt = cg.pt;
     const zcu = pt.zcu;
     const ip = &zcu.intern_pool;
@@ -68441,7 +68441,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
                         .unused,
@@ -68465,7 +68465,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
                         .unused,
@@ -68489,7 +68489,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .unused,
@@ -68517,7 +68517,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .unused,
@@ -68546,7 +68546,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
@@ -68575,7 +68575,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
@@ -68642,7 +68642,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
                         .unused,
@@ -68668,7 +68668,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
                         .unused,
@@ -68694,7 +68694,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .unused,
@@ -68724,7 +68724,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .unused,
@@ -68755,7 +68755,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
@@ -68786,7 +68786,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
@@ -68856,7 +68856,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
                         .unused,
@@ -68882,7 +68882,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
                         .unused,
@@ -68908,7 +68908,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .unused,
@@ -68938,7 +68938,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .unused,
@@ -68969,7 +68969,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
@@ -69000,7 +69000,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
@@ -69070,7 +69070,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
@@ -69096,7 +69096,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
@@ -69122,7 +69122,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
                         .unused,
@@ -69149,7 +69149,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word, .smear = 8 } } },
-                        .{ .type = .vector_16_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .unused,
@@ -69178,7 +69178,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word, .smear = 8 } } },
-                        .{ .type = .vector_16_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
@@ -69247,7 +69247,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
@@ -69274,7 +69274,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
@@ -69301,7 +69301,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
                         .unused,
@@ -69329,7 +69329,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word, .smear = 8 } } },
-                        .{ .type = .vector_16_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .unused,
@@ -69360,7 +69360,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word, .smear = 8 } } },
-                        .{ .type = .vector_16_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
@@ -69432,7 +69432,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
@@ -69459,7 +69459,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
@@ -69486,7 +69486,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .unused,
@@ -69515,7 +69515,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word, .smear = 8 } } },
-                        .{ .type = .vector_16_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .unused,
@@ -69546,7 +69546,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word, .smear = 8 } } },
-                        .{ .type = .vector_16_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
@@ -69618,7 +69618,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .dword } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
@@ -69644,7 +69644,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .dword } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
@@ -69670,7 +69670,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
                         .unused,
@@ -69698,7 +69698,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .vector_32_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .dword, .smear = 8 } } },
-                        .{ .type = .vector_32_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_32_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_32_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } },
                         .{ .type = .vector_32_u8, .kind = .{ .rc = .sse } },
                         .unused,
@@ -69768,7 +69768,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .dword } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
@@ -69795,7 +69795,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .dword } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
@@ -69822,7 +69822,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
                         .unused,
@@ -69851,7 +69851,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .vector_32_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .dword, .smear = 8 } } },
-                        .{ .type = .vector_32_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_32_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_32_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } },
                         .{ .type = .vector_32_u8, .kind = .{ .rc = .sse } },
                         .unused,
@@ -69924,7 +69924,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .dword } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
@@ -69951,7 +69951,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .dword } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
@@ -69978,7 +69978,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
                         .unused,
@@ -70007,7 +70007,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .vector_32_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .dword, .smear = 8 } } },
-                        .{ .type = .vector_32_u8, .kind = .reverse_bits_mem },
+                        .{ .type = .vector_32_u8, .kind = .{ .bits_mem = .reverse } },
                         .{ .type = .vector_32_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } },
                         .{ .type = .vector_32_u8, .kind = .{ .rc = .sse } },
                         .unused,
@@ -70080,7 +70080,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .qword } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
@@ -70106,7 +70106,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .qword } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
@@ -70132,7 +70132,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .unused,
                         .unused,
                         .unused,
@@ -70203,7 +70203,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } },
                         .unused,
                         .unused,
@@ -70231,7 +70231,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .unused,
                         .unused,
                         .unused,
@@ -70305,7 +70305,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .qword } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
@@ -70332,7 +70332,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .qword } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
@@ -70359,7 +70359,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .unused,
                         .unused,
                         .unused,
@@ -70433,7 +70433,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .xword } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
@@ -70459,7 +70459,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .xword } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
@@ -70485,7 +70485,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .xword } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } },
@@ -70518,7 +70518,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .xword } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
@@ -70551,7 +70551,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .xword } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } },
@@ -70584,7 +70584,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_mut_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .xword } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
@@ -70617,7 +70617,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_32_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_32_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_32_u8, .kind = .{ .pshufb_bswap_mem = .{ .repeat = 2, .size = .xword } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .unused,
@@ -70644,7 +70644,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .none, .none } },
                     },
                     .extra_temps = .{
-                        .{ .type = .vector_32_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_32_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .xword } } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
@@ -70676,7 +70676,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                        .{ .type = .vector_32_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_32_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_32_u8, .kind = .{ .pshufb_bswap_mem = .{ .repeat = 2, .size = .xword } } },
                         .{ .type = .vector_32_u8, .kind = .{ .rc = .sse } },
                         .{ .type = .vector_32_u8, .kind = .{ .rc = .sse } },
@@ -70712,7 +70712,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                        .{ .type = .vector_16_u8, .kind = .forward_bits_mem },
+                        .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } },
                         .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .xword } } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                         .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
@@ -129477,8 +129477,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                 switch (reduce.operation) {
                     .And, .Or, .Xor => unreachable,
                     .Min, .Max => break :fallback try cg.airReduce(inst),
-                    .Add => {},
-                    .Mul => break :fallback try cg.airReduce(inst),
+                    .Add, .Mul => {},
                 }
                 var ops = try cg.tempsFromOperands(inst, .{reduce.operand});
                 var res: [1]Temp = undefined;
@@ -129894,7 +129893,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
                         } },
                     }, .{
-                        .required_features = .{ .avx512f, .f16c, null, null },
+                        .required_features = .{ .avx512f, null, null, null },
                         .dst_constraints = .{ .{ .float = .word }, .any },
                         .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .zword, .is = .word } }, .any, .any },
                         .patterns = &.{
@@ -129938,7 +129937,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
                         } },
                     }, .{
-                        .required_features = .{ .avx512f, .f16c, null, null },
+                        .required_features = .{ .avx512f, null, null, null },
                         .dst_constraints = .{ .{ .float = .word }, .any },
                         .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .zword, .is = .word } }, .any, .any },
                         .patterns = &.{
@@ -130051,7 +130050,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
                         } },
                     }, .{
-                        .required_features = .{ .avx512f, .f16c, null, null },
+                        .required_features = .{ .avx512f, null, null, null },
                         .dst_constraints = .{ .{ .float = .word }, .any },
                         .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .zword, .is = .word } }, .any, .any },
                         .patterns = &.{
@@ -130685,10 +130684,10 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
-                            .{ ._, .v_ps, .mova, .dst0y, .mem(.src0y), ._, ._ },
-                            .{ ._, .v_ps, .mova, .tmp2y, .memd(.src0y, 32), ._, ._ },
-                            .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .lea(.tmp0y), ._ },
-                            .{ ._, .v_ps, .@"and", .tmp2y, .tmp2y, .lead(.tmp0y, 32), ._ },
+                            .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
+                            .{ ._, .v_ps, .mova, .tmp2y, .lead(.tmp0y, 32), ._, ._ },
+                            .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .mem(.src0y), ._ },
+                            .{ ._, .v_ps, .@"and", .tmp2y, .tmp2y, .memd(.src0y, 32), ._ },
                             .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp2y, ._ },
                             .{ ._, .v_i128, .extract, .tmp2x, .dst0y, .ui(1), ._ },
                             .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ },
@@ -131117,33 +131116,1720 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .each = .{ .once = &.{
                             .{ ._, ._ps, .xor, .tmp0x, .tmp0x, ._, ._ },
                             .{ ._, ._ps, .movhl, .tmp0x, .src0x, ._, ._ },
-                            .{ ._, ._sd, .add, .dst0x, .tmp0x, ._, ._ },
-                        } },
-                    }, .{
-                        .required_features = .{ .avx, .fast_hops, null, null },
-                        .dst_constraints = .{ .{ .float = .qword }, .any },
-                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .to_sse, .none, .none } },
-                        },
-                        .extra_temps = .{
-                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                        },
-                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
-                        .each = .{ .once = &.{
-                            .{ ._, .vh_pd, .add, .tmp0x, .src0x, .src0x, ._ },
-                            .{ ._, .v_f128, .extract, .dst0x, .src0y, .ui(1), ._ },
-                            .{ ._, .v_sd, .add, .dst0x, .tmp0x, .dst0x, ._ },
+                            .{ ._, ._sd, .add, .dst0x, .tmp0x, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .vh_pd, .add, .tmp0x, .src0x, .src0x, ._ },
+                            .{ ._, .v_f128, .extract, .dst0x, .src0y, .ui(1), ._ },
+                            .{ ._, .v_sd, .add, .dst0x, .tmp0x, .dst0x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_f128, .extract, .tmp0x, .src0y, .ui(1), ._ },
+                            .{ ._, .v_pd, .add, .tmp0x, .src0x, .tmp0x, ._ },
+                            .{ ._, .v_ps, .movhl, .dst0x, .src0x, .src0x, ._ },
+                            .{ ._, .v_sd, .add, .dst0x, .tmp0x, .dst0x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .vh_pd, .add, .dst0y, .src0y, .src0y, ._ },
+                            .{ ._, .v_f128, .extract, .tmp0x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_sd, .add, .dst0x, .dst0x, .tmp0x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_f128, .extract, .tmp0x, .src0y, .ui(1), ._ },
+                            .{ ._, .v_pd, .add, .dst0x, .src0x, .tmp0x, ._ },
+                            .{ ._, .v_ps, .movhl, .tmp0x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_sd, .add, .dst0x, .dst0x, .tmp0x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx512f, null, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .zword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_64_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_pd, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
+                            .{ ._, .v_pd, .mova, .tmp2y, .lead(.tmp0y, 32), ._, ._ },
+                            .{ ._, .v_pd, .@"and", .dst0y, .dst0y, .mem(.src0y), ._ },
+                            .{ ._, .v_pd, .@"and", .tmp2y, .tmp2y, .memd(.src0y, 32), ._ },
+                            .{ ._, .v_pd, .add, .dst0y, .dst0y, .tmp2y, ._ },
+                            .{ ._, .v_i128, .extract, .tmp2x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                            .{ ._, .v_ps, .movhl, .tmp2x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_sd, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx512f, null, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .zword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_pd, .mova, .dst0y, .mem(.src0y), ._, ._ },
+                            .{ ._, .v_pd, .mova, .tmp0y, .memd(.src0y, 32), ._, ._ },
+                            .{ ._, .v_pd, .add, .dst0y, .dst0y, .tmp0y, ._ },
+                            .{ ._, .v_i128, .extract, .tmp0x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_ps, .movhl, .tmp0x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_sd, .add, .dst0x, .dst0x, .tmp0x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, .v_pd, .mova, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
+                            .{ .@"0:", .v_pd, .add, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(32), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .vh_pd, .add, .dst0y, .dst0y, .dst0y, ._ },
+                            .{ ._, .v_f128, .extract, .tmp1x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_sd, .add, .dst0x, .dst0x, .tmp1x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, .v_pd, .mova, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
+                            .{ .@"0:", .v_pd, .add, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(32), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .v_f128, .extract, .tmp1x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp1x, ._ },
+                            .{ ._, .v_ps, .movhl, .tmp1x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp1x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse3, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .xword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, ._pd, .mova, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
+                            .{ .@"0:", ._pd, .add, .dst0x, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .h_pd, .add, .dst0x, .dst0x, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse2, null, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .xword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, ._pd, .mova, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
+                            .{ .@"0:", ._pd, .add, .dst0x, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, ._ps, .xor, .tmp1x, .tmp1x, ._, ._ },
+                            .{ ._, ._ps, .movhl, .tmp1x, .dst0x, ._, ._ },
+                            .{ ._, ._sd, .add, .dst0x, .tmp1x, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx512f, null, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .zword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_64_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_4_f64, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_pd, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
+                            .{ ._, .v_pd, .mova, .tmp2y, .lead(.tmp0y, 32), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-128, .src0, .add_size), ._, ._ },
+                            .{ ._, .v_pd, .@"and", .tmp2y, .tmp2y, .memad(.src0y, .add_size, -32), ._ },
+                            .{ ._, .v_pd, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -64), ._ },
+                            .{ .@"0:", .v_pd, .add, .tmp2y, .tmp2y, .memid(.src0y, .tmp0, 32), ._ },
+                            .{ ._, .v_pd, .add, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(64), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .v_pd, .add, .dst0y, .dst0y, .tmp2y, ._ },
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                            .{ ._, .v_ps, .movhl, .tmp2x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_sd, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_pd, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-48, .src0, .add_size), ._, ._ },
+                            .{ ._, .v_pd, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -32), ._ },
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                            .{ .@"0:", .v_pd, .add, .dst0x, .dst0x, .memi(.src0x, .tmp0), ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .vh_pd, .add, .dst0x, .dst0x, .dst0x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_pd, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-48, .src0, .add_size), ._, ._ },
+                            .{ ._, .v_pd, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -32), ._ },
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                            .{ .@"0:", .v_pd, .add, .dst0x, .dst0x, .memi(.src0x, .tmp0), ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .v_ps, .movhl, .tmp2x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse3, .fast_hops, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .xword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_16_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, ._pd, .mova, .dst0x, .lea(.tmp0x), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_size), ._, ._ },
+                            .{ ._, ._pd, .@"and", .dst0x, .memad(.src0x, .add_size, -16), ._, ._ },
+                            .{ .@"0:", ._pd, .add, .dst0x, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .h_pd, .add, .dst0x, .dst0x, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse2, null, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .xword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_16_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, ._pd, .mova, .dst0x, .lea(.tmp0x), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_size), ._, ._ },
+                            .{ ._, ._pd, .@"and", .dst0x, .memad(.src0x, .add_size, -16), ._, ._ },
+                            .{ .@"0:", ._pd, .add, .dst0x, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, ._ps, .xor, .tmp2x, .tmp2x, ._, ._ },
+                            .{ ._, ._ps, .movhl, .tmp2x, .dst0x, ._, ._ },
+                            .{ ._, ._pd, .add, .dst0x, .tmp2x, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .x87, null, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .qword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .f64, .kind = .{ .reg = .st7 } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .mem, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0p, .sia(-16, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, .f_, .ld, .memad(.src0q, .add_unaligned_size, -8), ._, ._, ._ },
+                            .{ .@"0:", .f_, .add, .memi(.src0q, .tmp0), ._, ._, ._ },
+                            .{ ._, ._, .sub, .tmp0p, .si(8), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .f_p, .st, .dst0q, ._, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .x87, null, null, null },
+                        .dst_constraints = .{ .{ .float = .tbyte }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .qword, .is = .tbyte } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .f80, .kind = .{ .reg = .st6 } },
+                            .{ .type = .f80, .kind = .{ .reg = .st7 } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .mem, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0p, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, .f_, .ld, .memad(.src0t, .add_unaligned_size, -16), ._, ._, ._ },
+                            .{ .@"0:", .f_, .ld, .memi(.src0t, .tmp0), ._, ._, ._ },
+                            .{ ._, .f_p, .add, ._, ._, ._, ._ },
+                            .{ ._, ._, .sub, .tmp0p, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .f_p, .st, .dst0t, ._, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .xword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .call_frame = .{ .alignment = .@"16" },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .f128, .kind = .{ .reg = .xmm1 } },
+                            .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__addtf3" } } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .reg = .xmm0 }, .unused },
+                        .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, .v_dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
+                            .{ .@"0:", .v_dqa, .mov, .tmp1x, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, ._, .call, .tmp2d, ._, ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse2, null, null, null },
+                        .dst_constraints = .{ .{ .float = .xword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .call_frame = .{ .alignment = .@"16" },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .f128, .kind = .{ .reg = .xmm1 } },
+                            .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__addtf3" } } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .reg = .xmm0 }, .unused },
+                        .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, ._dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
+                            .{ .@"0:", ._dqa, .mov, .tmp1x, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, ._, .call, .tmp2d, ._, ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse, null, null, null },
+                        .dst_constraints = .{ .{ .float = .xword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .call_frame = .{ .alignment = .@"16" },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .f128, .kind = .{ .reg = .xmm1 } },
+                            .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__addtf3" } } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .reg = .xmm0 }, .unused },
+                        .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, ._ps, .mova, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
+                            .{ .@"0:", ._ps, .mova, .tmp1x, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, ._, .call, .tmp2d, ._, ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                        } },
+                    } },
+                    .Mul => comptime &.{ .{
+                        .required_features = .{ .f16c, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .dword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .mem, .none, .none } },
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_ps, .cvtph2, .dst0x, .src0q, ._, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .qword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .mem, .none, .none } },
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_ps, .cvtph2, .dst0x, .src0q, ._, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b11_10_11_10) },
+                            .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .qword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .mem, .none, .none } },
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_ps, .cvtph2, .dst0x, .src0q, ._, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b11_10_11_10) },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .xword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_16_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_8_f16, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } },
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_ps, .@"and", .dst0x, .src0x, .lea(.tmp0x), ._ },
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ },
+                            .{ ._, .v_ps, .@"or", .dst0x, .src0x, .lea(.tmp0x), ._ },
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ },
+                            .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b11_10_11_10) },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .xword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .mem, .none, .none } },
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .src0x, ._, ._ },
+                            .{ ._, .v_f128, .extract, .tmp0x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b11_10_11_10) },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .yword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_16_f16, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } },
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_ps, .@"and", .dst0y, .src0y, .lea(.tmp0y), ._ },
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ },
+                            .{ ._, .v_ps, .@"or", .dst0y, .dst0y, .lea(.tmp0y), ._ },
+                            .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ },
+                            .{ ._, .v_ps, .cvtph2, .tmp3y, .tmp3x, ._, ._ },
+                            .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp3y, ._ },
+                            .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b11_10_11_10) },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .yword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_f128, .extract, .tmp0x, .src0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .src0x, ._, ._ },
+                            .{ ._, .v_ps, .cvtph2, .tmp0y, .tmp0x, ._, ._ },
+                            .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp0y, ._ },
+                            .{ ._, .v_f128, .extract, .tmp0x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b11_10_11_10) },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx512f, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .zword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_64_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_32_f16, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } },
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
+                            .{ ._, .v_ps, .mova, .tmp3y, .lead(.tmp0y, 32), ._, ._ },
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ },
+                            .{ ._, .v_ps, .@"and", .tmp3y, .tmp3y, .memd(.src0y, 32), ._ },
+                            .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .mem(.src0y), ._ },
+                            .{ ._, .v_ps, .@"or", .dst0y, .dst0y, .lea(.tmp0y), ._ },
+                            .{ ._, .v_ps, .@"or", .tmp3y, .tmp3y, .lead(.tmp0y, 32), ._ },
+                            .{ ._, .v_f128, .extract, .tmp4x, .tmp3y, .ui(1), ._ },
+                            .{ ._, .v_ps, .cvtph2, .tmp3y, .tmp3x, ._, ._ },
+                            .{ ._, .v_ps, .cvtph2, .tmp4y, .tmp4x, ._, ._ },
+                            .{ ._, .v_ps, .mul, .tmp3y, .tmp3y, .tmp4y, ._ },
+                            .{ ._, .v_f128, .extract, .tmp4x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ },
+                            .{ ._, .v_ps, .cvtph2, .tmp4y, .tmp4x, ._, ._ },
+                            .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp4y, ._ },
+                            .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp3y, ._ },
+                            .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b11_10_11_10) },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx512f, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .zword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_ps, .mova, .tmp1y, .memd(.src0y, 32), ._, ._ },
+                            .{ ._, .v_ps, .mova, .dst0y, .mem(.src0y), ._, ._ },
+                            .{ ._, .v_f128, .extract, .tmp2x, .tmp1y, .ui(1), ._ },
+                            .{ ._, .v_ps, .cvtph2, .tmp1y, .tmp1x, ._, ._ },
+                            .{ ._, .v_ps, .cvtph2, .tmp2y, .tmp2x, ._, ._ },
+                            .{ ._, .v_ps, .mul, .tmp1y, .tmp1y, .tmp2y, ._ },
+                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ },
+                            .{ ._, .v_ps, .cvtph2, .tmp2y, .tmp2x, ._, ._ },
+                            .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp2y, ._ },
+                            .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp1y, ._ },
+                            .{ ._, .v_f128, .extract, .tmp1x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp1x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp1x, .dst0x, .dst0x, .ui(0b11_10_11_10) },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp1x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp1x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp1x, ._ },
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .xword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
+                            .{ .@"0:", .v_ps, .cvtph2, .tmp1y, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp1y, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .v_f128, .extract, .tmp1x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp1x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp1x, .dst0x, .dst0x, .ui(0b11_10_11_10) },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp1x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp1x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp1x, ._ },
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx512f, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .zword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_64_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_32_f16, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } },
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
+                            .{ ._, .v_ps, .mova, .tmp3y, .lead(.tmp0y, 32), ._, ._ },
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ },
+                            .{ ._, .v_ps, .@"and", .tmp3y, .tmp3y, .memad(.src0y, .add_size, -32), ._ },
+                            .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -64), ._ },
+                            .{ ._, .v_ps, .@"or", .dst0y, .dst0y, .lea(.tmp0y), ._ },
+                            .{ ._, .v_ps, .@"or", .tmp3y, .tmp3y, .lead(.tmp0y, 32), ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-80, .src0, .add_size), ._, ._ },
+                            .{ ._, .v_f128, .extract, .tmp4x, .tmp3y, .ui(1), ._ },
+                            .{ ._, .v_ps, .cvtph2, .tmp3y, .tmp3x, ._, ._ },
+                            .{ ._, .v_ps, .cvtph2, .tmp4y, .tmp4x, ._, ._ },
+                            .{ ._, .v_ps, .mul, .tmp3y, .tmp3y, .tmp4y, ._ },
+                            .{ ._, .v_f128, .extract, .tmp4x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ },
+                            .{ ._, .v_ps, .cvtph2, .tmp4y, .tmp4x, ._, ._ },
+                            .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp4y, ._ },
+                            .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp3y, ._ },
+                            .{ .@"0:", .v_ps, .cvtph2, .tmp3y, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp3y, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b11_10_11_10) },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .f16c, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .yword, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_16_f16, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } },
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ },
+                            .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -32), ._ },
+                            .{ ._, .v_ps, .@"or", .dst0y, .dst0y, .lea(.tmp0y), ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-48, .src0, .add_size), ._, ._ },
+                            .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ },
+                            .{ ._, .v_ps, .cvtph2, .tmp3y, .tmp3x, ._, ._ },
+                            .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp3y, ._ },
+                            .{ .@"0:", .v_ps, .cvtph2, .tmp3y, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp3y, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b11_10_11_10) },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .word, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .call_frame = .{ .alignment = .@"16" },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .f16, .kind = .{ .reg = .xmm1 } },
+                            .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__mulhf3" } } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .reg = .xmm0 }, .unused },
+                        .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                        .each = .{ .once = &.{
+                            .{ ._, .vp_, .xor, .dst0x, .dst0x, .dst0x, ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-4, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, .vp_w, .insr, .dst0x, .dst0x, .memad(.src0w, .add_unaligned_size, -2), .ui(0) },
+                            .{ .@"0:", .vp_, .xor, .tmp1x, .tmp1x, .tmp1x, ._ },
+                            .{ ._, .vp_w, .insr, .tmp1x, .tmp1x, .memi(.src0w, .tmp0), .ui(0) },
+                            .{ ._, ._, .call, .tmp2d, ._, ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(2), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse2, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .word, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .call_frame = .{ .alignment = .@"16" },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .f16, .kind = .{ .reg = .xmm1 } },
+                            .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__mulhf3" } } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .reg = .xmm0 }, .unused },
+                        .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                        .each = .{ .once = &.{
+                            .{ ._, .p_, .xor, .dst0x, .dst0x, ._, ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-4, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, .p_w, .insr, .dst0x, .memad(.src0w, .add_unaligned_size, -2), .ui(0), ._ },
+                            .{ .@"0:", .p_, .xor, .tmp1x, .tmp1x, ._, ._ },
+                            .{ ._, .p_w, .insr, .tmp1x, .memi(.src0w, .tmp0), .ui(0), ._ },
+                            .{ ._, ._, .call, .tmp2d, ._, ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(2), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse, null, null, null },
+                        .dst_constraints = .{ .{ .float = .word }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .word, .is = .word } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .call_frame = .{ .alignment = .@"16" },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .f16, .kind = .{ .reg = .ax } },
+                            .{ .type = .f32, .kind = .mem },
+                            .{ .type = .f16, .kind = .{ .reg = .xmm1 } },
+                            .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__mulhf3" } } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .reg = .xmm0 }, .unused },
+                        .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                        .each = .{ .once = &.{
+                            .{ ._, ._ps, .xor, .dst0x, .dst0x, ._, ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-4, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, ._, .movzx, .tmp1d, .memad(.src0w, .add_unaligned_size, -2), ._, ._ },
+                            .{ ._, ._, .mov, .mem(.tmp2d), .tmp1d, ._, ._ },
+                            .{ ._, ._ss, .mov, .dst0x, .mem(.tmp2d), ._, ._ },
+                            .{ .@"0:", ._ps, .xor, .tmp3x, .tmp3x, ._, ._ },
+                            .{ ._, ._ss, .mov, .tmp3x, .memi(.src0d, .tmp0), ._, ._ },
+                            .{ ._, ._, .call, .tmp4d, ._, ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(2), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .qword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_ps, .shuf, .tmp0x, .src0x, .src0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .src0x, .tmp0x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .qword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mut_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, ._ps, .mova, .tmp0x, .src0x, ._, ._ },
+                            .{ ._, ._ps, .shuf, .tmp0x, .tmp0x, .ui(0b01_01_01_01), ._ },
+                            .{ ._, ._ss, .mul, .dst0x, .tmp0x, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_ps, .movhl, .tmp0x, .src0x, .src0x, ._ },
+                            .{ ._, .v_ps, .mul, .tmp0x, .src0x, .tmp0x, ._ },
+                            .{ ._, .v_ps, .shuf, .dst0x, .src0x, .src0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .tmp0x, .dst0x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mut_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, ._ps, .xor, .tmp0x, .tmp0x, ._, ._ },
+                            .{ ._, ._ps, .movhl, .tmp0x, .src0x, ._, ._ },
+                            .{ ._, ._ss, .mul, .tmp0x, .src0x, ._, ._ },
+                            .{ ._, ._ps, .shuf, .dst0x, .src0x, .ui(0b01_01_01_01), ._ },
+                            .{ ._, ._ss, .mul, .dst0x, .tmp0x, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_ps, .movhl, .tmp0x, .src0x, .src0x, ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .src0x, .tmp0x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp0x, .src0x, .src0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .src0x, .tmp0x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mut_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, ._ps, .xor, .tmp0x, .tmp0x, ._, ._ },
+                            .{ ._, ._ps, .movhl, .tmp0x, .src0x, ._, ._ },
+                            .{ ._, ._ps, .mul, .dst0x, .tmp0x, ._, ._ },
+                            .{ ._, ._ps, .mova, .tmp0x, .dst0x, ._, ._ },
+                            .{ ._, ._ps, .shuf, .tmp0x, .tmp0x, .ui(0b01_01_01_01), ._ },
+                            .{ ._, ._ss, .mul, .dst0x, .tmp0x, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .yword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_8_f32, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } },
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_ps, .@"and", .dst0y, .src0y, .lea(.tmp0y), ._ },
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ },
+                            .{ ._, .v_ps, .@"or", .dst0y, .src0y, .lea(.tmp0y), ._ },
+                            .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_ps, .movhl, .tmp3x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .yword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_f128, .extract, .tmp1x, .src0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .src0x, .tmp1x, ._ },
+                            .{ ._, .v_ps, .movhl, .tmp1x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp1x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp1x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp1x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx512f, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .zword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_64_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_16_f32, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } },
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
+                            .{ ._, .v_ps, .mova, .tmp3y, .lead(.tmp0y, 32), ._, ._ },
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ },
+                            .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .mem(.src0y), ._ },
+                            .{ ._, .v_ps, .@"and", .tmp3y, .tmp3y, .memd(.src0y, 32), ._ },
+                            .{ ._, .v_ps, .@"or", .dst0y, .dst0y, .lea(.tmp0y), ._ },
+                            .{ ._, .v_ps, .@"or", .tmp3y, .tmp3y, .lead(.tmp0y, 32), ._ },
+                            .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp3y, ._ },
+                            .{ ._, .v_i128, .extract, .tmp3x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_ps, .movhl, .tmp3x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx512f, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .zword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_ps, .mova, .dst0y, .mem(.src0y), ._, ._ },
+                            .{ ._, .v_ps, .mova, .tmp0y, .memd(.src0y, 32), ._, ._ },
+                            .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp0y, ._ },
+                            .{ ._, .v_i128, .extract, .tmp0x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_ps, .movhl, .tmp0x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp0x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .yword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, .v_ps, .mova, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
+                            .{ .@"0:", .v_ps, .mul, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(32), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .v_f128, .extract, .tmp1x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp1x, ._ },
+                            .{ ._, .v_ps, .movhl, .tmp1x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp1x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp1x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp1x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
+                            .{ ._, ._ps, .mova, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
+                            .{ .@"0:", ._ps, .mul, .dst0x, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, ._ps, .xor, .tmp1x, .tmp1x, ._, ._ },
+                            .{ ._, ._ps, .movhl, .tmp1x, .dst0x, ._, ._ },
+                            .{ ._, ._ps, .mul, .dst0x, .tmp1x, ._, ._ },
+                            .{ ._, ._ps, .mova, .tmp1x, .dst0x, ._, ._ },
+                            .{ ._, ._ps, .shuf, .tmp1x, .tmp1x, .ui(0b01_01_01_01), ._ },
+                            .{ ._, ._ss, .mul, .dst0x, .tmp1x, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx512f, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .zword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_64_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_16_f32, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } },
+                            .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
+                            .{ ._, .v_ps, .mova, .tmp3y, .lead(.tmp0y, 32), ._, ._ },
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ },
+                            .{ ._, .v_ps, .@"and", .tmp3y, .tmp3y, .memad(.src0y, .add_size, -32), ._ },
+                            .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -64), ._ },
+                            .{ ._, .v_ps, .@"or", .dst0y, .dst0y, .lea(.tmp0y), ._ },
+                            .{ ._, .v_ps, .@"or", .tmp3y, .tmp3y, .lead(.tmp0y, 32), ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-128, .src0, .add_size), ._, ._ },
+                            .{ .@"0:", .v_ps, .mul, .tmp3y, .tmp3y, .memid(.src0y, .tmp0, 32), ._ },
+                            .{ ._, .v_ps, .mul, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(64), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp3y, ._ },
+                            .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_ps, .movhl, .tmp3x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .yword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_8_f32, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } },
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ },
+                            .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -32), ._ },
+                            .{ ._, .v_ps, .@"or", .dst0y, .dst0y, .lea(.tmp0y), ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-48, .src0, .add_size), ._, ._ },
+                            .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ .@"0:", .v_ps, .mul, .dst0x, .dst0x, .memi(.src0x, .tmp0), ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, .v_ps, .movhl, .tmp3x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b01_01_01_01) },
+                            .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse, null, null, null },
+                        .dst_constraints = .{ .{ .float = .dword }, .any },
+                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mem, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                            .{ .type = .vector_16_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_4_f32, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } },
+                            .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .rc = .sse }, .unused },
+                        .clobbers = .{ .eflags = true },
+                        .each = .{ .once = &.{
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                            .{ ._, ._ps, .mova, .dst0x, .lea(.tmp0x), ._, ._ },
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ },
+                            .{ ._, ._ps, .@"and", .dst0x, .memad(.src0x, .add_size, -16), ._, ._ },
+                            .{ ._, ._ps, .@"or", .dst0x, .lea(.tmp0x), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_size), ._, ._ },
+                            .{ .@"0:", ._ps, .mul, .dst0x, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
+                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
+                            .{ ._, ._ps, .xor, .tmp3x, .tmp3x, ._, ._ },
+                            .{ ._, ._ps, .movhl, .tmp3x, .dst0x, ._, ._ },
+                            .{ ._, ._ps, .mul, .dst0x, .tmp3x, ._, ._ },
+                            .{ ._, ._ps, .mova, .tmp3x, .dst0x, ._, ._ },
+                            .{ ._, ._ps, .shuf, .tmp3x, .tmp3x, .ui(0b01_01_01_01), ._ },
+                            .{ ._, ._ss, .mul, .dst0x, .tmp3x, ._, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .avx, null, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .xword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, .v_ps, .movhl, .tmp0x, .src0x, .src0x, ._ },
+                            .{ ._, .v_sd, .mul, .dst0x, .src0x, .tmp0x, ._ },
+                        } },
+                    }, .{
+                        .required_features = .{ .sse2, null, null, null },
+                        .dst_constraints = .{ .{ .float = .qword }, .any },
+                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .xword, .is = .qword } }, .any, .any },
+                        .patterns = &.{
+                            .{ .src = .{ .to_mut_sse, .none, .none } },
+                        },
+                        .extra_temps = .{
+                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                            .unused,
+                        },
+                        .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                        .each = .{ .once = &.{
+                            .{ ._, ._ps, .xor, .tmp0x, .tmp0x, ._, ._ },
+                            .{ ._, ._ps, .movhl, .tmp0x, .src0x, ._, ._ },
+                            .{ ._, ._sd, .mul, .dst0x, .tmp0x, ._, ._ },
                         } },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
@@ -131168,35 +132854,9 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
                         .each = .{ .once = &.{
                             .{ ._, .v_f128, .extract, .tmp0x, .src0y, .ui(1), ._ },
-                            .{ ._, .v_pd, .add, .tmp0x, .src0x, .tmp0x, ._ },
+                            .{ ._, .v_pd, .mul, .tmp0x, .src0x, .tmp0x, ._ },
                             .{ ._, .v_ps, .movhl, .dst0x, .src0x, .src0x, ._ },
-                            .{ ._, .v_sd, .add, .dst0x, .tmp0x, .dst0x, ._ },
-                        } },
-                    }, .{
-                        .required_features = .{ .avx, .fast_hops, null, null },
-                        .dst_constraints = .{ .{ .float = .qword }, .any },
-                        .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .to_sse, .none, .none } },
-                        },
-                        .extra_temps = .{
-                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                        },
-                        .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
-                        .each = .{ .once = &.{
-                            .{ ._, .vh_pd, .add, .dst0y, .src0y, .src0y, ._ },
-                            .{ ._, .v_f128, .extract, .tmp0x, .dst0y, .ui(1), ._ },
-                            .{ ._, .v_sd, .add, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_sd, .mul, .dst0x, .tmp0x, .dst0x, ._ },
                         } },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
@@ -131221,9 +132881,9 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused },
                         .each = .{ .once = &.{
                             .{ ._, .v_f128, .extract, .tmp0x, .src0y, .ui(1), ._ },
-                            .{ ._, .v_pd, .add, .dst0x, .src0x, .tmp0x, ._ },
+                            .{ ._, .v_pd, .mul, .dst0x, .src0x, .tmp0x, ._ },
                             .{ ._, .v_ps, .movhl, .tmp0x, .dst0x, .dst0x, ._ },
-                            .{ ._, .v_sd, .add, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_sd, .mul, .dst0x, .dst0x, .tmp0x, ._ },
                         } },
                     }, .{
                         .required_features = .{ .avx512f, null, null, null },
@@ -131235,6 +132895,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .extra_temps = .{
                             .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                             .{ .type = .vector_64_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_8_f64, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } },
                             .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
                             .unused,
                             .unused,
@@ -131243,20 +132904,22 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                             .unused,
                             .unused,
-                            .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
-                            .{ ._, .v_pd, .mova, .dst0y, .mem(.src0y), ._, ._ },
-                            .{ ._, .v_pd, .mova, .tmp2y, .memd(.src0y, 32), ._, ._ },
-                            .{ ._, .v_pd, .@"and", .dst0y, .dst0y, .lea(.tmp0y), ._ },
-                            .{ ._, .v_pd, .@"and", .tmp2y, .tmp2y, .lead(.tmp0y, 32), ._ },
-                            .{ ._, .v_pd, .add, .dst0y, .dst0y, .tmp2y, ._ },
-                            .{ ._, .v_i128, .extract, .tmp2x, .dst0y, .ui(1), ._ },
-                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp2x, ._ },
-                            .{ ._, .v_ps, .movhl, .tmp2x, .dst0x, .dst0x, ._ },
-                            .{ ._, .v_sd, .add, .dst0x, .dst0x, .tmp2x, ._ },
+                            .{ ._, .v_pd, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
+                            .{ ._, .v_pd, .mova, .tmp3y, .lead(.tmp0y, 32), ._, ._ },
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ },
+                            .{ ._, .v_pd, .@"and", .dst0y, .dst0y, .mem(.src0y), ._ },
+                            .{ ._, .v_pd, .@"and", .tmp3y, .tmp3y, .memd(.src0y, 32), ._ },
+                            .{ ._, .v_pd, .@"or", .dst0y, .dst0y, .lea(.tmp0y), ._ },
+                            .{ ._, .v_pd, .@"or", .tmp3y, .tmp3y, .lead(.tmp0y, 32), ._ },
+                            .{ ._, .v_pd, .mul, .dst0y, .dst0y, .tmp3y, ._ },
+                            .{ ._, .v_i128, .extract, .tmp3x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_pd, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_ps, .movhl, .tmp3x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_sd, .mul, .dst0x, .dst0x, .tmp3x, ._ },
                         } },
                     }, .{
                         .required_features = .{ .avx512f, null, null, null },
@@ -131282,43 +132945,11 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .each = .{ .once = &.{
                             .{ ._, .v_pd, .mova, .dst0y, .mem(.src0y), ._, ._ },
                             .{ ._, .v_pd, .mova, .tmp0y, .memd(.src0y, 32), ._, ._ },
-                            .{ ._, .v_pd, .add, .dst0y, .dst0y, .tmp0y, ._ },
+                            .{ ._, .v_pd, .mul, .dst0y, .dst0y, .tmp0y, ._ },
                             .{ ._, .v_i128, .extract, .tmp0x, .dst0y, .ui(1), ._ },
-                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp0x, ._ },
+                            .{ ._, .v_pd, .mul, .dst0x, .dst0x, .tmp0x, ._ },
                             .{ ._, .v_ps, .movhl, .tmp0x, .dst0x, .dst0x, ._ },
-                            .{ ._, .v_sd, .add, .dst0x, .dst0x, .tmp0x, ._ },
-                        } },
-                    }, .{
-                        .required_features = .{ .avx, .fast_hops, null, null },
-                        .dst_constraints = .{ .{ .float = .qword }, .any },
-                        .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .to_mem, .none, .none } },
-                        },
-                        .extra_temps = .{
-                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                        },
-                        .dst_temps = .{ .{ .rc = .sse }, .unused },
-                        .clobbers = .{ .eflags = true },
-                        .each = .{ .once = &.{
-                            .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
-                            .{ ._, .v_pd, .mova, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
-                            .{ .@"0:", .v_pd, .add, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ },
-                            .{ ._, ._, .sub, .tmp0d, .si(32), ._, ._ },
-                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
-                            .{ ._, .vh_pd, .add, .dst0y, .dst0y, .dst0y, ._ },
-                            .{ ._, .v_f128, .extract, .tmp1x, .dst0y, .ui(1), ._ },
-                            .{ ._, .v_sd, .add, .dst0x, .dst0x, .tmp1x, ._ },
+                            .{ ._, .v_sd, .mul, .dst0x, .dst0x, .tmp0x, ._ },
                         } },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
@@ -131345,43 +132976,13 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, .v_pd, .mova, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ },
-                            .{ .@"0:", .v_pd, .add, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ },
+                            .{ .@"0:", .v_pd, .mul, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ },
                             .{ ._, ._, .sub, .tmp0d, .si(32), ._, ._ },
                             .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
                             .{ ._, .v_f128, .extract, .tmp1x, .dst0y, .ui(1), ._ },
-                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp1x, ._ },
+                            .{ ._, .v_pd, .mul, .dst0x, .dst0x, .tmp1x, ._ },
                             .{ ._, .v_ps, .movhl, .tmp1x, .dst0x, .dst0x, ._ },
-                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp1x, ._ },
-                        } },
-                    }, .{
-                        .required_features = .{ .sse3, .fast_hops, null, null },
-                        .dst_constraints = .{ .{ .float = .qword }, .any },
-                        .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .xword, .is = .qword } }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .to_mem, .none, .none } },
-                        },
-                        .extra_temps = .{
-                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                        },
-                        .dst_temps = .{ .{ .rc = .sse }, .unused },
-                        .clobbers = .{ .eflags = true },
-                        .each = .{ .once = &.{
-                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
-                            .{ ._, ._pd, .mova, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
-                            .{ .@"0:", ._pd, .add, .dst0x, .memi(.src0x, .tmp0), ._, ._ },
-                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
-                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
-                            .{ ._, .h_pd, .add, .dst0x, .dst0x, ._, ._ },
+                            .{ ._, .v_pd, .mul, .dst0x, .dst0x, .tmp1x, ._ },
                         } },
                     }, .{
                         .required_features = .{ .sse2, null, null, null },
@@ -131408,12 +133009,12 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, ._pd, .mova, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ },
-                            .{ .@"0:", ._pd, .add, .dst0x, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ .@"0:", ._pd, .mul, .dst0x, .memi(.src0x, .tmp0), ._, ._ },
                             .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
                             .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
                             .{ ._, ._ps, .xor, .tmp1x, .tmp1x, ._, ._ },
                             .{ ._, ._ps, .movhl, .tmp1x, .dst0x, ._, ._ },
-                            .{ ._, ._sd, .add, .dst0x, .tmp1x, ._, ._ },
+                            .{ ._, ._sd, .mul, .dst0x, .tmp1x, ._, ._ },
                         } },
                     }, .{
                         .required_features = .{ .avx512f, null, null, null },
@@ -131425,6 +133026,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .extra_temps = .{
                             .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
                             .{ .type = .vector_64_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_8_f64, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } },
                             .{ .type = .vector_4_f64, .kind = .{ .rc = .sse } },
                             .unused,
                             .unused,
@@ -131433,60 +133035,28 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                             .unused,
                             .unused,
-                            .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
                         .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_pd, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
-                            .{ ._, .v_pd, .mova, .tmp2y, .lead(.tmp0y, 32), ._, ._ },
-                            .{ ._, ._, .mov, .tmp0d, .sia(-128, .src0, .add_size), ._, ._ },
-                            .{ ._, .v_pd, .@"and", .tmp2y, .tmp2y, .memad(.src0y, .add_size, -32), ._ },
+                            .{ ._, .v_pd, .mova, .tmp3y, .lead(.tmp0y, 32), ._, ._ },
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ },
+                            .{ ._, .v_pd, .@"and", .tmp3y, .tmp3y, .memad(.src0y, .add_size, -32), ._ },
                             .{ ._, .v_pd, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -64), ._ },
-                            .{ .@"0:", .v_pd, .add, .tmp2y, .tmp2y, .memid(.src0y, .tmp0, 32), ._ },
-                            .{ ._, .v_pd, .add, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ },
+                            .{ ._, .v_pd, .@"or", .dst0y, .dst0y, .lea(.tmp0y), ._ },
+                            .{ ._, .v_pd, .@"or", .tmp3y, .tmp3y, .lead(.tmp0y, 32), ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-128, .src0, .add_size), ._, ._ },
+                            .{ .@"0:", .v_pd, .mul, .tmp3y, .tmp3y, .memid(.src0y, .tmp0, 32), ._ },
+                            .{ ._, .v_pd, .mul, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ },
                             .{ ._, ._, .sub, .tmp0d, .si(64), ._, ._ },
                             .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
-                            .{ ._, .v_pd, .add, .dst0y, .dst0y, .tmp2y, ._ },
-                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ },
-                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp2x, ._ },
-                            .{ ._, .v_ps, .movhl, .tmp2x, .dst0x, .dst0x, ._ },
-                            .{ ._, .v_sd, .add, .dst0x, .dst0x, .tmp2x, ._ },
-                        } },
-                    }, .{
-                        .required_features = .{ .avx, .fast_hops, null, null },
-                        .dst_constraints = .{ .{ .float = .qword }, .any },
-                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .to_mem, .none, .none } },
-                        },
-                        .extra_temps = .{
-                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                            .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
-                            .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                        },
-                        .dst_temps = .{ .{ .rc = .sse }, .unused },
-                        .clobbers = .{ .eflags = true },
-                        .each = .{ .once = &.{
-                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
-                            .{ ._, .v_pd, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
-                            .{ ._, ._, .mov, .tmp0d, .sia(-48, .src0, .add_size), ._, ._ },
-                            .{ ._, .v_pd, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -32), ._ },
-                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ },
-                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp2x, ._ },
-                            .{ .@"0:", .v_pd, .add, .dst0x, .dst0x, .memi(.src0x, .tmp0), ._ },
-                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
-                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
-                            .{ ._, .vh_pd, .add, .dst0x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_pd, .mul, .dst0y, .dst0y, .tmp3y, ._ },
+                            .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_pd, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ ._, .v_ps, .movhl, .tmp3x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_sd, .mul, .dst0x, .dst0x, .tmp3x, ._ },
                         } },
                     }, .{
                         .required_features = .{ .avx, null, null, null },
@@ -131498,6 +133068,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .extra_temps = .{
                             .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
                             .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_4_f64, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } },
                             .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
                             .unused,
                             .unused,
@@ -131506,54 +133077,23 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                             .unused,
                             .unused,
-                            .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
                         .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, .v_pd, .mova, .dst0y, .lea(.tmp0y), ._, ._ },
-                            .{ ._, ._, .mov, .tmp0d, .sia(-48, .src0, .add_size), ._, ._ },
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ },
                             .{ ._, .v_pd, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -32), ._ },
-                            .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ },
-                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp2x, ._ },
-                            .{ .@"0:", .v_pd, .add, .dst0x, .dst0x, .memi(.src0x, .tmp0), ._ },
-                            .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
-                            .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
-                            .{ ._, .v_ps, .movhl, .tmp2x, .dst0x, .dst0x, ._ },
-                            .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp2x, ._ },
-                        } },
-                    }, .{
-                        .required_features = .{ .sse3, .fast_hops, null, null },
-                        .dst_constraints = .{ .{ .float = .qword }, .any },
-                        .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .xword, .is = .qword } }, .any, .any },
-                        .patterns = &.{
-                            .{ .src = .{ .to_mem, .none, .none } },
-                        },
-                        .extra_temps = .{
-                            .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                            .{ .type = .vector_16_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                            .unused,
-                        },
-                        .dst_temps = .{ .{ .rc = .sse }, .unused },
-                        .clobbers = .{ .eflags = true },
-                        .each = .{ .once = &.{
-                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
-                            .{ ._, ._pd, .mova, .dst0x, .lea(.tmp0x), ._, ._ },
-                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_size), ._, ._ },
-                            .{ ._, ._pd, .@"and", .dst0x, .memad(.src0x, .add_size, -16), ._, ._ },
-                            .{ .@"0:", ._pd, .add, .dst0x, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, .v_pd, .@"or", .dst0y, .dst0y, .lea(.tmp0y), ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-48, .src0, .add_size), ._, ._ },
+                            .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ },
+                            .{ ._, .v_pd, .mul, .dst0x, .dst0x, .tmp3x, ._ },
+                            .{ .@"0:", .v_pd, .mul, .dst0x, .dst0x, .memi(.src0x, .tmp0), ._ },
                             .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
                             .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
-                            .{ ._, .h_pd, .add, .dst0x, .dst0x, ._, ._ },
+                            .{ ._, .v_ps, .movhl, .tmp3x, .dst0x, .dst0x, ._ },
+                            .{ ._, .v_pd, .mul, .dst0x, .dst0x, .tmp3x, ._ },
                         } },
                     }, .{
                         .required_features = .{ .sse2, null, null, null },
@@ -131565,6 +133105,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .extra_temps = .{
                             .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
                             .{ .type = .vector_16_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } },
+                            .{ .type = .vector_2_f64, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } },
                             .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
                             .unused,
                             .unused,
@@ -131573,21 +133114,22 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .unused,
                             .unused,
                             .unused,
-                            .unused,
                         },
                         .dst_temps = .{ .{ .rc = .sse }, .unused },
                         .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
                             .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
                             .{ ._, ._pd, .mova, .dst0x, .lea(.tmp0x), ._, ._ },
-                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_size), ._, ._ },
+                            .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ },
                             .{ ._, ._pd, .@"and", .dst0x, .memad(.src0x, .add_size, -16), ._, ._ },
-                            .{ .@"0:", ._pd, .add, .dst0x, .memi(.src0x, .tmp0), ._, ._ },
+                            .{ ._, ._pd, .@"or", .dst0x, .lea(.tmp0x), ._, ._ },
+                            .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_size), ._, ._ },
+                            .{ .@"0:", ._pd, .mul, .dst0x, .memi(.src0x, .tmp0), ._, ._ },
                             .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ },
                             .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
-                            .{ ._, ._ps, .xor, .tmp2x, .tmp2x, ._, ._ },
-                            .{ ._, ._ps, .movhl, .tmp2x, .dst0x, ._, ._ },
-                            .{ ._, ._pd, .add, .dst0x, .tmp2x, ._, ._ },
+                            .{ ._, ._ps, .xor, .tmp3x, .tmp3x, ._, ._ },
+                            .{ ._, ._ps, .movhl, .tmp3x, .dst0x, ._, ._ },
+                            .{ ._, ._pd, .mul, .dst0x, .tmp3x, ._, ._ },
                         } },
                     }, .{
                         .required_features = .{ .x87, null, null, null },
@@ -131614,7 +133156,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .each = .{ .once = &.{
                             .{ ._, ._, .mov, .tmp0p, .sia(-16, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, .f_, .ld, .memad(.src0q, .add_unaligned_size, -8), ._, ._, ._ },
-                            .{ .@"0:", .f_, .add, .memi(.src0q, .tmp0), ._, ._, ._ },
+                            .{ .@"0:", .f_, .mul, .memi(.src0q, .tmp0), ._, ._, ._ },
                             .{ ._, ._, .sub, .tmp0p, .si(8), ._, ._ },
                             .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
                             .{ ._, .f_p, .st, .dst0q, ._, ._, ._ },
@@ -131645,7 +133187,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, ._, .mov, .tmp0p, .sia(-32, .src0, .add_unaligned_size), ._, ._ },
                             .{ ._, .f_, .ld, .memad(.src0t, .add_unaligned_size, -16), ._, ._, ._ },
                             .{ .@"0:", .f_, .ld, .memi(.src0t, .tmp0), ._, ._, ._ },
-                            .{ ._, .f_p, .add, ._, ._, ._, ._ },
+                            .{ ._, .f_p, .mul, ._, ._, ._, ._ },
                             .{ ._, ._, .sub, .tmp0p, .si(16), ._, ._ },
                             .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
                             .{ ._, .f_p, .st, .dst0t, ._, ._, ._ },
@@ -131661,7 +133203,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .extra_temps = .{
                             .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
                             .{ .type = .f128, .kind = .{ .reg = .xmm1 } },
-                            .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__addtf3" } } },
+                            .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__multf3" } } },
                             .unused,
                             .unused,
                             .unused,
@@ -131692,7 +133234,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .extra_temps = .{
                             .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
                             .{ .type = .f128, .kind = .{ .reg = .xmm1 } },
-                            .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__addtf3" } } },
+                            .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__multf3" } } },
                             .unused,
                             .unused,
                             .unused,
@@ -131723,7 +133265,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .extra_temps = .{
                             .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
                             .{ .type = .f128, .kind = .{ .reg = .xmm1 } },
-                            .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__addtf3" } } },
+                            .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__multf3" } } },
                             .unused,
                             .unused,
                             .unused,
@@ -131744,7 +133286,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                             .{ ._, ._nb, .j, .@"0b", ._, ._, ._ },
                         } },
                     } },
-                    .Mul => unreachable,
                 }) catch |err| switch (err) {
                     error.SelectFailed => return cg.fail("failed to select {s} {} {}", .{
                         @tagName(air_tag),
@@ -149031,7 +150572,7 @@ fn genCopy(self: *CodeGen, ty: Type, dst_mcv: MCValue, src_mcv: MCValue, opts: C
                             } else if (self.hasFeature(.sse4_1)) {
                                 try self.asmRegisterRegister(.{ ._q, .mov }, dst_regs[0].to64(), src_reg.to128());
                                 try self.asmRegisterRegisterImmediate(.{ .p_q, .extr }, dst_regs[1].to64(), src_reg.to128(), .u(1));
-                            } else {
+                            } else if (self.hasFeature(.sse2)) {
                                 const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.sse);
                                 const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
                                 defer self.register_manager.unlockReg(tmp_lock);
@@ -149039,6 +150580,19 @@ fn genCopy(self: *CodeGen, ty: Type, dst_mcv: MCValue, src_mcv: MCValue, opts: C
                                 try self.asmRegisterRegister(.{ ._q, .mov }, dst_regs[0].to64(), src_reg.to128());
                                 try self.asmRegisterRegister(.{ ._ps, .movhl }, tmp_reg.to128(), src_reg.to128());
                                 try self.asmRegisterRegister(.{ ._q, .mov }, dst_regs[1].to64(), tmp_reg.to128());
+                            } else {
+                                const frame_index = try self.allocFrameIndex(.init(.{
+                                    .size = 16,
+                                    .alignment = .@"16",
+                                }));
+                                try self.asmMemoryRegister(.{ ._ps, .mova }, .{
+                                    .base = .{ .frame = frame_index },
+                                    .mod = .{ .rm = .{ .size = .xword } },
+                                }, src_reg.to128());
+                                for (dst_regs, 0..) |dst_reg, dst_index| try self.asmRegisterMemory(.{ ._, .mov }, dst_reg.to64(), .{
+                                    .base = .{ .frame = frame_index },
+                                    .mod = .{ .rm = .{ .size = .qword, .disp = @intCast(8 * dst_index) } },
+                                });
                             }
                             return;
                         } else unreachable,
@@ -149282,7 +150836,7 @@ fn genSetReg(
                 },
             ),
             .x87 => switch (src_reg.class()) {
-                .general_purpose, .gphi, .segment => unreachable,
+                .general_purpose, .gphi, .segment, .mmx, .ip, .cr, .dr => unreachable,
                 .x87 => switch (src_reg) {
                     .st0 => try self.asmRegister(.{ .f_, .st }, dst_reg),
                     .st1, .st2, .st3, .st4, .st5, .st6 => switch (dst_reg) {
@@ -149307,7 +150861,25 @@ fn genSetReg(
                     },
                     else => unreachable,
                 },
-                .mmx, .sse, .ip, .cr, .dr => unreachable,
+                .sse => if (abi_size <= 16) {
+                    const frame_index = try self.allocFrameIndex(.init(.{
+                        .size = 16,
+                        .alignment = .@"16",
+                    }));
+                    try self.asmMemoryRegister(if (self.hasFeature(.avx))
+                        .{ .v_dqa, .mov }
+                    else if (self.hasFeature(.sse2))
+                        .{ ._dqa, .mov }
+                    else
+                        .{ ._ps, .mova }, .{
+                        .base = .{ .frame = frame_index },
+                        .mod = .{ .rm = .{ .size = .xword } },
+                    }, src_reg.to128());
+                    try MoveStrategy.read(.load_store_x87, self, dst_reg, .{
+                        .base = .{ .frame = frame_index },
+                        .mod = .{ .rm = .{ .size = .tbyte } },
+                    });
+                } else unreachable,
             },
             .mmx => unreachable,
             .sse => switch (src_reg.class()) {
@@ -149349,7 +150921,7 @@ fn genSetReg(
                     .{ .register = try self.copyToTmpRegister(ty, src_mcv) },
                     opts,
                 ),
-                .x87 => {
+                .x87 => if (abi_size <= 16) {
                     const frame_index = try self.allocFrameIndex(.init(.{
                         .size = 16,
                         .alignment = .@"16",
@@ -149367,7 +150939,7 @@ fn genSetReg(
                         .base = .{ .frame = frame_index },
                         .mod = .{ .rm = .{ .size = .xword } },
                     });
-                },
+                } else unreachable,
                 .mmx, .ip, .cr, .dr => unreachable,
                 .sse => try self.asmRegisterRegister(
                     @as(?Mir.Inst.FixedTag, switch (ty.scalarType(zcu).zigTypeTag(zcu)) {
@@ -149431,7 +151003,7 @@ fn genSetReg(
                     } else if (self.hasFeature(.sse4_1)) {
                         try self.asmRegisterRegister(.{ ._q, .mov }, dst_reg.to128(), src_regs[0].to64());
                         try self.asmRegisterRegisterImmediate(.{ .p_q, .insr }, dst_reg.to128(), src_regs[1].to64(), .u(1));
-                    } else {
+                    } else if (self.hasFeature(.sse2)) {
                         const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.sse);
                         const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
                         defer self.register_manager.unlockReg(tmp_lock);
@@ -149439,6 +151011,19 @@ fn genSetReg(
                         try self.asmRegisterRegister(.{ ._q, .mov }, dst_reg.to128(), src_regs[0].to64());
                         try self.asmRegisterRegister(.{ ._q, .mov }, tmp_reg.to128(), src_regs[1].to64());
                         try self.asmRegisterRegister(.{ ._ps, .movlh }, dst_reg.to128(), tmp_reg.to128());
+                    } else {
+                        const frame_index = try self.allocFrameIndex(.init(.{
+                            .size = 16,
+                            .alignment = .@"16",
+                        }));
+                        for (src_regs, 0..) |src_reg, src_index| try self.asmMemoryRegister(.{ ._, .mov }, .{
+                            .base = .{ .frame = frame_index },
+                            .mod = .{ .rm = .{ .size = .qword, .disp = @intCast(8 * src_index) } },
+                        }, src_reg.to64());
+                        try self.asmRegisterMemory(.{ ._ps, .mova }, dst_reg.to128(), .{
+                            .base = .{ .frame = frame_index },
+                            .mod = .{ .rm = .{ .size = .xword } },
+                        });
                     }
                 } else unreachable,
                 else => unreachable,
@@ -149746,11 +151331,18 @@ fn genSetMem(
                 },
                 else => abi_size,
             };
-            const src_alias = registerAlias(src_reg, abi_size);
-            const src_size: u32 = @intCast(switch (src_alias.class()) {
-                .general_purpose, .gphi, .segment, .x87, .ip, .cr, .dr => @divExact(src_alias.bitSize(), 8),
+            const src_alias = registerAlias(src_reg, @intCast(self.unalignedSize(ty)));
+            const src_class = src_alias.class();
+            const src_size: u32 = switch (src_class) {
+                .general_purpose, .gphi, .segment, .ip, .cr, .dr => @intCast(@divExact(src_alias.bitSize(), 8)),
                 .mmx, .sse => abi_size,
-            });
+                .x87 => switch (abi.classifySystemV(ty, zcu, self.target, .other)[0]) {
+                    else => unreachable,
+                    .float, .float_combine => 4,
+                    .sse => 8,
+                    .x87 => 10,
+                },
+            };
             const src_align: InternPool.Alignment = .fromNonzeroByteUnits(
                 std.math.ceilPowerOfTwoAssert(u32, src_size),
             );
@@ -149760,7 +151352,7 @@ fn genSetMem(
                     .alignment = src_align,
                 }));
                 const frame_mcv: MCValue = .{ .load_frame = .{ .index = frame_index } };
-                try (try self.moveStrategy(ty, src_alias.class(), true)).write(
+                try (try self.moveStrategy(ty, src_class, true)).write(
                     self,
                     .{ .base = .{ .frame = frame_index }, .mod = .{ .rm = .{
                         .size = .fromSize(src_size),
@@ -149769,7 +151361,7 @@ fn genSetMem(
                 );
                 try self.genSetMem(base, disp, ty, frame_mcv, opts);
                 try self.freeValue(frame_mcv);
-            } else try (try self.moveStrategy(ty, src_alias.class(), switch (base) {
+            } else try (try self.moveStrategy(ty, src_class, switch (base) {
                 .none => src_align.check(@as(u32, @bitCast(disp))),
                 .reg => |reg| switch (reg) {
                     .es, .cs, .ss, .ds => src_align.check(@as(u32, @bitCast(disp))),
@@ -154146,7 +155738,7 @@ fn registerAlias(reg: Register, size_bytes: u32) Register {
             reg
         else
             unreachable,
-        .x87 => if (size_bytes >= 10 and size_bytes <= 16)
+        .x87 => if (size_bytes >= 4 and size_bytes <= 16)
             reg
         else
             unreachable,
@@ -154433,7 +156025,10 @@ fn promoteVarArg(self: *CodeGen, ty: Type) Type {
 fn unalignedSize(cg: *CodeGen, ty: Type) u64 {
     const zcu = cg.pt.zcu;
     return switch (zcu.intern_pool.indexToKey(ty.toIntern())) {
-        .vector_type => |vector_type| Type.fromInterned(vector_type.child).abiSize(zcu) * vector_type.len,
+        .vector_type => |vector_type| switch (vector_type.child) {
+            .bool_type => ty.abiSize(zcu),
+            else => Type.fromInterned(vector_type.child).abiSize(zcu) * vector_type.len,
+        },
         else => ty.abiSize(zcu),
     };
 }
@@ -155222,7 +156817,7 @@ const Temp = struct {
             else => |mcv| std.debug.panic("{s}: {}\n", .{ @src().fn_name, mcv }),
             .register => |val_reg| try src.readReg(opts.disp, val_ty, registerAlias(
                 val_reg,
-                @intCast(val_ty.abiSize(cg.pt.zcu)),
+                @intCast(cg.unalignedSize(val_ty)),
             ), cg),
             inline .register_pair, .register_triple, .register_quadruple => |val_regs| {
                 var disp = opts.disp;
@@ -160731,8 +162326,8 @@ const Select = struct {
             pand_mask_mem: struct { ref: Select.Operand.Ref, invert: bool = false },
             ptest_mask_mem: Select.Operand.Ref,
             pshufb_bswap_mem: struct { repeat: u4 = 1, size: Memory.Size, smear: u4 = 1 },
-            forward_bits_mem,
-            reverse_bits_mem,
+            bits_mem: enum { forward, reverse },
+            splat_float_mem: struct { ref: Select.Operand.Ref, val: f16, fill: enum { inside, outside } = .inside },
             frame: FrameIndex,
             lazy_symbol: struct { kind: link.File.LazySymbol.Kind, ref: Select.Operand.Ref = .none },
             symbol: *const struct { lib: ?[]const u8 = null, name: []const u8 },
@@ -161051,11 +162646,11 @@ const Select = struct {
                 .pand_mask_mem => |mask_spec| {
                     const zcu = pt.zcu;
                     assert(spec.type.isVector(zcu) and spec.type.childType(zcu).toIntern() == .u8_type);
-                    const ty = mask_spec.ref.typeOf(s);
-                    assert(ty.isVector(zcu));
+                    const ref_ty = mask_spec.ref.typeOf(s);
+                    assert(ref_ty.isVector(zcu));
                     var elem_buf: [64]u8 = undefined;
                     const elems = elem_buf[0..spec.type.vectorLen(zcu)];
-                    const mask_len: usize = @intCast(cg.unalignedSize(ty) % elems.len);
+                    const mask_len: usize = @intCast((cg.unalignedSize(ref_ty) - 1) % elems.len + 1);
                     const invert_mask: u8 = switch (mask_spec.invert) {
                         false => std.math.minInt(u8),
                         true => std.math.maxInt(u8),
@@ -161070,14 +162665,14 @@ const Select = struct {
                 .ptest_mask_mem => |mask_ref| {
                     const zcu = pt.zcu;
                     assert(spec.type.isVector(zcu) and spec.type.childType(zcu).toIntern() == .u8_type);
-                    const ty = mask_ref.typeOf(s);
-                    assert(ty.isVector(zcu) and ty.childType(zcu).toIntern() == .bool_type);
+                    const ref_ty = mask_ref.typeOf(s);
+                    assert(ref_ty.isVector(zcu) and ref_ty.childType(zcu).toIntern() == .bool_type);
                     const mask_info = mask_ref.valueOf(s).register_mask.info;
                     var elem_buf: [64]u8 = @splat(0);
                     const elems = elem_buf[0..spec.type.vectorLen(zcu)];
                     const elem_bytes: u6 = @intCast(@divExact(mask_info.scalar.bitSize(cg.target), 8));
                     var index: u7 = 0;
-                    for (0..@intCast(ty.vectorLen(zcu))) |_| {
+                    for (0..@intCast(ref_ty.vectorLen(zcu))) |_| {
                         switch (mask_info.kind) {
                             .sign => {
                                 @memset(elems[index..][0 .. elem_bytes - 1], std.math.minInt(u8));
@@ -161108,21 +162703,42 @@ const Select = struct {
                         .storage = .{ .bytes = try zcu.intern_pool.getOrPutString(zcu.gpa, pt.tid, elems, .maybe_embedded_nulls) },
                     } }))), true };
                 },
-                .forward_bits_mem, .reverse_bits_mem => {
+                .bits_mem => |direction| {
                     const zcu = pt.zcu;
                     assert(spec.type.isVector(zcu) and spec.type.childType(zcu).toIntern() == .u8_type);
                     var bytes: [32]u8 = undefined;
                     const elems = bytes[0..spec.type.vectorLen(zcu)];
-                    for (elems, 0..) |*elem, index| elem.* = switch (spec.kind) {
-                        else => unreachable,
-                        .forward_bits_mem => @as(u8, 1 << 0) << @truncate(index),
-                        .reverse_bits_mem => @as(u8, 1 << 7) >> @truncate(index),
+                    for (elems, 0..) |*elem, index| elem.* = switch (direction) {
+                        .forward => @as(u8, 1 << 0) << @truncate(index),
+                        .reverse => @as(u8, 1 << 7) >> @truncate(index),
                     };
                     return .{ try cg.tempMemFromValue(.fromInterned(try pt.intern(.{ .aggregate = .{
                         .ty = spec.type.toIntern(),
                         .storage = .{ .bytes = try zcu.intern_pool.getOrPutString(zcu.gpa, pt.tid, elems, .maybe_embedded_nulls) },
                     } }))), true };
                 },
+                .splat_float_mem => |splat_spec| {
+                    const zcu = pt.zcu;
+                    assert(spec.type.isVector(zcu));
+                    const elem_ty = spec.type.childType(zcu);
+                    const ref_ty = splat_spec.ref.typeOf(s);
+                    assert(ref_ty.isVector(zcu) and ref_ty.childType(zcu).toIntern() == elem_ty.toIntern());
+                    var elem_buf: [@divExact(64, 2)]InternPool.Index = undefined;
+                    const elems = elem_buf[0..spec.type.vectorLen(zcu)];
+                    const inside_len = (ref_ty.vectorLen(zcu) - 1) % elems.len + 1;
+                    @memset(elems[0..inside_len], (try pt.floatValue(elem_ty, switch (splat_spec.fill) {
+                        .inside => splat_spec.val,
+                        .outside => 0.0,
+                    })).toIntern());
+                    @memset(elems[inside_len..], (try pt.floatValue(elem_ty, switch (splat_spec.fill) {
+                        .inside => 0.0,
+                        .outside => splat_spec.val,
+                    })).toIntern());
+                    return .{ try cg.tempMemFromValue(.fromInterned(try pt.intern(.{ .aggregate = .{
+                        .ty = spec.type.toIntern(),
+                        .storage = .{ .elems = elems },
+                    } }))), true };
+                },
                 .frame => |frame_index| .{ try cg.tempInit(spec.type, .{ .load_frame = .{ .index = frame_index } }), true },
                 .lazy_symbol => |lazy_symbol_spec| {
                     const ip = &pt.zcu.intern_pool;
src/codegen/c/Type.zig
@@ -1885,6 +1885,36 @@ pub const Pool = struct {
                 };
                 return pool.fromFields(allocator, .@"struct", &fields, kind);
             },
+            .vector_16_f16_type => {
+                const vector_ctype = try pool.getVector(allocator, .{
+                    .elem_ctype = .f16,
+                    .len = 16,
+                });
+                if (!kind.isParameter()) return vector_ctype;
+                var fields = [_]Info.Field{
+                    .{
+                        .name = .{ .index = .array },
+                        .ctype = vector_ctype,
+                        .alignas = AlignAs.fromAbiAlignment(Type.f16.abiAlignment(zcu)),
+                    },
+                };
+                return pool.fromFields(allocator, .@"struct", &fields, kind);
+            },
+            .vector_32_f16_type => {
+                const vector_ctype = try pool.getVector(allocator, .{
+                    .elem_ctype = .f16,
+                    .len = 32,
+                });
+                if (!kind.isParameter()) return vector_ctype;
+                var fields = [_]Info.Field{
+                    .{
+                        .name = .{ .index = .array },
+                        .ctype = vector_ctype,
+                        .alignas = AlignAs.fromAbiAlignment(Type.f16.abiAlignment(zcu)),
+                    },
+                };
+                return pool.fromFields(allocator, .@"struct", &fields, kind);
+            },
             .vector_2_f32_type => {
                 const vector_ctype = try pool.getVector(allocator, .{
                     .elem_ctype = .f32,
@@ -1930,6 +1960,21 @@ pub const Pool = struct {
                 };
                 return pool.fromFields(allocator, .@"struct", &fields, kind);
             },
+            .vector_16_f32_type => {
+                const vector_ctype = try pool.getVector(allocator, .{
+                    .elem_ctype = .f32,
+                    .len = 16,
+                });
+                if (!kind.isParameter()) return vector_ctype;
+                var fields = [_]Info.Field{
+                    .{
+                        .name = .{ .index = .array },
+                        .ctype = vector_ctype,
+                        .alignas = AlignAs.fromAbiAlignment(Type.f32.abiAlignment(zcu)),
+                    },
+                };
+                return pool.fromFields(allocator, .@"struct", &fields, kind);
+            },
             .vector_2_f64_type => {
                 const vector_ctype = try pool.getVector(allocator, .{
                     .elem_ctype = .f64,
@@ -1960,6 +2005,21 @@ pub const Pool = struct {
                 };
                 return pool.fromFields(allocator, .@"struct", &fields, kind);
             },
+            .vector_8_f64_type => {
+                const vector_ctype = try pool.getVector(allocator, .{
+                    .elem_ctype = .f64,
+                    .len = 8,
+                });
+                if (!kind.isParameter()) return vector_ctype;
+                var fields = [_]Info.Field{
+                    .{
+                        .name = .{ .index = .array },
+                        .ctype = vector_ctype,
+                        .alignas = AlignAs.fromAbiAlignment(Type.f64.abiAlignment(zcu)),
+                    },
+                };
+                return pool.fromFields(allocator, .@"struct", &fields, kind);
+            },
 
             .undef,
             .zero,
src/Air.zig
@@ -1038,11 +1038,15 @@ pub const Inst = struct {
         vector_1_u256_type = @intFromEnum(InternPool.Index.vector_1_u256_type),
         vector_4_f16_type = @intFromEnum(InternPool.Index.vector_4_f16_type),
         vector_8_f16_type = @intFromEnum(InternPool.Index.vector_8_f16_type),
+        vector_16_f16_type = @intFromEnum(InternPool.Index.vector_16_f16_type),
+        vector_32_f16_type = @intFromEnum(InternPool.Index.vector_32_f16_type),
         vector_2_f32_type = @intFromEnum(InternPool.Index.vector_2_f32_type),
         vector_4_f32_type = @intFromEnum(InternPool.Index.vector_4_f32_type),
         vector_8_f32_type = @intFromEnum(InternPool.Index.vector_8_f32_type),
+        vector_16_f32_type = @intFromEnum(InternPool.Index.vector_16_f32_type),
         vector_2_f64_type = @intFromEnum(InternPool.Index.vector_2_f64_type),
         vector_4_f64_type = @intFromEnum(InternPool.Index.vector_4_f64_type),
+        vector_8_f64_type = @intFromEnum(InternPool.Index.vector_8_f64_type),
         optional_noreturn_type = @intFromEnum(InternPool.Index.optional_noreturn_type),
         anyerror_void_error_union_type = @intFromEnum(InternPool.Index.anyerror_void_error_union_type),
         adhoc_inferred_error_set_type = @intFromEnum(InternPool.Index.adhoc_inferred_error_set_type),
src/InternPool.zig
@@ -4615,11 +4615,15 @@ pub const Index = enum(u32) {
     vector_1_u256_type,
     vector_4_f16_type,
     vector_8_f16_type,
+    vector_16_f16_type,
+    vector_32_f16_type,
     vector_2_f32_type,
     vector_4_f32_type,
     vector_8_f32_type,
+    vector_16_f32_type,
     vector_2_f64_type,
     vector_4_f64_type,
+    vector_8_f64_type,
 
     optional_noreturn_type,
     anyerror_void_error_union_type,
@@ -5174,16 +5178,24 @@ pub const static_keys = [_]Key{
     .{ .vector_type = .{ .len = 4, .child = .f16_type } },
     // @Vector(8, f16)
     .{ .vector_type = .{ .len = 8, .child = .f16_type } },
+    // @Vector(16, f16)
+    .{ .vector_type = .{ .len = 16, .child = .f16_type } },
+    // @Vector(32, f16)
+    .{ .vector_type = .{ .len = 32, .child = .f16_type } },
     // @Vector(2, f32)
     .{ .vector_type = .{ .len = 2, .child = .f32_type } },
     // @Vector(4, f32)
     .{ .vector_type = .{ .len = 4, .child = .f32_type } },
     // @Vector(8, f32)
     .{ .vector_type = .{ .len = 8, .child = .f32_type } },
+    // @Vector(16, f32)
+    .{ .vector_type = .{ .len = 16, .child = .f32_type } },
     // @Vector(2, f64)
     .{ .vector_type = .{ .len = 2, .child = .f64_type } },
     // @Vector(4, f64)
     .{ .vector_type = .{ .len = 4, .child = .f64_type } },
+    // @Vector(8, f64)
+    .{ .vector_type = .{ .len = 8, .child = .f64_type } },
 
     // ?noreturn
     .{ .opt_type = .noreturn_type },
@@ -11847,11 +11859,15 @@ pub fn typeOf(ip: *const InternPool, index: Index) Index {
         .vector_1_u256_type,
         .vector_4_f16_type,
         .vector_8_f16_type,
+        .vector_16_f16_type,
+        .vector_32_f16_type,
         .vector_2_f32_type,
         .vector_4_f32_type,
         .vector_8_f32_type,
+        .vector_16_f32_type,
         .vector_2_f64_type,
         .vector_4_f64_type,
+        .vector_8_f64_type,
         .optional_noreturn_type,
         .anyerror_void_error_union_type,
         .adhoc_inferred_error_set_type,
@@ -12175,11 +12191,15 @@ pub fn zigTypeTag(ip: *const InternPool, index: Index) std.builtin.TypeId {
         .vector_1_u256_type,
         .vector_4_f16_type,
         .vector_8_f16_type,
+        .vector_16_f16_type,
+        .vector_32_f16_type,
         .vector_2_f32_type,
         .vector_4_f32_type,
         .vector_8_f32_type,
+        .vector_16_f32_type,
         .vector_2_f64_type,
         .vector_4_f64_type,
+        .vector_8_f64_type,
         => .vector,
 
         .optional_noreturn_type => .optional,
src/Sema.zig
@@ -36571,11 +36571,15 @@ pub fn typeHasOnePossibleValue(sema: *Sema, ty: Type) CompileError!?Value {
         .vector_1_u256_type,
         .vector_4_f16_type,
         .vector_8_f16_type,
+        .vector_16_f16_type,
+        .vector_32_f16_type,
         .vector_2_f32_type,
         .vector_4_f32_type,
         .vector_8_f32_type,
+        .vector_16_f32_type,
         .vector_2_f64_type,
         .vector_4_f64_type,
+        .vector_8_f64_type,
         .anyerror_void_error_union_type,
         => null,
         .void_type => Value.void,
src/Type.zig
@@ -4136,11 +4136,15 @@ pub const vector_2_u128: Type = .{ .ip_index = .vector_2_u128_type };
 pub const vector_1_u256: Type = .{ .ip_index = .vector_1_u256_type };
 pub const vector_4_f16: Type = .{ .ip_index = .vector_4_f16_type };
 pub const vector_8_f16: Type = .{ .ip_index = .vector_8_f16_type };
+pub const vector_16_f16: Type = .{ .ip_index = .vector_16_f16_type };
+pub const vector_32_f16: Type = .{ .ip_index = .vector_32_f16_type };
 pub const vector_2_f32: Type = .{ .ip_index = .vector_2_f32_type };
 pub const vector_4_f32: Type = .{ .ip_index = .vector_4_f32_type };
 pub const vector_8_f32: Type = .{ .ip_index = .vector_8_f32_type };
+pub const vector_16_f32: Type = .{ .ip_index = .vector_16_f32_type };
 pub const vector_2_f64: Type = .{ .ip_index = .vector_2_f64_type };
 pub const vector_4_f64: Type = .{ .ip_index = .vector_4_f64_type };
+pub const vector_8_f64: Type = .{ .ip_index = .vector_8_f64_type };
 
 pub const empty_tuple: Type = .{ .ip_index = .empty_tuple_type };
 
test/behavior/x86_64/math.zig
@@ -125,7 +125,7 @@ fn boolOr(lhs: anytype, rhs: @TypeOf(lhs)) @TypeOf(lhs) {
     @compileError("unsupported boolOr type: " ++ @typeName(@TypeOf(lhs)));
 }
 
-pub const Compare = enum { strict, relaxed, approx, approx_int };
+pub const Compare = enum { strict, relaxed, approx, approx_int, approx_or_overflow };
 // noinline for a more helpful stack trace
 pub noinline fn checkExpected(expected: anytype, actual: @TypeOf(expected), comptime compare: Compare) !void {
     const Expected = @TypeOf(expected);
@@ -137,20 +137,32 @@ pub noinline fn checkExpected(expected: anytype, actual: @TypeOf(expected), comp
                 break :unexpected switch (compare) {
                     .strict => boolOr(unequal, sign(expected) != sign(actual)),
                     .relaxed => unequal,
-                    .approx, .approx_int => comptime unreachable,
+                    .approx, .approx_int, .approx_or_overflow => comptime unreachable,
                 };
             },
-            .approx, .approx_int => {
+            .approx, .approx_int, .approx_or_overflow => {
                 const epsilon = math.floatEps(Scalar(Expected));
-                const tolerance = @sqrt(epsilon);
-                break :unexpected @abs(expected - actual) > @max(
+                const tolerance = switch (compare) {
+                    .strict, .relaxed => comptime unreachable,
+                    .approx, .approx_int => @sqrt(epsilon),
+                    .approx_or_overflow => @exp2(@log2(epsilon) * 0.4),
+                };
+                const approx_unequal = @abs(expected - actual) > @max(
                     @abs(expected) * splat(Expected, tolerance),
                     splat(Expected, switch (compare) {
                         .strict, .relaxed => comptime unreachable,
-                        .approx => tolerance,
+                        .approx, .approx_or_overflow => tolerance,
                         .approx_int => 1,
                     }),
                 );
+                break :unexpected switch (compare) {
+                    .strict, .relaxed => comptime unreachable,
+                    .approx, .approx_int => approx_unequal,
+                    .approx_or_overflow => boolAnd(approx_unequal, boolOr(boolAnd(
+                        @abs(expected) != splat(Expected, inf(Expected)),
+                        @abs(actual) != splat(Expected, inf(Expected)),
+                    ), sign(expected) != sign(actual))),
+                };
             },
         },
         .@"struct" => |@"struct"| inline for (@"struct".fields) |field| {
test/behavior/x86_64/unary.zig
@@ -5119,6 +5119,15 @@ test reduceAddOptimized {
     try test_reduce_add_optimized.testFloatVectors();
 }
 
+inline fn reduceMulOptimized(comptime Type: type, rhs: Type) @typeInfo(Type).vector.child {
+    @setFloatMode(.optimized);
+    return @reduce(.Mul, rhs);
+}
+test reduceMulOptimized {
+    const test_reduce_mul_optimized = unary(reduceMulOptimized, .{ .compare = .approx_or_overflow });
+    try test_reduce_mul_optimized.testFloatVectors();
+}
+
 inline fn splat(comptime Type: type, rhs: Type) Type {
     return @splat(rhs[0]);
 }
test/cases/compile_errors/@import_zon_bad_type.zig
@@ -117,9 +117,9 @@ export fn testMutablePointer() void {
 // tmp.zig:37:38: note: imported here
 // neg_inf.zon:1:1: error: expected type '?u8'
 // tmp.zig:57:28: note: imported here
-// neg_inf.zon:1:1: error: expected type 'tmp.testNonExhaustiveEnum__enum_505'
+// neg_inf.zon:1:1: error: expected type 'tmp.testNonExhaustiveEnum__enum_509'
 // tmp.zig:62:39: note: imported here
-// neg_inf.zon:1:1: error: expected type 'tmp.testUntaggedUnion__union_507'
+// neg_inf.zon:1:1: error: expected type 'tmp.testUntaggedUnion__union_511'
 // tmp.zig:67:44: note: imported here
-// neg_inf.zon:1:1: error: expected type 'tmp.testTaggedUnionVoid__union_510'
+// neg_inf.zon:1:1: error: expected type 'tmp.testTaggedUnionVoid__union_514'
 // tmp.zig:72:50: note: imported here
test/cases/compile_errors/anytype_param_requires_comptime.zig
@@ -15,6 +15,6 @@ pub export fn entry() void {
 // error
 //
 // :7:25: error: unable to resolve comptime value
-// :7:25: note: initializer of comptime-only struct 'tmp.S.foo__anon_479.C' must be comptime-known
+// :7:25: note: initializer of comptime-only struct 'tmp.S.foo__anon_483.C' must be comptime-known
 // :4:16: note: struct requires comptime because of this field
 // :4:16: note: types are not available at runtime
test/cases/compile_errors/bogus_method_call_on_slice.zig
@@ -16,5 +16,5 @@ pub export fn entry2() void {
 //
 // :3:6: error: no field or member function named 'copy' in '[]const u8'
 // :9:8: error: no field or member function named 'bar' in '@TypeOf(.{})'
-// :12:18: error: no field or member function named 'bar' in 'tmp.entry2__struct_483'
+// :12:18: error: no field or member function named 'bar' in 'tmp.entry2__struct_487'
 // :12:6: note: struct declared here
test/cases/compile_errors/coerce_anon_struct.zig
@@ -6,6 +6,6 @@ export fn foo() void {
 
 // error
 //
-// :4:16: error: expected type 'tmp.T', found 'tmp.foo__struct_472'
+// :4:16: error: expected type 'tmp.T', found 'tmp.foo__struct_476'
 // :3:16: note: struct declared here
 // :1:11: note: struct declared here
test/cases/compile_errors/redundant_try.zig
@@ -44,9 +44,9 @@ comptime {
 //
 // :5:23: error: expected error union type, found 'comptime_int'
 // :10:23: error: expected error union type, found '@TypeOf(.{})'
-// :15:23: error: expected error union type, found 'tmp.test2__struct_509'
+// :15:23: error: expected error union type, found 'tmp.test2__struct_513'
 // :15:23: note: struct declared here
-// :20:27: error: expected error union type, found 'tmp.test3__struct_511'
+// :20:27: error: expected error union type, found 'tmp.test3__struct_515'
 // :20:27: note: struct declared here
 // :25:23: error: expected error union type, found 'struct { comptime *const [5:0]u8 = "hello" }'
 // :31:13: error: expected error union type, found 'u32'