Commit 654da648b3

Jacob Young <jacobly0@users.noreply.github.com>
2025-01-26 15:05:04
x86_64: rewrite `@min`/`@max` for float vectors
1 parent 0c890bb
Changed files (2)
src
arch
test
behavior
x86_64
src/arch/x86_64/CodeGen.zig
@@ -2393,7 +2393,7 @@ fn genBodyBlock(self: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
 }
 
 fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
-    @setEvalBranchQuota(3_600);
+    @setEvalBranchQuota(3_900);
     const pt = cg.pt;
     const zcu = pt.zcu;
     const ip = &zcu.intern_pool;
@@ -2805,10 +2805,8 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                 }
                 try res[0].finish(inst, &.{ bin_op.lhs, bin_op.rhs }, &ops, cg);
             },
-            .max => |air_tag| if (use_old) try cg.airBinOp(inst, air_tag) else fallback: {
+            .max => |air_tag| if (use_old) try cg.airBinOp(inst, air_tag) else {
                 const bin_op = air_datas[@intFromEnum(inst)].bin_op;
-                const ty = cg.typeOf(bin_op.lhs);
-                if (ty.isVector(zcu) and cg.floatBits(ty.childType(zcu)) != null) break :fallback try cg.airBinOp(inst, air_tag);
                 var ops = try cg.tempsFromOperands(inst, .{ bin_op.lhs, bin_op.rhs });
                 var res: [1]Temp = undefined;
                 cg.select(&res, &.{cg.typeOf(bin_op.lhs)}, &ops, comptime &.{ .{
@@ -4510,7 +4508,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .xword } } },
+                        .{ .type = .vector_4_u32, .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .xword } } },
                         .{ .type = .vector_4_u32, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
@@ -4647,7 +4645,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .xword } } },
+                        .{ .type = .vector_4_u32, .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .xword } } },
                         .{ .type = .vector_4_u32, .kind = .{ .rc = .sse } },
                         .{ .type = .vector_4_u32, .kind = .{ .rc = .sse } },
                         .{ .type = .vector_4_u32, .kind = .{ .rc = .sse } },
@@ -4967,7 +4965,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
+                        .{ .type = .u64, .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
                         .{ .type = .vector_2_u64, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
@@ -4998,7 +4996,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
+                        .{ .type = .u64, .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
                         .{ .type = .vector_2_u64, .kind = .{ .rc = .sse } },
                         .{ .type = .vector_2_u64, .kind = .{ .reg = .xmm0 } },
                         .unused,
@@ -5030,7 +5028,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
+                        .{ .type = .u64, .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
                         .{ .type = .vector_4_u64, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
@@ -5059,7 +5057,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
+                        .{ .type = .u64, .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
                         .{ .type = .vector_4_u64, .kind = .{ .rc = .sse } },
                         .{ .type = .vector_4_u64, .kind = .{ .rc = .sse } },
                         .{ .type = .vector_4_u64, .kind = .{ .rc = .sse } },
@@ -5095,7 +5093,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
+                        .{ .type = .u64, .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
                         .{ .type = .vector_2_u64, .kind = .{ .rc = .sse } },
                         .{ .type = .vector_2_u64, .kind = .{ .rc = .sse } },
                         .{ .type = .vector_2_u64, .kind = .{ .rc = .sse } },
@@ -5131,7 +5129,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
+                        .{ .type = .u64, .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
                         .{ .type = .vector_2_u64, .kind = .{ .rc = .sse } },
                         .{ .type = .vector_2_u64, .kind = .{ .rc = .sse } },
                         .{ .type = .vector_2_u64, .kind = .{ .rc = .sse } },
@@ -5383,8 +5381,8 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .to_sse } },
                     },
                     .extra_temps = .{
-                        .{ .kind = .{ .mut_rc = .{ .ref = .src1, .rc = .sse } } },
-                        .{ .kind = .{ .rc = .sse } },
+                        .{ .type = .f16, .kind = .{ .mut_rc = .{ .ref = .src1, .rc = .sse } } },
+                        .{ .type = .f16, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
                         .unused,
@@ -5395,12 +5393,12 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .dst_temps = .{.{ .mut_rc = .{ .ref = .src0, .rc = .sse } }},
                     .each = .{ .once = &.{
-                        .{ ._, .v_ps, .cvtph2, .dst0x, .src0x, ._, ._ },
-                        .{ ._, .v_ps, .cvtph2, .tmp0x, .src1x, ._, ._ },
+                        .{ ._, .v_ps, .cvtph2, .dst0x, .src0q, ._, ._ },
+                        .{ ._, .v_ps, .cvtph2, .tmp0x, .src1q, ._, ._ },
                         .{ ._, .v_ss, .cmp, .tmp1x, .dst0x, .dst0x, .vp(.unord) },
                         .{ ._, .v_ss, .max, .dst0x, .tmp0x, .dst0x, ._ },
                         .{ ._, .v_ps, .blendv, .dst0x, .dst0x, .tmp0x, .tmp1x },
-                        .{ ._, .v_, .cvtps2ph, .dst0x, .dst0x, .rm(.{}), ._ },
+                        .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
                     } },
                 }, .{
                     .required_features = .{ .sse, null, null, null },
@@ -5423,11 +5421,248 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .unused,
                         .unused,
                     },
-                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
                     .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
                     .each = .{ .once = &.{
                         .{ ._, ._, .call, .tmp0d, ._, ._, ._ },
                     } },
+                }, .{
+                    .required_features = .{ .f16c, null, null, null },
+                    .src_constraints = .{
+                        .{ .scalar_float = .{ .of = .qword, .is = .word } },
+                        .{ .scalar_float = .{ .of = .qword, .is = .word } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .mem } },
+                        .{ .src = .{ .to_sse, .mem } },
+                        .{ .src = .{ .mem, .to_sse } },
+                        .{ .src = .{ .to_sse, .to_sse } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .vector_4_f16, .kind = .{ .mut_rc = .{ .ref = .src1, .rc = .sse } } },
+                        .{ .type = .vector_4_f16, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .mut_rc = .{ .ref = .src0, .rc = .sse } }},
+                    .each = .{ .once = &.{
+                        .{ ._, .v_ps, .cvtph2, .dst0x, .src0q, ._, ._ },
+                        .{ ._, .v_ps, .cvtph2, .tmp0x, .src1q, ._, ._ },
+                        .{ ._, .v_ps, .cmp, .tmp1x, .dst0x, .dst0x, .vp(.unord) },
+                        .{ ._, .v_ps, .max, .dst0x, .tmp0x, .dst0x, ._ },
+                        .{ ._, .v_ps, .blendv, .dst0x, .dst0x, .tmp0x, .tmp1x },
+                        .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .f16c, null, null, null },
+                    .src_constraints = .{
+                        .{ .scalar_float = .{ .of = .xword, .is = .word } },
+                        .{ .scalar_float = .{ .of = .xword, .is = .word } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .mem } },
+                        .{ .src = .{ .to_sse, .mem } },
+                        .{ .src = .{ .mem, .to_sse } },
+                        .{ .src = .{ .to_sse, .to_sse } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .vector_8_f16, .kind = .{ .mut_rc = .{ .ref = .src1, .rc = .sse } } },
+                        .{ .type = .vector_8_f16, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .mut_rc = .{ .ref = .src0, .rc = .sse } }},
+                    .each = .{ .once = &.{
+                        // Widen both xword operands (f16 lanes) into f32 lanes held in ymm registers.
+                        .{ ._, .v_ps, .cvtph2, .dst0y, .src0x, ._, ._ },
+                        .{ ._, .v_ps, .cvtph2, .tmp0y, .src1x, ._, ._ },
+                        // tmp1 marks the lanes where the widened src0 compares unordered with itself.
+                        .{ ._, .v_ps, .cmp, .tmp1y, .dst0y, .dst0y, .vp(.unord) },
+                        .{ ._, .v_ps, .max, .dst0y, .tmp0y, .dst0y, ._ },
+                        // In the lanes marked by tmp1, take the widened src1 instead of the max result.
+                        .{ ._, .v_ps, .blendv, .dst0y, .dst0y, .tmp0y, .tmp1y },
+                        // Narrowing a ymm of f32 lanes produces a full xword of f16 lanes, so the
+                        // destination must be xword-sized (as in the looped variant of this case),
+                        // not qword-sized as in the 4-lane case.
+                        .{ ._, .v_, .cvtps2ph, .dst0x, .dst0y, .rm(.{}), ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .f16c, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .word } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .word } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .vector_8_f16, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_8_f16, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_8_f16, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .v_ps, .cvtph2, .tmp1y, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, .v_ps, .cvtph2, .tmp2y, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, .v_ps, .cmp, .tmp3y, .tmp1y, .tmp1y, .vp(.unord) },
+                        .{ ._, .v_ps, .max, .tmp1y, .tmp2y, .tmp1y, ._ },
+                        .{ ._, .v_ps, .blendv, .tmp1y, .tmp1y, .tmp2y, .tmp3y },
+                        .{ ._, .v_, .cvtps2ph, .memia(.dst0x, .tmp0, .add_size), .tmp1y, .rm(.{}), ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .word, .is = .word } },
+                        .{ .multiple_scalar_float = .{ .of = .word, .is = .word } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .call_frame = .{ .alignment = .@"16" },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f16, .kind = .{ .reg = .xmm0 } },
+                        .{ .type = .f16, .kind = .{ .reg = .xmm1 } },
+                        .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__fmaxh" } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .vp_, .xor, .tmp2x, .tmp2x, .tmp2x, ._ },
+                        .{ ._, .vp_w, .insr, .tmp1x, .tmp2x, .memia(.src0w, .tmp0, .add_size), .ui(0) },
+                        .{ ._, .vp_w, .insr, .tmp2x, .tmp2x, .memia(.src1w, .tmp0, .add_size), .ui(0) },
+                        .{ ._, ._, .call, .tmp3d, ._, ._, ._ },
+                        .{ ._, .vp_w, .extr, .memia(.dst0w, .tmp0, .add_size), .tmp1x, .ui(0), ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(2), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse4_1, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .word, .is = .word } },
+                        .{ .multiple_scalar_float = .{ .of = .word, .is = .word } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .call_frame = .{ .alignment = .@"16" },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f16, .kind = .{ .reg = .xmm0 } },
+                        .{ .type = .f16, .kind = .{ .reg = .xmm1 } },
+                        .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__fmaxh" } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .p_, .xor, .tmp1x, .tmp1x, ._, ._ },
+                        .{ ._, .p_, .xor, .tmp2x, .tmp2x, ._, ._ },
+                        .{ ._, .p_w, .insr, .tmp1x, .memia(.src0w, .tmp0, .add_size), .ui(0), ._ },
+                        .{ ._, .p_w, .insr, .tmp2x, .memia(.src1w, .tmp0, .add_size), .ui(0), ._ },
+                        .{ ._, ._, .call, .tmp3d, ._, ._, ._ },
+                        .{ ._, .p_w, .extr, .memia(.dst0w, .tmp0, .add_size), .tmp1x, .ui(0), ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(2), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse2, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .word, .is = .word } },
+                        .{ .multiple_scalar_float = .{ .of = .word, .is = .word } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .call_frame = .{ .alignment = .@"16" },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f16, .kind = .{ .reg = .xmm0 } },
+                        .{ .type = .f16, .kind = .{ .reg = .xmm1 } },
+                        .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__fmaxh" } } },
+                        .{ .type = .f16, .kind = .{ .reg = .ax } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .p_, .xor, .tmp1x, .tmp1x, ._, ._ },
+                        .{ ._, .p_, .xor, .tmp2x, .tmp2x, ._, ._ },
+                        .{ ._, .p_w, .insr, .tmp1x, .memia(.src0w, .tmp0, .add_size), .ui(0), ._ },
+                        .{ ._, .p_w, .insr, .tmp2x, .memia(.src1w, .tmp0, .add_size), .ui(0), ._ },
+                        .{ ._, ._, .call, .tmp3d, ._, ._, ._ },
+                        .{ ._, .p_w, .extr, .tmp4d, .tmp1x, .ui(0), ._ },
+                        .{ ._, ._, .mov, .memia(.dst0w, .tmp0, .add_size), .tmp4w, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(2), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .word, .is = .word } },
+                        .{ .multiple_scalar_float = .{ .of = .word, .is = .word } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .call_frame = .{ .alignment = .@"16" },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f16, .kind = .{ .reg = .eax } },
+                        .{ .type = .f32, .kind = .mem },
+                        .{ .type = .f16, .kind = .{ .reg = .xmm0 } },
+                        .{ .type = .f16, .kind = .{ .reg = .xmm1 } },
+                        .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__fmaxh" } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._, .movzx, .tmp1d, .memia(.src0w, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .mov, .mem(.tmp2d), .tmp1d, ._, ._ },
+                        .{ ._, ._ss, .mov, .tmp3x, .mem(.tmp2d), ._, ._ },
+                        .{ ._, ._, .movzx, .tmp1d, .memia(.src1w, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .mov, .mem(.tmp2d), .tmp1d, ._, ._ },
+                        .{ ._, ._ss, .mov, .tmp4x, .mem(.tmp2d), ._, ._ },
+                        .{ ._, ._, .call, .tmp5d, ._, ._, ._ },
+                        .{ ._, ._ss, .mov, .mem(.tmp2d), .tmp3x, ._, ._ },
+                        .{ ._, ._, .mov, .tmp1d, .mem(.tmp2d), ._, ._ },
+                        .{ ._, ._, .mov, .memia(.dst0w, .tmp0, .add_size), .tmp1w, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(2), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
                 }, .{
                     .required_features = .{ .avx, null, null, null },
                     .src_constraints = .{
@@ -5438,7 +5673,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .to_sse } },
                     },
                     .extra_temps = .{
-                        .{ .kind = .{ .rc = .sse } },
+                        .{ .type = .f32, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
                         .unused,
@@ -5499,6 +5734,210 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._ps, .andn, .dst0x, .src1x, ._, ._ },
                         .{ ._, ._ps, .@"or", .dst0x, .tmp0x, ._, ._ },
                     } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{
+                        .{ .scalar_float = .{ .of = .xword, .is = .dword } },
+                        .{ .scalar_float = .{ .of = .xword, .is = .dword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_sse, .to_sse } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .mut_rc = .{ .ref = .src0, .rc = .sse } }},
+                    .each = .{ .once = &.{
+                        .{ ._, .v_ps, .cmp, .tmp0x, .src0x, .src0x, .vp(.unord) },
+                        .{ ._, .v_ps, .max, .dst0x, .src1x, .src0x, ._ },
+                        .{ ._, .v_ps, .blendv, .dst0x, .dst0x, .src1x, .tmp0x },
+                    } },
+                }, .{
+                    .required_features = .{ .sse4_1, null, null, null },
+                    .src_constraints = .{
+                        .{ .scalar_float = .{ .of = .xword, .is = .dword } },
+                        .{ .scalar_float = .{ .of = .xword, .is = .dword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .{ .to_reg = .xmm0 }, .mem } },
+                        .{ .src = .{ .mem, .{ .to_reg = .xmm0 } }, .commute = .{ 0, 1 } },
+                        .{ .src = .{ .{ .to_reg = .xmm0 }, .to_sse } },
+                    },
+                    .dst_temps = .{.{ .rc = .sse }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._ps, .mova, .dst0x, .src1x, ._, ._ },
+                        .{ ._, ._ps, .max, .dst0x, .src0x, ._, ._ },
+                        .{ ._, ._ps, .cmp, .src0x, .src0x, .vp(.unord), ._ },
+                        .{ ._, ._ps, .blendv, .dst0x, .src1x, .src0x, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{
+                        .{ .scalar_float = .{ .of = .xword, .is = .dword } },
+                        .{ .scalar_float = .{ .of = .xword, .is = .dword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_sse, .mem } },
+                        .{ .src = .{ .mem, .to_mut_sse }, .commute = .{ 0, 1 } },
+                        .{ .src = .{ .to_mut_sse, .to_sse } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._ps, .mova, .tmp0x, .src1x, ._, ._ },
+                        .{ ._, ._ps, .max, .tmp0x, .src0x, ._, ._ },
+                        .{ ._, ._ps, .cmp, .dst0x, .src0x, .vp(.ord), ._ },
+                        .{ ._, ._ps, .@"and", .tmp0x, .dst0x, ._, ._ },
+                        .{ ._, ._ps, .andn, .dst0x, .src1x, ._, ._ },
+                        .{ ._, ._ps, .@"or", .dst0x, .tmp0x, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{
+                        .{ .scalar_float = .{ .of = .yword, .is = .dword } },
+                        .{ .scalar_float = .{ .of = .yword, .is = .dword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_sse, .to_sse } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .mut_rc = .{ .ref = .src0, .rc = .sse } }},
+                    .each = .{ .once = &.{
+                        .{ ._, .v_ps, .cmp, .tmp0y, .src0y, .src0y, .vp(.unord) },
+                        .{ ._, .v_ps, .max, .dst0y, .src1y, .src0y, ._ },
+                        .{ ._, .v_ps, .blendv, .dst0y, .dst0y, .src1y, .tmp0y },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .yword, .is = .dword } },
+                        .{ .multiple_scalar_float = .{ .of = .yword, .is = .dword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .v_ps, .mova, .tmp1y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
+                        .{ ._, .v_ps, .mova, .tmp2y, .memia(.src1y, .tmp0, .add_size), ._, ._ },
+                        .{ ._, .v_ps, .cmp, .tmp3y, .tmp1y, .tmp1y, .vp(.unord) },
+                        .{ ._, .v_ps, .max, .tmp1y, .tmp2y, .tmp1y, ._ },
+                        .{ ._, .v_ps, .blendv, .tmp1y, .tmp1y, .tmp2y, .tmp3y },
+                        .{ ._, .v_ps, .mova, .memia(.dst0y, .tmp0, .add_size), .tmp1y, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(32), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse4_1, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .dword } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .dword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .vector_4_f32, .kind = .{ .reg = .xmm0 } },
+                        .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._ps, .mova, .tmp1x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._ps, .mova, .tmp2x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._ps, .mova, .tmp3x, .tmp2x, ._, ._ },
+                        .{ ._, ._ps, .max, .tmp3x, .tmp1x, ._, ._ },
+                        .{ ._, ._ps, .cmp, .tmp1x, .tmp1x, .vp(.unord), ._ },
+                        .{ ._, ._ps, .blendv, .tmp3x, .tmp2x, .tmp1x, ._ },
+                        .{ ._, ._ps, .mova, .memia(.dst0x, .tmp0, .add_size), .tmp3x, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .dword } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .dword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._ps, .mova, .tmp1x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._ps, .mova, .tmp2x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._ps, .mova, .tmp3x, .tmp2x, ._, ._ },
+                        .{ ._, ._ps, .max, .tmp3x, .tmp1x, ._, ._ },
+                        .{ ._, ._ps, .cmp, .tmp1x, .tmp1x, .vp(.ord), ._ },
+                        .{ ._, ._ps, .@"and", .tmp3x, .tmp1x, ._, ._ },
+                        .{ ._, ._ps, .andn, .tmp1x, .tmp2x, ._, ._ },
+                        .{ ._, ._ps, .@"or", .tmp1x, .tmp3x, ._, ._ },
+                        .{ ._, ._ps, .mova, .memia(.dst0x, .tmp0, .add_size), .tmp1x, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
                 }, .{
                     .required_features = .{ .avx, null, null, null },
                     .src_constraints = .{
@@ -5509,7 +5948,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .to_sse } },
                     },
                     .extra_temps = .{
-                        .{ .kind = .{ .rc = .sse } },
+                        .{ .type = .f64, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
                         .unused,
@@ -5591,11 +6030,249 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .unused,
                         .unused,
                     },
-                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
                     .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
                     .each = .{ .once = &.{
                         .{ ._, ._, .call, .tmp0d, ._, ._, ._ },
                     } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{
+                        .{ .scalar_float = .{ .of = .xword, .is = .qword } },
+                        .{ .scalar_float = .{ .of = .xword, .is = .qword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_sse, .to_sse } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .mut_rc = .{ .ref = .src0, .rc = .sse } }},
+                    .each = .{ .once = &.{
+                        .{ ._, .v_pd, .cmp, .tmp0x, .src0x, .src0x, .vp(.unord) },
+                        .{ ._, .v_pd, .max, .dst0x, .src1x, .src0x, ._ },
+                        .{ ._, .v_pd, .blendv, .dst0x, .dst0x, .src1x, .tmp0x },
+                    } },
+                }, .{
+                    .required_features = .{ .sse4_1, null, null, null },
+                    .src_constraints = .{
+                        .{ .scalar_float = .{ .of = .xword, .is = .qword } },
+                        .{ .scalar_float = .{ .of = .xword, .is = .qword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .{ .to_reg = .xmm0 }, .mem } },
+                        .{ .src = .{ .mem, .{ .to_reg = .xmm0 } }, .commute = .{ 0, 1 } },
+                        .{ .src = .{ .{ .to_reg = .xmm0 }, .to_sse } },
+                    },
+                    .dst_temps = .{.{ .rc = .sse }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._pd, .mova, .dst0x, .src1x, ._, ._ },
+                        .{ ._, ._pd, .max, .dst0x, .src0x, ._, ._ },
+                        .{ ._, ._pd, .cmp, .src0x, .src0x, .vp(.unord), ._ },
+                        .{ ._, ._pd, .blendv, .dst0x, .src1x, .src0x, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse2, null, null, null },
+                    .src_constraints = .{
+                        .{ .scalar_float = .{ .of = .xword, .is = .qword } },
+                        .{ .scalar_float = .{ .of = .xword, .is = .qword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_sse, .mem } },
+                        .{ .src = .{ .mem, .to_mut_sse }, .commute = .{ 0, 1 } },
+                        .{ .src = .{ .to_mut_sse, .to_sse } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._pd, .mova, .tmp0x, .src1x, ._, ._ },
+                        .{ ._, ._pd, .max, .tmp0x, .src0x, ._, ._ },
+                        .{ ._, ._pd, .cmp, .dst0x, .src0x, .vp(.ord), ._ },
+                        .{ ._, ._pd, .@"and", .tmp0x, .dst0x, ._, ._ },
+                        .{ ._, ._pd, .andn, .dst0x, .src1x, ._, ._ },
+                        .{ ._, ._pd, .@"or", .dst0x, .tmp0x, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{
+                        .{ .scalar_float = .{ .of = .yword, .is = .qword } },
+                        .{ .scalar_float = .{ .of = .yword, .is = .qword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_sse, .to_sse } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .vector_4_f64, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .mut_rc = .{ .ref = .src0, .rc = .sse } }},
+                    .each = .{ .once = &.{
+                        .{ ._, .v_pd, .cmp, .tmp0y, .src0y, .src0y, .vp(.unord) },
+                        .{ ._, .v_pd, .max, .dst0y, .src1y, .src0y, ._ },
+                        .{ ._, .v_pd, .blendv, .dst0y, .dst0y, .src1y, .tmp0y },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .yword, .is = .qword } },
+                        .{ .multiple_scalar_float = .{ .of = .yword, .is = .qword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .vector_4_f64, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_4_f64, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_4_f64, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .v_pd, .mova, .tmp1y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
+                        .{ ._, .v_pd, .mova, .tmp2y, .memia(.src1y, .tmp0, .add_size), ._, ._ },
+                        .{ ._, .v_pd, .cmp, .tmp3y, .tmp1y, .tmp1y, .vp(.unord) },
+                        .{ ._, .v_pd, .max, .tmp1y, .tmp2y, .tmp1y, ._ },
+                        .{ ._, .v_pd, .blendv, .tmp1y, .tmp1y, .tmp2y, .tmp3y },
+                        .{ ._, .v_pd, .mova, .memia(.dst0y, .tmp0, .add_size), .tmp1y, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(32), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse4_1, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .qword } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .qword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .vector_2_f64, .kind = .{ .reg = .xmm0 } },
+                        .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._pd, .mova, .tmp1x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._pd, .mova, .tmp2x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._pd, .mova, .tmp3x, .tmp2x, ._, ._ },
+                        .{ ._, ._pd, .max, .tmp3x, .tmp1x, ._, ._ },
+                        .{ ._, ._pd, .cmp, .tmp1x, .tmp1x, .vp(.unord), ._ },
+                        .{ ._, ._pd, .blendv, .tmp3x, .tmp2x, .tmp1x, ._ },
+                        .{ ._, ._pd, .mova, .memia(.dst0x, .tmp0, .add_size), .tmp3x, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse2, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .qword } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .qword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._pd, .mova, .tmp1x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._pd, .mova, .tmp2x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._pd, .mova, .tmp3x, .tmp2x, ._, ._ },
+                        .{ ._, ._pd, .max, .tmp3x, .tmp1x, ._, ._ },
+                        .{ ._, ._pd, .cmp, .tmp1x, .tmp1x, .vp(.ord), ._ },
+                        .{ ._, ._pd, .@"and", .tmp3x, .tmp1x, ._, ._ },
+                        .{ ._, ._pd, .andn, .tmp1x, .tmp2x, ._, ._ },
+                        .{ ._, ._pd, .@"or", .tmp1x, .tmp3x, ._, ._ },
+                        .{ ._, ._pd, .mova, .memia(.dst0x, .tmp0, .add_size), .tmp1x, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .qword, .is = .qword } },
+                        .{ .multiple_scalar_float = .{ .of = .qword, .is = .qword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .call_frame = .{ .alignment = .@"16" },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f64, .kind = .{ .reg = .xmm0 } },
+                        .{ .type = .f64, .kind = .{ .reg = .xmm1 } },
+                        .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "fmax" } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._ps, .xor, .tmp1x, .tmp1x, ._, ._ },
+                        .{ ._, ._ps, .xor, .tmp2x, .tmp2x, ._, ._ },
+                        .{ ._, ._ps, .movl, .tmp1x, .memia(.src0q, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._ps, .movl, .tmp2x, .memia(.src1q, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .call, .tmp3d, ._, ._, ._ },
+                        .{ ._, ._ps, .movl, .memia(.dst0q, .tmp0, .add_size), .tmp1q, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
                 }, .{
                     .required_features = .{ .x87, .cmov, null, null },
                     .src_constraints = .{
@@ -5750,6 +6427,172 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, .f_, .ld, .src1t, ._, ._, ._ },
                         .{ .@"1:", .f_p, .st, .dst0t, ._, ._, ._ },
                     } },
+                }, .{
+                    .required_features = .{ .x87, .cmov, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .tbyte } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .tbyte } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f80, .kind = .{ .reg = .st6 } },
+                        .{ .type = .f80, .kind = .{ .reg = .st7 } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .f_, .ld, .memia(.src1t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_, .ld, .memia(.src0t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_, .ucomi, .tmp1t, .tmp1t, ._, ._ },
+                        .{ ._, .f_u, .cmov, .tmp1t, .tmp2t, ._, ._ },
+                        .{ ._, .f_, .xch, .tmp2t, ._, ._, ._ },
+                        .{ ._, .f_, .ucomi, .tmp1t, .tmp2t, ._, ._ },
+                        .{ ._, .f_, .xch, .tmp2t, ._, ._, ._ },
+                        .{ ._, .f_nb, .cmov, .tmp1t, .tmp2t, ._, ._ },
+                        .{ ._, .f_p, .st, .memia(.dst0t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_p, .st, .tmp2t, ._, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sahf, .x87, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .tbyte } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .tbyte } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f80, .kind = .{ .reg = .st6 } },
+                        .{ .type = .f80, .kind = .{ .reg = .st7 } },
+                        .{ .type = .u8, .kind = .{ .reg = .ah } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .f_, .ld, .memia(.src1t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_, .ld, .memia(.src0t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_, .ucom, .tmp1t, ._, ._, ._ },
+                        .{ ._, .fn_sw, .st, .tmp3w, ._, ._, ._ },
+                        .{ ._, ._, .sahf, ._, ._, ._, ._ },
+                        .{ ._, ._p, .j, .@"1f", ._, ._, ._ },
+                        .{ ._, .f_, .xch, .tmp2t, ._, ._, ._ },
+                        .{ ._, .f_, .ucom, .tmp2t, ._, ._, ._ },
+                        .{ ._, .fn_sw, .st, .tmp3w, ._, ._, ._ },
+                        .{ ._, .f_, .xch, .tmp2t, ._, ._, ._ },
+                        .{ ._, ._, .sahf, ._, ._, ._, ._ },
+                        .{ ._, ._b, .j, .@"2f", ._, ._, ._ },
+                        .{ .@"1:", .f_p, .st, .tmp1t, ._, ._, ._ },
+                        .{ ._, .f_, .ld, .tmp2t, ._, ._, ._ },
+                        .{ .@"2:", .f_p, .st, .memia(.dst0t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_p, .st, .tmp2t, ._, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .x87, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .tbyte } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .tbyte } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f80, .kind = .{ .reg = .st6 } },
+                        .{ .type = .f80, .kind = .{ .reg = .st7 } },
+                        .{ .type = .u8, .kind = .{ .reg = .ah } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .f_, .ld, .memia(.src1t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_, .ld, .memia(.src0t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_, .xam, ._, ._, ._, ._ },
+                        .{ ._, .fn_sw, .st, .tmp3w, ._, ._, ._ },
+                        .{ ._, ._, .@"test", .tmp3b, .si(0b0_1_000_100), ._, ._ },
+                        .{ ._, ._z, .j, .@"1f", ._, ._, ._ },
+                        .{ ._, .f_, .xch, .tmp2t, ._, ._, ._ },
+                        .{ ._, .f_, .ucom, .tmp2t, ._, ._, ._ },
+                        .{ ._, .fn_sw, .st, .tmp3w, ._, ._, ._ },
+                        .{ ._, .f_, .xch, .tmp2t, ._, ._, ._ },
+                        .{ ._, ._, .@"test", .tmp3b, .si(0b0_0_000_001), ._, ._ },
+                        .{ ._, ._nz, .j, .@"2f", ._, ._, ._ },
+                        .{ .@"1:", .f_p, .st, .tmp1t, ._, ._, ._ },
+                        .{ ._, .f_, .ld, .tmp2t, ._, ._, ._ },
+                        .{ .@"2:", .f_p, .st, .memia(.dst0t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_p, .st, .tmp2t, ._, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .x87, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .tbyte } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .tbyte } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f80, .kind = .{ .reg = .st6 } },
+                        .{ .type = .f80, .kind = .{ .reg = .st7 } },
+                        .{ .type = .u8, .kind = .{ .reg = .ah } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .f_, .ld, .memia(.src1t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_, .ld, .memia(.src0t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_, .ucom, .tmp1t, ._, ._, ._ },
+                        .{ ._, .fn_sw, .st, .tmp3w, ._, ._, ._ },
+                        .{ ._, ._, .sahf, ._, ._, ._, ._ },
+                        .{ ._, ._p, .j, .@"1f", ._, ._, ._ },
+                        .{ ._, .f_, .xch, .tmp2t, ._, ._, ._ },
+                        .{ ._, .f_, .ucom, .tmp2t, ._, ._, ._ },
+                        .{ ._, .fn_sw, .st, .tmp3w, ._, ._, ._ },
+                        .{ ._, .f_, .xch, .tmp2t, ._, ._, ._ },
+                        .{ ._, ._, .sahf, ._, ._, ._, ._ },
+                        .{ ._, ._b, .j, .@"2f", ._, ._, ._ },
+                        .{ .@"1:", .f_p, .st, .tmp1t, ._, ._, ._ },
+                        .{ ._, .f_, .ld, .tmp2t, ._, ._, ._ },
+                        .{ .@"2:", .f_p, .st, .memia(.dst0t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_p, .st, .tmp2t, ._, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
                 }, .{
                     .required_features = .{ .sse, null, null, null },
                     .src_constraints = .{
@@ -5771,11 +6614,107 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .unused,
                         .unused,
                     },
-                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
                     .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
                     .each = .{ .once = &.{
                         .{ ._, ._, .call, .tmp0d, ._, ._, ._ },
                     } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .call_frame = .{ .alignment = .@"16" },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f128, .kind = .{ .reg = .xmm0 } },
+                        .{ .type = .f128, .kind = .{ .reg = .xmm1 } },
+                        .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "fmaxq" } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .v_dqa, .mov, .tmp1x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, .v_dqa, .mov, .tmp2x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .call, .tmp3d, ._, ._, ._ },
+                        .{ ._, .v_dqa, .mov, .memia(.dst0x, .tmp0, .add_size), .tmp1x, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse2, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .call_frame = .{ .alignment = .@"16" },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f128, .kind = .{ .reg = .xmm0 } },
+                        .{ .type = .f128, .kind = .{ .reg = .xmm1 } },
+                        .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "fmaxq" } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._dqa, .mov, .tmp1x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._dqa, .mov, .tmp2x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .call, .tmp3d, ._, ._, ._ },
+                        .{ ._, ._dqa, .mov, .memia(.dst0x, .tmp0, .add_size), .tmp1x, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .call_frame = .{ .alignment = .@"16" },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f128, .kind = .{ .reg = .xmm0 } },
+                        .{ .type = .f128, .kind = .{ .reg = .xmm1 } },
+                        .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "fmaxq" } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._ps, .mova, .tmp1x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._ps, .mova, .tmp2x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .call, .tmp3d, ._, ._, ._ },
+                        .{ ._, ._ps, .mova, .memia(.dst0x, .tmp0, .add_size), .tmp1x, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
                 } }) catch |err| switch (err) {
                     error.SelectFailed => return cg.fail("failed to select {s} {} {} {}", .{
                         @tagName(air_tag),
@@ -5787,10 +6726,8 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                 };
                 try res[0].finish(inst, &.{ bin_op.lhs, bin_op.rhs }, &ops, cg);
             },
-            .min => |air_tag| if (use_old) try cg.airBinOp(inst, air_tag) else fallback: {
+            .min => |air_tag| if (use_old) try cg.airBinOp(inst, air_tag) else {
                 const bin_op = air_datas[@intFromEnum(inst)].bin_op;
-                const ty = cg.typeOf(bin_op.lhs);
-                if (ty.isVector(zcu) and cg.floatBits(ty.childType(zcu)) != null) break :fallback try cg.airBinOp(inst, air_tag);
                 var ops = try cg.tempsFromOperands(inst, .{ bin_op.lhs, bin_op.rhs });
                 var res: [1]Temp = undefined;
                 cg.select(&res, &.{cg.typeOf(bin_op.lhs)}, &ops, comptime &.{ .{
@@ -7494,7 +8431,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .xword } } },
+                        .{ .type = .vector_4_u32, .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .xword } } },
                         .{ .type = .vector_4_u32, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
@@ -7631,7 +8568,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .xword } } },
+                        .{ .type = .vector_4_u32, .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .xword } } },
                         .{ .type = .vector_4_u32, .kind = .{ .rc = .sse } },
                         .{ .type = .vector_4_u32, .kind = .{ .rc = .sse } },
                         .{ .type = .vector_4_u32, .kind = .{ .rc = .sse } },
@@ -7955,7 +8892,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
+                        .{ .type = .u64, .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
                         .{ .type = .vector_2_u64, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
@@ -7986,7 +8923,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
+                        .{ .type = .u64, .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
                         .{ .type = .vector_2_u64, .kind = .{ .reg = .xmm0 } },
                         .{ .type = .vector_2_u64, .kind = .{ .rc = .sse } },
                         .unused,
@@ -8018,7 +8955,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
+                        .{ .type = .u64, .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
                         .{ .type = .vector_4_u64, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
@@ -8047,7 +8984,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
+                        .{ .type = .u64, .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
                         .{ .type = .vector_4_u64, .kind = .{ .rc = .sse } },
                         .{ .type = .vector_4_u64, .kind = .{ .rc = .sse } },
                         .{ .type = .vector_4_u64, .kind = .{ .rc = .sse } },
@@ -8083,7 +9020,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
+                        .{ .type = .u64, .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
                         .{ .type = .vector_2_u64, .kind = .{ .rc = .sse } },
                         .{ .type = .vector_2_u64, .kind = .{ .rc = .sse } },
                         .{ .type = .vector_2_u64, .kind = .{ .rc = .sse } },
@@ -8119,7 +9056,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
+                        .{ .type = .u64, .kind = .{ .smin_mem = .{ .ref = .src0, .vectorize_to = .none } } },
                         .{ .type = .vector_2_u64, .kind = .{ .rc = .sse } },
                         .{ .type = .vector_2_u64, .kind = .{ .rc = .sse } },
                         .{ .type = .vector_2_u64, .kind = .{ .rc = .sse } },
@@ -8371,8 +9308,8 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .to_sse } },
                     },
                     .extra_temps = .{
-                        .{ .kind = .{ .mut_rc = .{ .ref = .src1, .rc = .sse } } },
-                        .{ .kind = .{ .rc = .sse } },
+                        .{ .type = .f16, .kind = .{ .mut_rc = .{ .ref = .src1, .rc = .sse } } },
+                        .{ .type = .f16, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
                         .unused,
@@ -8383,12 +9320,12 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .dst_temps = .{.{ .mut_rc = .{ .ref = .src0, .rc = .sse } }},
                     .each = .{ .once = &.{
-                        .{ ._, .v_ps, .cvtph2, .dst0x, .src0x, ._, ._ },
-                        .{ ._, .v_ps, .cvtph2, .tmp0x, .src1x, ._, ._ },
+                        .{ ._, .v_ps, .cvtph2, .dst0x, .src0q, ._, ._ },
+                        .{ ._, .v_ps, .cvtph2, .tmp0x, .src1q, ._, ._ },
                         .{ ._, .v_ss, .cmp, .tmp1x, .dst0x, .dst0x, .vp(.unord) },
                         .{ ._, .v_ss, .min, .dst0x, .tmp0x, .dst0x, ._ },
                         .{ ._, .v_ps, .blendv, .dst0x, .dst0x, .tmp0x, .tmp1x },
-                        .{ ._, .v_, .cvtps2ph, .dst0x, .dst0x, .rm(.{}), ._ },
+                        .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
                     } },
                 }, .{
                     .required_features = .{ .sse, null, null, null },
@@ -8411,11 +9348,248 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .unused,
                         .unused,
                     },
-                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
                     .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
                     .each = .{ .once = &.{
                         .{ ._, ._, .call, .tmp0d, ._, ._, ._ },
                     } },
+                }, .{
+                    .required_features = .{ .f16c, null, null, null },
+                    .src_constraints = .{
+                        .{ .scalar_float = .{ .of = .qword, .is = .word } },
+                        .{ .scalar_float = .{ .of = .qword, .is = .word } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .mem } },
+                        .{ .src = .{ .to_sse, .mem } },
+                        .{ .src = .{ .mem, .to_sse } },
+                        .{ .src = .{ .to_sse, .to_sse } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .vector_4_f16, .kind = .{ .mut_rc = .{ .ref = .src1, .rc = .sse } } },
+                        .{ .type = .vector_4_f16, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .mut_rc = .{ .ref = .src0, .rc = .sse } }},
+                    .each = .{ .once = &.{
+                        .{ ._, .v_ps, .cvtph2, .dst0x, .src0q, ._, ._ },
+                        .{ ._, .v_ps, .cvtph2, .tmp0x, .src1q, ._, ._ },
+                        .{ ._, .v_ps, .cmp, .tmp1x, .dst0x, .dst0x, .vp(.unord) },
+                        .{ ._, .v_ps, .min, .dst0x, .tmp0x, .dst0x, ._ },
+                        .{ ._, .v_ps, .blendv, .dst0x, .dst0x, .tmp0x, .tmp1x },
+                        .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .f16c, null, null, null },
+                    .src_constraints = .{
+                        .{ .scalar_float = .{ .of = .xword, .is = .word } },
+                        .{ .scalar_float = .{ .of = .xword, .is = .word } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .mem } },
+                        .{ .src = .{ .to_sse, .mem } },
+                        .{ .src = .{ .mem, .to_sse } },
+                        .{ .src = .{ .to_sse, .to_sse } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .vector_8_f16, .kind = .{ .mut_rc = .{ .ref = .src1, .rc = .sse } } },
+                        .{ .type = .vector_8_f16, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .mut_rc = .{ .ref = .src0, .rc = .sse } }},
+                    .each = .{ .once = &.{
+                        .{ ._, .v_ps, .cvtph2, .dst0y, .src0x, ._, ._ },
+                        .{ ._, .v_ps, .cvtph2, .tmp0y, .src1x, ._, ._ },
+                        .{ ._, .v_ps, .cmp, .tmp1y, .dst0y, .dst0y, .vp(.unord) },
+                        .{ ._, .v_ps, .min, .dst0y, .tmp0y, .dst0y, ._ },
+                        .{ ._, .v_ps, .blendv, .dst0y, .dst0y, .tmp0y, .tmp1y },
+                        .{ ._, .v_, .cvtps2ph, .dst0q, .dst0y, .rm(.{}), ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .f16c, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .word } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .word } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .vector_8_f16, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_8_f16, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_8_f16, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .v_ps, .cvtph2, .tmp1y, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, .v_ps, .cvtph2, .tmp2y, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, .v_ps, .cmp, .tmp3y, .tmp1y, .tmp1y, .vp(.unord) },
+                        .{ ._, .v_ps, .min, .tmp1y, .tmp2y, .tmp1y, ._ },
+                        .{ ._, .v_ps, .blendv, .tmp1y, .tmp1y, .tmp2y, .tmp3y },
+                        .{ ._, .v_, .cvtps2ph, .memia(.dst0x, .tmp0, .add_size), .tmp1y, .rm(.{}), ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .word, .is = .word } },
+                        .{ .multiple_scalar_float = .{ .of = .word, .is = .word } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .call_frame = .{ .alignment = .@"16" },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f16, .kind = .{ .reg = .xmm0 } },
+                        .{ .type = .f16, .kind = .{ .reg = .xmm1 } },
+                        .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__fminh" } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .vp_, .xor, .tmp2x, .tmp2x, .tmp2x, ._ },
+                        .{ ._, .vp_w, .insr, .tmp1x, .tmp2x, .memia(.src0w, .tmp0, .add_size), .ui(0) },
+                        .{ ._, .vp_w, .insr, .tmp2x, .tmp2x, .memia(.src1w, .tmp0, .add_size), .ui(0) },
+                        .{ ._, ._, .call, .tmp3d, ._, ._, ._ },
+                        .{ ._, .vp_w, .extr, .memia(.dst0w, .tmp0, .add_size), .tmp1x, .ui(0), ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(2), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse4_1, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .word, .is = .word } },
+                        .{ .multiple_scalar_float = .{ .of = .word, .is = .word } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .call_frame = .{ .alignment = .@"16" },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f16, .kind = .{ .reg = .xmm0 } },
+                        .{ .type = .f16, .kind = .{ .reg = .xmm1 } },
+                        .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__fminh" } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .p_, .xor, .tmp1x, .tmp1x, ._, ._ },
+                        .{ ._, .p_, .xor, .tmp2x, .tmp2x, ._, ._ },
+                        .{ ._, .p_w, .insr, .tmp1x, .memia(.src0w, .tmp0, .add_size), .ui(0), ._ },
+                        .{ ._, .p_w, .insr, .tmp2x, .memia(.src1w, .tmp0, .add_size), .ui(0), ._ },
+                        .{ ._, ._, .call, .tmp3d, ._, ._, ._ },
+                        .{ ._, .p_w, .extr, .memia(.dst0w, .tmp0, .add_size), .tmp1x, .ui(0), ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(2), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse2, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .word, .is = .word } },
+                        .{ .multiple_scalar_float = .{ .of = .word, .is = .word } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .call_frame = .{ .alignment = .@"16" },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f16, .kind = .{ .reg = .xmm0 } },
+                        .{ .type = .f16, .kind = .{ .reg = .xmm1 } },
+                        .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__fminh" } } },
+                        .{ .type = .f16, .kind = .{ .reg = .ax } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .p_, .xor, .tmp1x, .tmp1x, ._, ._ },
+                        .{ ._, .p_, .xor, .tmp2x, .tmp2x, ._, ._ },
+                        .{ ._, .p_w, .insr, .tmp1x, .memia(.src0w, .tmp0, .add_size), .ui(0), ._ },
+                        .{ ._, .p_w, .insr, .tmp2x, .memia(.src1w, .tmp0, .add_size), .ui(0), ._ },
+                        .{ ._, ._, .call, .tmp3d, ._, ._, ._ },
+                        .{ ._, .p_w, .extr, .tmp4d, .tmp1x, .ui(0), ._ },
+                        .{ ._, ._, .mov, .memia(.dst0w, .tmp0, .add_size), .tmp4w, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(2), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .word, .is = .word } },
+                        .{ .multiple_scalar_float = .{ .of = .word, .is = .word } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .call_frame = .{ .alignment = .@"16" },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f16, .kind = .{ .reg = .eax } },
+                        .{ .type = .f32, .kind = .mem },
+                        .{ .type = .f16, .kind = .{ .reg = .xmm0 } },
+                        .{ .type = .f16, .kind = .{ .reg = .xmm1 } },
+                        .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__fminh" } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._, .movzx, .tmp1d, .memia(.src0w, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .mov, .mem(.tmp2d), .tmp1d, ._, ._ },
+                        .{ ._, ._ss, .mov, .tmp3x, .mem(.tmp2d), ._, ._ },
+                        .{ ._, ._, .movzx, .tmp1d, .memia(.src1w, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .mov, .mem(.tmp2d), .tmp1d, ._, ._ },
+                        .{ ._, ._ss, .mov, .tmp4x, .mem(.tmp2d), ._, ._ },
+                        .{ ._, ._, .call, .tmp5d, ._, ._, ._ },
+                        .{ ._, ._ss, .mov, .mem(.tmp2d), .tmp3x, ._, ._ },
+                        .{ ._, ._, .mov, .tmp1d, .mem(.tmp2d), ._, ._ },
+                        .{ ._, ._, .mov, .memia(.dst0w, .tmp0, .add_size), .tmp1w, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(2), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
                 }, .{
                     .required_features = .{ .avx, null, null, null },
                     .src_constraints = .{
@@ -8426,7 +9600,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .to_sse } },
                     },
                     .extra_temps = .{
-                        .{ .kind = .{ .rc = .sse } },
+                        .{ .type = .f32, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
                         .unused,
@@ -8487,6 +9661,210 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._ps, .andn, .dst0x, .src1x, ._, ._ },
                         .{ ._, ._ps, .@"or", .dst0x, .tmp0x, ._, ._ },
                     } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{
+                        .{ .scalar_float = .{ .of = .xword, .is = .dword } },
+                        .{ .scalar_float = .{ .of = .xword, .is = .dword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_sse, .to_sse } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .mut_rc = .{ .ref = .src0, .rc = .sse } }},
+                    .each = .{ .once = &.{
+                        .{ ._, .v_ps, .cmp, .tmp0x, .src0x, .src0x, .vp(.unord) },
+                        .{ ._, .v_ps, .min, .dst0x, .src1x, .src0x, ._ },
+                        .{ ._, .v_ps, .blendv, .dst0x, .dst0x, .src1x, .tmp0x },
+                    } },
+                }, .{
+                    .required_features = .{ .sse4_1, null, null, null },
+                    .src_constraints = .{
+                        .{ .scalar_float = .{ .of = .xword, .is = .dword } },
+                        .{ .scalar_float = .{ .of = .xword, .is = .dword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .{ .to_reg = .xmm0 }, .mem } },
+                        .{ .src = .{ .mem, .{ .to_reg = .xmm0 } }, .commute = .{ 0, 1 } },
+                        .{ .src = .{ .{ .to_reg = .xmm0 }, .to_sse } },
+                    },
+                    .dst_temps = .{.{ .rc = .sse }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._ps, .mova, .dst0x, .src1x, ._, ._ },
+                        .{ ._, ._ps, .min, .dst0x, .src0x, ._, ._ },
+                        .{ ._, ._ps, .cmp, .src0x, .src0x, .vp(.unord), ._ },
+                        .{ ._, ._ps, .blendv, .dst0x, .src1x, .src0x, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{
+                        .{ .scalar_float = .{ .of = .xword, .is = .dword } },
+                        .{ .scalar_float = .{ .of = .xword, .is = .dword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_sse, .mem } },
+                        .{ .src = .{ .mem, .to_mut_sse }, .commute = .{ 0, 1 } },
+                        .{ .src = .{ .to_mut_sse, .to_sse } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._ps, .mova, .tmp0x, .src1x, ._, ._ },
+                        .{ ._, ._ps, .min, .tmp0x, .src0x, ._, ._ },
+                        .{ ._, ._ps, .cmp, .dst0x, .src0x, .vp(.ord), ._ },
+                        .{ ._, ._ps, .@"and", .tmp0x, .dst0x, ._, ._ },
+                        .{ ._, ._ps, .andn, .dst0x, .src1x, ._, ._ },
+                        .{ ._, ._ps, .@"or", .dst0x, .tmp0x, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{
+                        .{ .scalar_float = .{ .of = .yword, .is = .dword } },
+                        .{ .scalar_float = .{ .of = .yword, .is = .dword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_sse, .to_sse } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .mut_rc = .{ .ref = .src0, .rc = .sse } }},
+                    .each = .{ .once = &.{
+                        .{ ._, .v_ps, .cmp, .tmp0y, .src0y, .src0y, .vp(.unord) },
+                        .{ ._, .v_ps, .min, .dst0y, .src1y, .src0y, ._ },
+                        .{ ._, .v_ps, .blendv, .dst0y, .dst0y, .src1y, .tmp0y },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .yword, .is = .dword } },
+                        .{ .multiple_scalar_float = .{ .of = .yword, .is = .dword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .v_ps, .mova, .tmp1y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
+                        .{ ._, .v_ps, .mova, .tmp2y, .memia(.src1y, .tmp0, .add_size), ._, ._ },
+                        .{ ._, .v_ps, .cmp, .tmp3y, .tmp1y, .tmp1y, .vp(.unord) },
+                        .{ ._, .v_ps, .min, .tmp1y, .tmp2y, .tmp1y, ._ },
+                        .{ ._, .v_ps, .blendv, .tmp1y, .tmp1y, .tmp2y, .tmp3y },
+                        .{ ._, .v_ps, .mova, .memia(.dst0y, .tmp0, .add_size), .tmp1y, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(32), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse4_1, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .dword } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .dword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .vector_4_f32, .kind = .{ .reg = .xmm0 } },
+                        .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._ps, .mova, .tmp1x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._ps, .mova, .tmp2x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._ps, .mova, .tmp3x, .tmp2x, ._, ._ },
+                        .{ ._, ._ps, .min, .tmp3x, .tmp1x, ._, ._ },
+                        .{ ._, ._ps, .cmp, .tmp1x, .tmp1x, .vp(.unord), ._ },
+                        .{ ._, ._ps, .blendv, .tmp3x, .tmp2x, .tmp1x, ._ },
+                        .{ ._, ._ps, .mova, .memia(.dst0x, .tmp0, .add_size), .tmp3x, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .dword } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .dword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._ps, .mova, .tmp1x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._ps, .mova, .tmp2x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._ps, .mova, .tmp3x, .tmp2x, ._, ._ },
+                        .{ ._, ._ps, .min, .tmp3x, .tmp1x, ._, ._ },
+                        .{ ._, ._ps, .cmp, .tmp1x, .tmp1x, .vp(.ord), ._ },
+                        .{ ._, ._ps, .@"and", .tmp3x, .tmp1x, ._, ._ },
+                        .{ ._, ._ps, .andn, .tmp1x, .tmp2x, ._, ._ },
+                        .{ ._, ._ps, .@"or", .tmp1x, .tmp3x, ._, ._ },
+                        .{ ._, ._ps, .mova, .memia(.dst0x, .tmp0, .add_size), .tmp1x, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
                 }, .{
                     .required_features = .{ .avx, null, null, null },
                     .src_constraints = .{
@@ -8497,7 +9875,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .to_sse, .to_sse } },
                     },
                     .extra_temps = .{
-                        .{ .kind = .{ .rc = .sse } },
+                        .{ .type = .f64, .kind = .{ .rc = .sse } },
                         .unused,
                         .unused,
                         .unused,
@@ -8579,11 +9957,249 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .unused,
                         .unused,
                     },
-                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
                     .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
                     .each = .{ .once = &.{
                         .{ ._, ._, .call, .tmp0d, ._, ._, ._ },
                     } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{
+                        .{ .scalar_float = .{ .of = .xword, .is = .qword } },
+                        .{ .scalar_float = .{ .of = .xword, .is = .qword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_sse, .to_sse } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .mut_rc = .{ .ref = .src0, .rc = .sse } }},
+                    .each = .{ .once = &.{
+                        .{ ._, .v_pd, .cmp, .tmp0x, .src0x, .src0x, .vp(.unord) },
+                        .{ ._, .v_pd, .min, .dst0x, .src1x, .src0x, ._ },
+                        .{ ._, .v_pd, .blendv, .dst0x, .dst0x, .src1x, .tmp0x },
+                    } },
+                }, .{
+                    .required_features = .{ .sse4_1, null, null, null },
+                    .src_constraints = .{
+                        .{ .scalar_float = .{ .of = .xword, .is = .qword } },
+                        .{ .scalar_float = .{ .of = .xword, .is = .qword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .{ .to_reg = .xmm0 }, .mem } },
+                        .{ .src = .{ .mem, .{ .to_reg = .xmm0 } }, .commute = .{ 0, 1 } },
+                        .{ .src = .{ .{ .to_reg = .xmm0 }, .to_sse } },
+                    },
+                    .dst_temps = .{.{ .rc = .sse }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._pd, .mova, .dst0x, .src1x, ._, ._ },
+                        .{ ._, ._pd, .min, .dst0x, .src0x, ._, ._ },
+                        .{ ._, ._pd, .cmp, .src0x, .src0x, .vp(.unord), ._ },
+                        .{ ._, ._pd, .blendv, .dst0x, .src1x, .src0x, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse2, null, null, null },
+                    .src_constraints = .{
+                        .{ .scalar_float = .{ .of = .xword, .is = .qword } },
+                        .{ .scalar_float = .{ .of = .xword, .is = .qword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_sse, .mem } },
+                        .{ .src = .{ .mem, .to_mut_sse }, .commute = .{ 0, 1 } },
+                        .{ .src = .{ .to_mut_sse, .to_sse } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._pd, .mova, .tmp0x, .src1x, ._, ._ },
+                        .{ ._, ._pd, .min, .tmp0x, .src0x, ._, ._ },
+                        .{ ._, ._pd, .cmp, .dst0x, .src0x, .vp(.ord), ._ },
+                        .{ ._, ._pd, .@"and", .tmp0x, .dst0x, ._, ._ },
+                        .{ ._, ._pd, .andn, .dst0x, .src1x, ._, ._ },
+                        .{ ._, ._pd, .@"or", .dst0x, .tmp0x, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{
+                        .{ .scalar_float = .{ .of = .yword, .is = .qword } },
+                        .{ .scalar_float = .{ .of = .yword, .is = .qword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_sse, .to_sse } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .vector_4_f64, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .mut_rc = .{ .ref = .src0, .rc = .sse } }},
+                    .each = .{ .once = &.{
+                        .{ ._, .v_pd, .cmp, .tmp0y, .src0y, .src0y, .vp(.unord) },
+                        .{ ._, .v_pd, .min, .dst0y, .src1y, .src0y, ._ },
+                        .{ ._, .v_pd, .blendv, .dst0y, .dst0y, .src1y, .tmp0y },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .yword, .is = .qword } },
+                        .{ .multiple_scalar_float = .{ .of = .yword, .is = .qword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .vector_4_f64, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_4_f64, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_4_f64, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .v_pd, .mova, .tmp1y, .memia(.src0y, .tmp0, .add_size), ._, ._ },
+                        .{ ._, .v_pd, .mova, .tmp2y, .memia(.src1y, .tmp0, .add_size), ._, ._ },
+                        .{ ._, .v_pd, .cmp, .tmp3y, .tmp1y, .tmp1y, .vp(.unord) },
+                        .{ ._, .v_pd, .min, .tmp1y, .tmp2y, .tmp1y, ._ },
+                        .{ ._, .v_pd, .blendv, .tmp1y, .tmp1y, .tmp2y, .tmp3y },
+                        .{ ._, .v_pd, .mova, .memia(.dst0y, .tmp0, .add_size), .tmp1y, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(32), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse4_1, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .qword } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .qword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .vector_2_f64, .kind = .{ .reg = .xmm0 } },
+                        .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._pd, .mova, .tmp1x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._pd, .mova, .tmp2x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._pd, .mova, .tmp3x, .tmp2x, ._, ._ },
+                        .{ ._, ._pd, .min, .tmp3x, .tmp1x, ._, ._ },
+                        .{ ._, ._pd, .cmp, .tmp1x, .tmp1x, .vp(.unord), ._ },
+                        .{ ._, ._pd, .blendv, .tmp3x, .tmp2x, .tmp1x, ._ },
+                        .{ ._, ._pd, .mova, .memia(.dst0x, .tmp0, .add_size), .tmp3x, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse2, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .qword } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .qword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                        .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._pd, .mova, .tmp1x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._pd, .mova, .tmp2x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._pd, .mova, .tmp3x, .tmp2x, ._, ._ },
+                        .{ ._, ._pd, .min, .tmp3x, .tmp1x, ._, ._ },
+                        .{ ._, ._pd, .cmp, .tmp1x, .tmp1x, .vp(.ord), ._ },
+                        .{ ._, ._pd, .@"and", .tmp3x, .tmp1x, ._, ._ },
+                        .{ ._, ._pd, .andn, .tmp1x, .tmp2x, ._, ._ },
+                        .{ ._, ._pd, .@"or", .tmp1x, .tmp3x, ._, ._ },
+                        .{ ._, ._pd, .mova, .memia(.dst0x, .tmp0, .add_size), .tmp1x, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .qword, .is = .qword } },
+                        .{ .multiple_scalar_float = .{ .of = .qword, .is = .qword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .call_frame = .{ .alignment = .@"16" },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f64, .kind = .{ .reg = .xmm0 } },
+                        .{ .type = .f64, .kind = .{ .reg = .xmm1 } },
+                        .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "fmin" } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._ps, .xor, .tmp1x, .tmp1x, ._, ._ },
+                        .{ ._, ._ps, .xor, .tmp2x, .tmp2x, ._, ._ },
+                        .{ ._, ._ps, .movl, .tmp1x, .memia(.src0q, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._ps, .movl, .tmp2x, .memia(.src1q, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .call, .tmp3d, ._, ._, ._ },
+                        .{ ._, ._ps, .movl, .memia(.dst0q, .tmp0, .add_size), .tmp1q, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
                 }, .{
                     .required_features = .{ .x87, .cmov, null, null },
                     .src_constraints = .{
@@ -8730,6 +10346,164 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, .f_, .ld, .src1t, ._, ._, ._ },
                         .{ .@"1:", .f_p, .st, .dst0t, ._, ._, ._ },
                     } },
+                }, .{
+                    .required_features = .{ .x87, .cmov, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .tbyte } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .tbyte } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f80, .kind = .{ .reg = .st6 } },
+                        .{ .type = .f80, .kind = .{ .reg = .st7 } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .f_, .ld, .memia(.src1t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_, .ld, .memia(.src0t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_, .ucomi, .tmp1t, .tmp1t, ._, ._ },
+                        .{ ._, .f_u, .cmov, .tmp1t, .tmp2t, ._, ._ },
+                        .{ ._, .f_, .ucomi, .tmp1t, .tmp2t, ._, ._ },
+                        .{ ._, .f_nb, .cmov, .tmp1t, .tmp2t, ._, ._ },
+                        .{ ._, .f_p, .st, .memia(.dst0t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_p, .st, .tmp2t, ._, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sahf, .x87, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .tbyte } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .tbyte } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f80, .kind = .{ .reg = .st6 } },
+                        .{ .type = .f80, .kind = .{ .reg = .st7 } },
+                        .{ .type = .u8, .kind = .{ .reg = .ah } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .f_, .ld, .memia(.src1t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_, .ld, .memia(.src0t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_, .ucom, .tmp1t, ._, ._, ._ },
+                        .{ ._, .fn_sw, .st, .tmp3w, ._, ._, ._ },
+                        .{ ._, ._, .sahf, ._, ._, ._, ._ },
+                        .{ ._, ._p, .j, .@"1f", ._, ._, ._ },
+                        .{ ._, .f_, .ucom, .tmp2t, ._, ._, ._ },
+                        .{ ._, .fn_sw, .st, .tmp3w, ._, ._, ._ },
+                        .{ ._, ._, .sahf, ._, ._, ._, ._ },
+                        .{ ._, ._b, .j, .@"2f", ._, ._, ._ },
+                        .{ .@"1:", .f_p, .st, .tmp1t, ._, ._, ._ },
+                        .{ ._, .f_, .ld, .tmp2t, ._, ._, ._ },
+                        .{ .@"2:", .f_p, .st, .memia(.dst0t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_p, .st, .tmp2t, ._, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .x87, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .tbyte } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .tbyte } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f80, .kind = .{ .reg = .st6 } },
+                        .{ .type = .f80, .kind = .{ .reg = .st7 } },
+                        .{ .type = .u8, .kind = .{ .reg = .ah } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .f_, .ld, .memia(.src1t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_, .ld, .memia(.src0t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_, .xam, ._, ._, ._, ._ },
+                        .{ ._, .fn_sw, .st, .tmp3w, ._, ._, ._ },
+                        .{ ._, ._, .@"test", .tmp3b, .si(0b0_1_000_100), ._, ._ },
+                        .{ ._, ._z, .j, .@"1f", ._, ._, ._ },
+                        .{ ._, .f_, .ucom, .tmp2t, ._, ._, ._ },
+                        .{ ._, .fn_sw, .st, .tmp3w, ._, ._, ._ },
+                        .{ ._, ._, .@"test", .tmp3b, .si(0b0_0_000_001), ._, ._ },
+                        .{ ._, ._nz, .j, .@"2f", ._, ._, ._ },
+                        .{ .@"1:", .f_p, .st, .tmp1t, ._, ._, ._ },
+                        .{ ._, .f_, .ld, .tmp2t, ._, ._, ._ },
+                        .{ .@"2:", .f_p, .st, .memia(.dst0t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_p, .st, .tmp2t, ._, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .x87, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .tbyte } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .tbyte } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f80, .kind = .{ .reg = .st6 } },
+                        .{ .type = .f80, .kind = .{ .reg = .st7 } },
+                        .{ .type = .u8, .kind = .{ .reg = .ah } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0p, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .f_, .ld, .memia(.src1t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_, .ld, .memia(.src0t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_, .ucom, .tmp1t, ._, ._, ._ },
+                        .{ ._, .fn_sw, .st, .tmp3w, ._, ._, ._ },
+                        .{ ._, ._, .sahf, ._, ._, ._, ._ },
+                        .{ ._, ._p, .j, .@"1f", ._, ._, ._ },
+                        .{ ._, .f_, .ucom, .tmp2t, ._, ._, ._ },
+                        .{ ._, .fn_sw, .st, .tmp3w, ._, ._, ._ },
+                        .{ ._, ._, .sahf, ._, ._, ._, ._ },
+                        .{ ._, ._b, .j, .@"2f", ._, ._, ._ },
+                        .{ .@"1:", .f_p, .st, .tmp1t, ._, ._, ._ },
+                        .{ ._, .f_, .ld, .tmp2t, ._, ._, ._ },
+                        .{ .@"2:", .f_p, .st, .memia(.dst0t, .tmp0, .add_size), ._, ._, ._ },
+                        .{ ._, .f_p, .st, .tmp2t, ._, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
                 }, .{
                     .required_features = .{ .sse, null, null, null },
                     .src_constraints = .{
@@ -8751,11 +10525,107 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .unused,
                         .unused,
                     },
-                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
                     .dst_temps = .{.{ .ref = .src0 }},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
                     .each = .{ .once = &.{
                         .{ ._, ._, .call, .tmp0d, ._, ._, ._ },
                     } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .call_frame = .{ .alignment = .@"16" },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f128, .kind = .{ .reg = .xmm0 } },
+                        .{ .type = .f128, .kind = .{ .reg = .xmm1 } },
+                        .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "fminq" } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", .v_dqa, .mov, .tmp1x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, .v_dqa, .mov, .tmp2x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .call, .tmp3d, ._, ._, ._ },
+                        .{ ._, .v_dqa, .mov, .memia(.dst0x, .tmp0, .add_size), .tmp1x, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse2, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .call_frame = .{ .alignment = .@"16" },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f128, .kind = .{ .reg = .xmm0 } },
+                        .{ .type = .f128, .kind = .{ .reg = .xmm1 } },
+                        .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "fminq" } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._dqa, .mov, .tmp1x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._dqa, .mov, .tmp2x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .call, .tmp3d, ._, ._, ._ },
+                        .{ ._, ._dqa, .mov, .memia(.dst0x, .tmp0, .add_size), .tmp1x, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } },
+                        .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } },
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem } },
+                    },
+                    .call_frame = .{ .alignment = .@"16" },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .f128, .kind = .{ .reg = .xmm0 } },
+                        .{ .type = .f128, .kind = .{ .reg = .xmm1 } },
+                        .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "fminq" } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.mem},
+                    .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .sa(.src0, .sub_size), ._, ._ },
+                        .{ .@"0:", ._ps, .mova, .tmp1x, .memia(.src0x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._ps, .mova, .tmp2x, .memia(.src1x, .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .call, .tmp3d, ._, ._, ._ },
+                        .{ ._, ._ps, .mova, .memia(.dst0x, .tmp0, .add_size), .tmp1x, ._, ._ },
+                        .{ ._, ._, .add, .tmp0q, .si(16), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
                 } }) catch |err| switch (err) {
                     error.SelectFailed => return cg.fail("failed to select {s} {} {} {}", .{
                         @tagName(air_tag),
@@ -8955,7 +10825,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         },
                         .extra_temps = .{
                             .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                            .{ .kind = .{ .rc = .sse } },
+                            .{ .type = .vector_32_u8, .kind = .{ .rc = .sse } },
                             .unused,
                             .unused,
                             .unused,
@@ -8982,7 +10852,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         },
                         .extra_temps = .{
                             .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                            .{ .kind = .{ .rc = .sse } },
+                            .{ .type = .vector_32_u8, .kind = .{ .rc = .sse } },
                             .unused,
                             .unused,
                             .unused,
@@ -9009,7 +10879,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         },
                         .extra_temps = .{
                             .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                            .{ .kind = .{ .rc = .sse } },
+                            .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                             .unused,
                             .unused,
                             .unused,
@@ -9036,7 +10906,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         },
                         .extra_temps = .{
                             .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                            .{ .kind = .{ .rc = .sse } },
+                            .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                             .unused,
                             .unused,
                             .unused,
@@ -9063,7 +10933,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         },
                         .extra_temps = .{
                             .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
-                            .{ .kind = .{ .rc = .sse } },
+                            .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } },
                             .unused,
                             .unused,
                             .unused,
@@ -15226,8 +17096,8 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .info = .{ .kind = .all, .scalar = .dword },
                             } }},
                             .each = .{ .once = &.{
-                                .{ ._, .v_ps, .cvtph2, .dst0x, .src0x, ._, ._ },
-                                .{ ._, .v_ps, .cvtph2, .tmp0x, .src1x, ._, ._ },
+                                .{ ._, .v_ps, .cvtph2, .dst0x, .src0q, ._, ._ },
+                                .{ ._, .v_ps, .cvtph2, .tmp0x, .src1q, ._, ._ },
                                 .{ ._, .v_ss, .cmp, .dst0x, .dst0x, .tmp0x, .vp(switch (cc) {
                                     else => unreachable,
                                     .e => .eq,
@@ -15815,7 +17685,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ .type = .i32, .kind = .{ .reg = .eax } },
                                 .{ .type = .u8, .kind = .{ .reg = .cl } },
                                 .{ .type = .u32, .kind = .{ .reg = .edx } },
-                                .{ .type = .vector_8_f16, .kind = .mem },
+                                .{ .type = .f32, .kind = .mem },
                                 .unused,
                             },
                             .dst_temps = .{.{ .rc = .general_purpose }},
@@ -15825,10 +17695,10 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ },
                                 .{ .@"0:", ._, .movzx, .tmp4d, .memsi(.src0w, .@"2", .tmp0), ._, ._ },
                                 .{ ._, ._, .mov, .mem(.tmp7d), .tmp4d, ._, ._ },
-                                .{ ._, ._ps, .mova, .tmp1x, .mem(.tmp7x), ._, ._ },
+                                .{ ._, ._ss, .mov, .tmp1x, .mem(.tmp7d), ._, ._ },
                                 .{ ._, ._, .movzx, .tmp4d, .memsi(.src1w, .@"2", .tmp0), ._, ._ },
                                 .{ ._, ._, .mov, .mem(.tmp7d), .tmp4d, ._, ._ },
-                                .{ ._, ._ps, .mova, .tmp2x, .mem(.tmp7x), ._, ._ },
+                                .{ ._, ._ss, .mov, .tmp2x, .mem(.tmp7d), ._, ._ },
                                 .{ ._, ._, .call, .tmp3d, ._, ._, ._ },
                                 .{ ._, ._, .xor, .tmp6d, .tmp6d, ._, ._ },
                                 .{ ._, ._, .@"test", .tmp4d, .tmp4d, ._, ._ },
@@ -15863,7 +17733,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ .type = .i32, .kind = .{ .reg = .eax } },
                                 .{ .type = .u8, .kind = .{ .reg = .cl } },
                                 .{ .type = .u32, .kind = .{ .reg = .edx } },
-                                .{ .type = .vector_8_f16, .kind = .mem },
+                                .{ .type = .f32, .kind = .mem },
                                 .unused,
                             },
                             .dst_temps = .{.{ .rc = .general_purpose }},
@@ -15873,10 +17743,10 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, ._, .xor, .tmp0d, .tmp0d, ._, ._ },
                                 .{ .@"0:", ._, .movzx, .tmp4d, .memsi(.src0w, .@"2", .tmp0), ._, ._ },
                                 .{ ._, ._, .mov, .mem(.tmp7d), .tmp4d, ._, ._ },
-                                .{ ._, ._ps, .mova, .tmp1x, .mem(.tmp7x), ._, ._ },
+                                .{ ._, ._ss, .mov, .tmp1x, .mem(.tmp7d), ._, ._ },
                                 .{ ._, ._, .movzx, .tmp4d, .memsi(.src1w, .@"2", .tmp0), ._, ._ },
                                 .{ ._, ._, .mov, .mem(.tmp7d), .tmp4d, ._, ._ },
-                                .{ ._, ._ps, .mova, .tmp2x, .mem(.tmp7x), ._, ._ },
+                                .{ ._, ._ss, .mov, .tmp2x, .mem(.tmp7d), ._, ._ },
                                 .{ ._, ._, .call, .tmp3d, ._, ._, ._ },
                                 .{ ._, ._, .xor, .tmp6d, .tmp6d, ._, ._ },
                                 .{ ._, ._, .@"test", .tmp4d, .tmp4d, ._, ._ },
@@ -16133,7 +18003,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ .type = .i32, .kind = .{ .reg = .eax } },
                                 .{ .type = .u8, .kind = .{ .reg = .cl } },
                                 .{ .type = .u64, .kind = .{ .reg = .rdx } },
-                                .{ .type = .vector_8_f16, .kind = .mem },
+                                .{ .type = .f32, .kind = .mem },
                             },
                             .dst_temps = .{.mem},
                             .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
@@ -16142,10 +18012,10 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
                                 .{ .@"0:", ._, .movzx, .tmp5d, .memsi(.src0w, .@"2", .tmp0), ._, ._ },
                                 .{ ._, ._, .mov, .mem(.tmp8d), .tmp5d, ._, ._ },
-                                .{ ._, ._ps, .mova, .tmp2x, .mem(.tmp8x), ._, ._ },
+                                .{ ._, ._ss, .mov, .tmp2x, .mem(.tmp8d), ._, ._ },
                                 .{ ._, ._, .movzx, .tmp5d, .memsi(.src1w, .@"2", .tmp0), ._, ._ },
                                 .{ ._, ._, .mov, .mem(.tmp8d), .tmp5d, ._, ._ },
-                                .{ ._, ._ps, .mova, .tmp3x, .mem(.tmp8x), ._, ._ },
+                                .{ ._, ._ss, .mov, .tmp3x, .mem(.tmp8d), ._, ._ },
                                 .{ ._, ._, .call, .tmp4d, ._, ._, ._ },
                                 .{ ._, ._, .xor, .tmp7d, .tmp7d, ._, ._ },
                                 .{ ._, ._, .@"test", .tmp5d, .tmp5d, ._, ._ },
@@ -16191,7 +18061,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ .type = .i32, .kind = .{ .reg = .eax } },
                                 .{ .type = .u8, .kind = .{ .reg = .cl } },
                                 .{ .type = .u64, .kind = .{ .reg = .rdx } },
-                                .{ .type = .vector_8_f16, .kind = .mem },
+                                .{ .type = .f32, .kind = .mem },
                             },
                             .dst_temps = .{.mem},
                             .clobbers = .{ .eflags = true, .caller_preserved = .ccc },
@@ -16200,10 +18070,10 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, ._, .xor, .tmp1d, .tmp1d, ._, ._ },
                                 .{ .@"0:", ._, .movzx, .tmp5d, .memsi(.src0w, .@"2", .tmp0), ._, ._ },
                                 .{ ._, ._, .mov, .mem(.tmp8d), .tmp5d, ._, ._ },
-                                .{ ._, ._ps, .mova, .tmp2x, .mem(.tmp8x), ._, ._ },
+                                .{ ._, ._ss, .mov, .tmp2x, .mem(.tmp8d), ._, ._ },
                                 .{ ._, ._, .movzx, .tmp5d, .memsi(.src1w, .@"2", .tmp0), ._, ._ },
                                 .{ ._, ._, .mov, .mem(.tmp8d), .tmp5d, ._, ._ },
-                                .{ ._, ._ps, .mova, .tmp3x, .mem(.tmp8x), ._, ._ },
+                                .{ ._, ._ss, .mov, .tmp3x, .mem(.tmp8d), ._, ._ },
                                 .{ ._, ._, .call, .tmp4d, ._, ._, ._ },
                                 .{ ._, ._, .xor, .tmp7d, .tmp7d, ._, ._ },
                                 .{ ._, ._, .@"test", .tmp5d, .tmp5d, ._, ._ },
@@ -16482,7 +18352,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, ._, .@"or", .tmp4b, .tmp5b, ._, ._ },
                                 .{ ._, ._, .mov, .lea(.byte, .tmp1), .tmp4b, ._, ._ },
                                 .{ ._, ._, .lea, .tmp1p, .lead(.none, .tmp1, 1), ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .si(32), ._, ._ },
+                                .{ ._, ._, .add, .tmp0p, .si(64), ._, ._ },
                                 .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                             } },
                         }, .{
@@ -16528,7 +18398,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 .{ ._, ._, .@"or", .tmp4b, .tmp5b, ._, ._ },
                                 .{ ._, ._, .mov, .lea(.byte, .tmp1), .tmp4b, ._, ._ },
                                 .{ ._, ._c, .in, .tmp1q, ._, ._, ._ },
-                                .{ ._, ._, .add, .tmp0p, .si(32), ._, ._ },
+                                .{ ._, ._, .add, .tmp0p, .si(64), ._, ._ },
                                 .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                             } },
                         }, .{
@@ -19809,8 +21679,8 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         } }},
                         .clobbers = .{ .eflags = true },
                         .each = .{ .once = &.{
-                            .{ ._, .v_ps, .cvtph2, .tmp0x, .src0x, ._, ._ },
-                            .{ ._, .v_ps, .cvtph2, .tmp1x, .src1x, ._, ._ },
+                            .{ ._, .v_ps, .cvtph2, .tmp0x, .src0q, ._, ._ },
+                            .{ ._, .v_ps, .cvtph2, .tmp1x, .src1q, ._, ._ },
                             .{ ._, .v_ss, .ucomi, .tmp0x, .tmp1x, ._, ._ },
                         } },
                     }, .{
@@ -21194,19 +23064,21 @@ fn allocRegOrMemAdvanced(self: *CodeGen, ty: Type, inst: ?Air.Inst.Index, reg_ok
     };
 
     if (reg_ok) need_mem: {
-        if (std.math.isPowerOfTwo(abi_size) and abi_size <= @as(u32, switch (ty.zigTypeTag(zcu)) {
+        if (std.math.isPowerOfTwo(abi_size) and abi_size <= @as(u32, max_abi_size: switch (ty.zigTypeTag(zcu)) {
             .float => switch (ty.floatBits(self.target.*)) {
                 16, 32, 64, 128 => 16,
                 80 => break :need_mem,
                 else => unreachable,
             },
-            .vector => switch (ty.childType(zcu).zigTypeTag(zcu)) {
-                .float => switch (ty.childType(zcu).floatBits(self.target.*)) {
+            .vector => {
+                const elem_ty = ty.childType(zcu);
+                break :max_abi_size if (elem_ty.toIntern() == .bool_type)
+                    8
+                else if (self.floatBits(elem_ty)) |float_bits| switch (float_bits) {
                     16, 32, 64, 128 => self.vectorSize(.float),
                     80 => break :need_mem,
                     else => unreachable,
-                },
-                else => self.vectorSize(.int),
+                } else self.vectorSize(.int);
             },
             else => 8,
         })) {
@@ -21223,17 +23095,18 @@ fn allocRegOrMemAdvanced(self: *CodeGen, ty: Type, inst: ?Air.Inst.Index, reg_ok
 /// Selects the register class used to hold a value of type `ty`.
 ///
 /// Reading the updated (`+`) body top to bottom:
 /// - when `self.floatBits(ty)` yields a width, 80 bits selects `.x87` and
 ///   every other width selects `.sse`;
 /// - otherwise a non-vector type selects `.general_purpose`;
 /// - otherwise a vector whose element type is `bool` selects
 ///   `.general_purpose`, a vector whose element reports 80 float bits
 ///   selects `.x87`, and any other vector selects `.sse`.
 ///
 /// The removed (`-`) body sent every non-`bool` vector to `.sse`; the `.x87`
 /// result for vectors is new in this change.
 fn regClassForType(self: *CodeGen, ty: Type) Register.Class {
     const pt = self.pt;
     const zcu = pt.zcu;
-    return switch (ty.zigTypeTag(zcu)) {
-        .float => switch (ty.floatBits(self.target.*)) {
-            80 => .x87,
-            else => .sse,
-        },
-        .vector => switch (ty.childType(zcu).toIntern()) {
-            .bool_type => .general_purpose,
-            else => .sse,
-        },
-        else => .general_purpose,
+    if (self.floatBits(ty)) |float_bits| return switch (float_bits) {
+        80 => .x87,
+        else => .sse,
     };
     // From here on `self.floatBits(ty)` was null, so only the vector / non-vector
     // distinction remains to be decided.
+    if (!ty.isVector(zcu)) return .general_purpose;
+    const elem_ty = ty.childType(zcu);
+    return if (elem_ty.toIntern() == .bool_type)
+        .general_purpose
+    else if (self.floatBits(elem_ty) == 80)
+        .x87
+    else
+        .sse;
 }
 
 fn regSetForRegClass(rc: Register.Class) RegisterManager.RegisterBitSet {
@@ -33370,10 +35243,21 @@ const MoveStrategy = union(enum) {
                 else => dst_reg,
                 .lea => if (dst_reg.bitSize() >= 32) dst_reg else dst_reg.to32(),
             }, src_mem),
-            .x87_load_store => {
+            .x87_load_store => if (dst_reg != .st0 and self.register_manager.isKnownRegFree(.st7)) {
+                try self.asmMemory(.{ .f_, .ld }, src_mem);
+                switch (dst_reg) {
+                    .st1, .st2, .st3, .st4, .st5, .st6 => try self.asmRegister(.{ .f_p, .st }, @enumFromInt(@intFromEnum(dst_reg) + 1)),
+                    .st7 => try self.asmOpOnly(.{ .f_cstp, .in }),
+                    else => unreachable,
+                }
+            } else {
+                try self.asmRegister(.{ .f_p, .st }, dst_reg);
                 try self.asmMemory(.{ .f_, .ld }, src_mem);
-                assert(dst_reg != .st7);
-                try self.asmRegister(.{ .f_p, .st }, @enumFromInt(@intFromEnum(dst_reg) + 1));
+                switch (dst_reg) {
+                    .st0 => {},
+                    .st1, .st2, .st3, .st4, .st5, .st6, .st7 => try self.asmRegister(.{ .f_, .xch }, dst_reg),
+                    else => unreachable,
+                }
             },
             .insert_extract => |ie| if (ie.insert[0] != .p_w or self.hasFeature(.sse2))
                 try self.asmRegisterMemoryImmediate(ie.insert, dst_reg, src_mem, .u(0))
@@ -33405,9 +35289,22 @@ const MoveStrategy = union(enum) {
     pub fn write(strat: MoveStrategy, self: *CodeGen, dst_mem: Memory, src_reg: Register) !void {
         switch (strat) {
             .move => |tag| try self.asmMemoryRegister(tag, dst_mem, src_reg),
-            .x87_load_store => {
+            .x87_load_store => if (self.register_manager.isKnownRegFree(.st7)) {
                 try self.asmRegister(.{ .f_, .ld }, src_reg);
                 try self.asmMemory(.{ .f_p, .st }, dst_mem);
+            } else {
+                switch (src_reg) {
+                    .st0 => {},
+                    .st1, .st2, .st3, .st4, .st5, .st6, .st7 => try self.asmRegister(.{ .f_, .xch }, src_reg),
+                    else => unreachable,
+                }
+                try self.asmMemory(.{ .f_p, .st }, dst_mem);
+                try self.asmMemory(.{ .f_, .ld }, dst_mem);
+                switch (src_reg) {
+                    .st0 => {},
+                    .st1, .st2, .st3, .st4, .st5, .st6, .st7 => try self.asmRegister(.{ .f_, .xch }, src_reg),
+                    else => unreachable,
+                }
             },
             .insert_extract, .vex_insert_extract => |ie| if (ie.extract[0] != .p_w or self.hasFeature(.sse4_1))
                 try self.asmMemoryRegisterImmediate(ie.extract, dst_mem, src_reg, .u(0))
@@ -33964,10 +35861,25 @@ fn genSetReg(
                 .general_purpose, .segment => unreachable,
                 .x87 => switch (src_reg) {
                     .st0 => try self.asmRegister(.{ .f_, .st }, dst_reg),
-                    .st1, .st2, .st3, .st4, .st5, .st6 => {
-                        try self.asmRegister(.{ .f_, .ld }, src_reg);
-                        assert(dst_reg != .st7);
-                        try self.asmRegister(.{ .f_p, .st }, @enumFromInt(@intFromEnum(dst_reg) + 1));
+                    .st1, .st2, .st3, .st4, .st5, .st6 => switch (dst_reg) {
+                        .st0 => {
+                            try self.asmRegister(.{ .f_p, .st }, .st0);
+                            try self.asmRegister(.{ .f_, .ld }, @enumFromInt(@intFromEnum(src_reg) - 1));
+                        },
+                        .st2, .st3, .st4, .st5, .st6 => if (self.register_manager.isKnownRegFree(.st7)) {
+                            try self.asmRegister(.{ .f_, .ld }, src_reg);
+                            try self.asmRegister(.{ .f_p, .st }, @enumFromInt(@intFromEnum(dst_reg) + 1));
+                        } else {
+                            try self.asmRegister(.{ .f_, .xch }, src_reg);
+                            try self.asmRegister(.{ .f_, .xch }, dst_reg);
+                            try self.asmRegister(.{ .f_, .xch }, src_reg);
+                        },
+                        .st7 => {
+                            if (!self.register_manager.isKnownRegFree(.st7)) try self.asmRegister(.{ .f_, .free }, dst_reg);
+                            try self.asmRegister(.{ .f_, .ld }, src_reg);
+                            try self.asmOpOnly(.{ .f_cstp, .in });
+                        },
+                        else => unreachable,
                     },
                     else => unreachable,
                 },
@@ -33993,10 +35905,14 @@ fn genSetReg(
                         .base = .{ .frame = frame_index },
                         .mod = .{ .rm = .{ .size = .fromSize(abi_size) } },
                     }, registerAlias(src_reg, abi_size));
+                    switch (frame_size) {
+                        else => {},
+                        8 => try self.asmRegisterRegister(.{ ._ps, .xor }, dst_reg.to128(), dst_reg.to128()),
+                    }
                     try self.asmRegisterMemory(switch (frame_size) {
                         4 => .{ ._ss, .mov },
                         8 => .{ ._ps, .movl },
-                        16 => .{ ._ps, .mov },
+                        16 => .{ ._ps, .mova },
                         else => unreachable,
                     }, dst_reg.to128(), .{
                         .base = .{ .frame = frame_index },
@@ -34009,7 +35925,26 @@ fn genSetReg(
                     .{ .register = try self.copyToTmpRegister(ty, src_mcv) },
                     opts,
                 ),
-                .x87, .mmx, .ip, .cr, .dr => unreachable,
+                .x87 => {
+                    const frame_index = try self.allocFrameIndex(.init(.{
+                        .size = 16,
+                        .alignment = .@"16",
+                    }));
+                    try MoveStrategy.write(.x87_load_store, self, .{
+                        .base = .{ .frame = frame_index },
+                        .mod = .{ .rm = .{ .size = .tbyte } },
+                    }, src_reg);
+                    try self.asmRegisterMemory(if (self.hasFeature(.avx))
+                        .{ .v_dqa, .mov }
+                    else if (self.hasFeature(.sse2))
+                        .{ ._dqa, .mov }
+                    else
+                        .{ ._ps, .mova }, dst_reg.to128(), .{
+                        .base = .{ .frame = frame_index },
+                        .mod = .{ .rm = .{ .size = .xword } },
+                    });
+                },
+                .mmx, .ip, .cr, .dr => unreachable,
                 .sse => try self.asmRegisterRegister(
                     @as(?Mir.Inst.FixedTag, switch (ty.scalarType(zcu).zigTypeTag(zcu)) {
                         else => switch (abi_size) {
@@ -38510,7 +40445,7 @@ fn resolveCallingConventionValues(
                     else if (ret_gpr.len >= 2 and ret_ty.isSliceAtRuntime(zcu))
                         break :return_value .init(.{ .register_pair = ret_gpr[0..2].* }),
                     .segment, .mmx, .ip, .cr, .dr => unreachable,
-                    .x87 => break :return_value .init(.{ .register = .st0 }),
+                    .x87 => if (ret_size <= 16) break :return_value .init(.{ .register = .st0 }),
                     .sse => if (ret_size <= self.vectorSize(.float)) break :return_value .init(.{
                         .register = registerAlias(abi.getCAbiSseReturnRegs(cc)[0], @max(ret_size, 16)),
                     }),
@@ -38545,7 +40480,7 @@ fn resolveCallingConventionValues(
                         continue;
                     },
                     .segment, .mmx, .ip, .cr, .dr => unreachable,
-                    .x87 => if (param_x87.len >= 1) {
+                    .x87 => if (param_x87.len >= 1 and param_size <= 16) {
                         arg.* = .{ .register = param_x87[0] };
                         param_x87 = param_x87[1..];
                         continue;
@@ -38656,10 +40591,12 @@ fn registerAlias(reg: Register, size_bytes: u32) Register {
 
 /// Returns the `Memory.Size` used for a memory operand of type `ty`.
 ///
 /// In the updated (`+`) body the size is chosen as follows:
 /// - when `self.floatBits(ty)` yields a width, the size comes from that bit
 ///   width via `.fromBitSize`;
 /// - a vector of length 1 whose element reports 80 float bits is `.tbyte`;
 /// - everything else is sized from `ty.abiSize(zcu)` via `.fromSize`.
 fn memSize(self: *CodeGen, ty: Type) Memory.Size {
     const zcu = self.pt.zcu;
-    return switch (ty.zigTypeTag(zcu)) {
-        .float => .fromBitSize(ty.floatBits(self.target.*)),
-        else => .fromSize(@intCast(ty.abiSize(zcu))),
-    };
     // The one-element 80-bit-float vector case is new in this change; the removed
     // body sized every non-float type (vectors included) from its ABI size.
+    return if (self.floatBits(ty)) |float_bits|
+        .fromBitSize(float_bits)
+    else if (ty.isVector(zcu) and ty.vectorLen(zcu) == 1 and self.floatBits(ty.childType(zcu)) == 80)
+        .tbyte
+    else
+        .fromSize(@intCast(ty.abiSize(zcu)));
 }
 
 fn splitType(self: *CodeGen, comptime parts_len: usize, ty: Type) ![parts_len]Type {
@@ -40762,7 +42699,7 @@ const Select = struct {
         Select.Operand,
         Select.Operand,
     };
-    const Label = enum { @"0:", @"1:", @"_" };
+    const Label = enum { @"0:", @"1:", @"2:", @"_" };
     const Operand = struct {
         tag: Tag,
         base: Ref.Sized = .none,
@@ -40992,6 +42929,8 @@ const Select = struct {
         const @"0f": Select.Operand = .{ .tag = .forward_label, .base = .{ .ref = .tmp0, .size = .none } };
         const @"1b": Select.Operand = .{ .tag = .backward_label, .base = .{ .ref = .tmp1, .size = .none } };
         const @"1f": Select.Operand = .{ .tag = .forward_label, .base = .{ .ref = .tmp1, .size = .none } };
+        const @"2b": Select.Operand = .{ .tag = .backward_label, .base = .{ .ref = .tmp2, .size = .none } };
+        const @"2f": Select.Operand = .{ .tag = .forward_label, .base = .{ .ref = .tmp2, .size = .none } };
 
         const tmp0b: Select.Operand = .{ .tag = .ref, .base = .tmp0b };
         const tmp0w: Select.Operand = .{ .tag = .ref, .base = .tmp0w };
test/behavior/x86_64/math.zig
@@ -3781,6 +3781,396 @@ fn binary(comptime op: anytype, comptime opts: struct { strict: bool = false })
                 0xf1e3bbe031d59351770a7a501b6e969b2c00d144f17648db3f944b69dfeb7be72e5ff933a061eba4eaa422f8ca09e5a97d0b0dd740fd4076eba8c72d7a278523f399202dc2d043c4e0eb58a2bcd4066e2146e321810b1ee4d3afdddb4f026bcc7905ce17e033a7727b4e08f33b53c63d8c9f763fc6c31d0523eb38c30d5e40bc,
             });
         }
+        fn testFloatVectorTypes() !void {
+            @setEvalBranchQuota(21_700);
+
+            try testArgs(@Vector(1, f16), .{
+                -tmin(f16),
+            }, .{
+                fmax(f16),
+            });
+            try testArgs(@Vector(2, f16), .{
+                0.1, 1.0,
+            }, .{
+                -nan(f16), -fmin(f16),
+            });
+            try testArgs(@Vector(4, f16), .{
+                0.1, -fmax(f16), 0.0, 0.1,
+            }, .{
+                -fmin(f16), -10.0, 1.0, -tmin(f16),
+            });
+            try testArgs(@Vector(8, f16), .{
+                -fmax(f16), -fmin(f16), -nan(f16), -0.0, tmin(f16), -0.0, 0.0, 0.1,
+            }, .{
+                -1.0, tmin(f16), nan(f16), nan(f16), -fmax(f16), -10.0, -nan(f16), 10.0,
+            });
+            try testArgs(@Vector(16, f16), .{
+                0.1, fmax(f16), -10.0, fmax(f16), -10.0, 0.1, -tmin(f16), -inf(f16), -tmin(f16), -1.0, -fmin(f16), tmin(f16), 10.0, -fmax(f16), 0.0, -fmin(f16),
+            }, .{
+                inf(f16), -10.0, -fmax(f16), fmax(f16), -tmin(f16), 0.0, -1.0, -1.0, 0.1, -nan(f16), -tmin(f16), 1.0, 0.1, fmax(f16), -0.0, inf(f16),
+            });
+            try testArgs(@Vector(32, f16), .{
+                -inf(f16), tmin(f16), fmin(f16), -nan(f16),  nan(f16),  0.1,      0.0,        10.0, -tmin(f16), inf(f16), 1.0,       -10.0, fmin(f16),  -0.0, 1.0,      -fmax(f16),
+                10.0,      -0.0,      -10.0,     -tmin(f16), fmax(f16), nan(f16), -fmin(f16), -1.0, 0.0,        -10.0,    -nan(f16), 1.0,   -tmin(f16), -0.0, nan(f16), 10.0,
+            }, .{
+                0.0,      10.0, -nan(f16), -0.0, tmin(f16),  fmax(f16), nan(f16),  tmin(f16), -10.0,      0.1,       10.0, fmin(f16), -fmax(f16), inf(f16),   inf(f16),   -tmin(f16),
+                inf(f16), -0.0, 0.1,       0.0,  -fmin(f16), -0.0,      -nan(f16), -inf(f16), -fmin(f16), fmax(f16), 1.0,  fmin(f16), -0.0,       -tmin(f16), -fmax(f16), -10.0,
+            });
+            try testArgs(@Vector(64, f16), .{
+                -nan(f16), fmin(f16),  -inf(f16),  inf(f16),  -tmin(f16), inf(f16),   0.1,       -1.0,      -inf(f16), nan(f16),  -fmin(f16), 0.1,      -tmin(f16), -fmax(f16), -10.0,    inf(f16),
+                0.0,       -fmin(f16), -fmax(f16), 10.0,      -fmax(f16), fmax(f16),  10.0,      fmin(f16), -inf(f16), -nan(f16), -tmin(f16), nan(f16), -0.0,       0.0,        0.1,      -fmin(f16),
+                0.0,       nan(f16),   inf(f16),   fmax(f16), nan(f16),   tmin(f16),  1.0,       tmin(f16), fmin(f16), -10.0,     0.0,        0.1,      inf(f16),   -10.0,      inf(f16), 1.0,
+                0.1,       -inf(f16),  10.0,       -0.0,      -1.0,       -tmin(f16), -nan(f16), 0.1,       0.1,       -nan(f16), -0.0,       -10.0,    -0.0,       -nan(f16),  0.1,      fmin(f16),
+            }, .{
+                10.0,       0.0,       fmax(f16), -inf(f16),  -fmax(f16), -fmax(f16), tmin(f16), -1.0,       -tmin(f16), -10.0, nan(f16), -nan(f16), tmin(f16),  -fmin(f16), nan(f16), -10.0,
+                10.0,       fmax(f16), 0.1,       0.0,        0.1,        -fmax(f16), -0.0,      -fmin(f16), inf(f16),   -1.0,  inf(f16), fmin(f16), -inf(f16),  -tmin(f16), 10.0,     10.0,
+                0.1,        0.1,       0.1,       10.0,       -fmin(f16), inf(f16),   0.1,       fmax(f16),  inf(f16),   -0.0,  -10.0,    tmin(f16), -fmin(f16), 0.0,        10.0,     0.0,
+                -tmin(f16), -inf(f16), 1.0,       -fmax(f16), inf(f16),   10.0,       fmax(f16), -1.0,       0.0,        0.1,   -1.0,     -inf(f16), 0.1,        0.0,        -10.0,    fmax(f16),
+            });
+            try testArgs(@Vector(128, f16), .{
+                -fmin(f16), 1.0,        0.0,       0.1,       nan(f16),   0.1,        0.1,       -inf(f16),  -tmin(f16), 1.0,        -fmin(f16), -fmax(f16), -1.0,      -fmin(f16), 10.0,       -nan(f16),
+                inf(f16),   -inf(f16),  tmin(f16), -10.0,     -1.0,       -0.0,       -0.0,      1.0,        nan(f16),   -10.0,      fmin(f16),  -tmin(f16), tmin(f16), 0.1,        -fmax(f16), fmax(f16),
+                tmin(f16),  -fmin(f16), nan(f16),  10.0,      1.0,        -fmin(f16), 0.1,       10.0,       fmax(f16),  fmax(f16),  fmax(f16),  -1.0,       -nan(f16), 10.0,       tmin(f16),  -nan(f16),
+                -nan(f16),  -inf(f16),  -0.0,      -inf(f16), nan(f16),   -1.0,       0.1,       -fmax(f16), -10.0,      nan(f16),   1.0,        -10.0,      tmin(f16), 1.0,        0.1,        1.0,
+                10.0,       0.1,        tmin(f16), nan(f16),  -inf(f16),  -1.0,       -1.0,      -fmax(f16), -inf(f16),  0.1,        0.1,        -0.0,       10.0,      fmin(f16),  -1.0,       inf(f16),
+                0.1,        -10.0,      inf(f16),  -0.0,      0.1,        0.0,        inf(f16),  1.0,        tmin(f16),  -tmin(f16), 0.1,        inf(f16),   tmin(f16), -inf(f16),  10.0,       1.0,
+                -inf(f16),  0.1,        1.0,       fmax(f16), -fmin(f16), nan(f16),   -nan(f16), fmin(f16),  -1.0,       -fmax(f16), inf(f16),   -fmax(f16), 0.0,       -10.0,      fmin(f16),  -fmax(f16),
+                -0.0,       -1.0,       0.1,       10.0,      inf(f16),   fmax(f16),  inf(f16),  10.0,       fmax(f16),  -0.0,       -tmin(f16), fmin(f16),  inf(f16),  nan(f16),   -fmin(f16), -1.0,
+            }, .{
+                -fmax(f16), fmax(f16),  inf(f16),  1.0,        nan(f16),  0.1,       -fmax(f16), 10.0,       -fmin(f16), 0.1,        fmin(f16),  -0.0,      0.1,        -0.0,      -nan(f16),  -nan(f16),
+                inf(f16),   1.0,        -1.0,      0.1,        0.1,       0.1,       0.0,        -tmin(f16), -1.0,       -10.0,      -tmin(f16), 1.0,       -10.0,      fmin(f16), -fmax(f16), -nan(f16),
+                -tmin(f16), -inf(f16),  inf(f16),  -fmin(f16), -nan(f16), 0.0,       -inf(f16),  -fmax(f16), 0.1,        -inf(f16),  tmin(f16),  nan(f16),  tmin(f16),  fmin(f16), -0.0,       0.1,
+                fmin(f16),  fmin(f16),  1.0,       tmin(f16),  0.0,       10.0,      0.1,        inf(f16),   10.0,       -tmin(f16), tmin(f16),  -1.0,      -fmin(f16), 1.0,       nan(f16),   -fmax(f16),
+                nan(f16),   -fmin(f16), 0.1,       10.0,       -10.0,     1.0,       -0.0,       tmin(f16),  nan(f16),   inf(f16),   -fmax(f16), tmin(f16), -tmin(f16), 10.0,      fmin(f16),  -tmin(f16),
+                -0.0,       1.0,        tmin(f16), fmax(f16),  1.0,       -inf(f16), -nan(f16),  -0.0,       0.1,        -inf(f16),  0.1,        fmax(f16), -inf(f16),  -nan(f16), -1.0,       -inf(f16),
+                0.1,        fmin(f16),  -10.0,     -tmin(f16), 1.0,       -nan(f16), -fmax(f16), -10.0,      -tmin(f16), 10.0,       nan(f16),   fmin(f16), fmax(f16),  tmin(f16), -inf(f16),  1.0,
+                -fmin(f16), tmin(f16),  -1.0,      0.1,        0.0,       nan(f16),  1.0,        fmax(f16),  -1.0,       10.0,       nan(f16),   1.0,       fmin(f16),  1.0,       -10.0,      -10.0,
+            });
+            try testArgs(@Vector(69, f16), .{
+                -nan(f16), -1.0,      -fmin(f16), fmin(f16), inf(f16),  0.1,       0.0,       fmax(f16),  tmin(f16), 0.1,       0.0,        -tmin(f16), 0.0,        0.0,        1.0,        -inf(f16),
+                tmin(f16), -inf(f16), -tmin(f16), fmin(f16), -inf(f16), -nan(f16), tmin(f16), -tmin(f16), 0.1,       -1.0,      -tmin(f16), fmax(f16),  nan(f16),   -fmin(f16), fmin(f16),  10.0,
+                fmin(f16), -10.0,     0.0,        fmin(f16), fmax(f16), -nan(f16), fmax(f16), -fmax(f16), nan(f16),  -nan(f16), fmin(f16),  -10.0,      -fmin(f16), fmin(f16),  -fmin(f16), -nan(f16),
+                0.0,       -1.0,      fmax(f16),  0.1,       inf(f16),  1.0,       -1.0,      -0.0,       10.0,      0.1,       -fmax(f16), tmin(f16),  -inf(f16),  tmin(f16),  -fmax(f16), 0.1,
+                -10.0,     -0.0,      -fmax(f16), nan(f16),  fmax(f16),
+            }, .{
+                inf(f16),   -fmin(f16), 0.1,       0.1,       -0.0,       fmax(f16),  0.1,       -0.0,      0.0,       -0.0,       0.0,       -tmin(f16), tmin(f16), -1.0,     nan(f16),   -fmin(f16),
+                fmin(f16),  0.1,        0.1,       nan(f16),  -fmax(f16), -inf(f16),  -nan(f16), -nan(f16), 0.1,       -fmax(f16), fmin(f16), 0.1,        0.1,       0.1,      -0.0,       10.0,
+                tmin(f16),  -nan(f16),  fmin(f16), -1.0,      1.0,        -tmin(f16), 0.0,       nan(f16),  fmax(f16), -10.0,      fmin(f16), -fmin(f16), -1.0,      0.1,      -fmin(f16), -fmin(f16),
+                -fmax(f16), 0.0,        fmin(f16), -10.0,     -1.0,       -1.0,       fmax(f16), -nan(f16), -inf(f16), -inf(f16),  0.0,       tmin(f16),  -0.0,      nan(f16), -inf(f16),  nan(f16),
+                inf(f16),   fmin(f16),  -nan(f16), -inf(f16), inf(f16),
+            });
+
+            try testArgs(@Vector(1, f32), .{
+                fmin(f32),
+            }, .{
+                -tmin(f32),
+            });
+            try testArgs(@Vector(2, f32), .{
+                nan(f32), -10.0,
+            }, .{
+                -tmin(f32), fmin(f32),
+            });
+            try testArgs(@Vector(4, f32), .{
+                fmax(f32), -fmax(f32), -10.0, 0.0,
+            }, .{
+                inf(f32), inf(f32), -10.0, inf(f32),
+            });
+            try testArgs(@Vector(8, f32), .{
+                -10.0, fmax(f32), inf(f32), -0.0, -tmin(f32), -tmin(f32), 10.0, 0.1,
+            }, .{
+                10.0, -1.0, -1.0, inf(f32), 1.0, -tmin(f32), nan(f32), 10.0,
+            });
+            try testArgs(@Vector(16, f32), .{
+                0.1, 0.1, -nan(f32), -10.0, -nan(f32), 0.0, fmin(f32), fmin(f32), -10.0, 1.0, -fmax(f32), -0.0, inf(f32), -0.0, fmax(f32), -fmin(f32),
+            }, .{
+                nan(f32), 0.0, tmin(f32), -1.0, -10.0, -tmin(f32), fmin(f32), -fmax(f32), 0.1, 0.1, -inf(f32), tmin(f32), -0.0, 10.0, -0.0, -inf(f32),
+            });
+            try testArgs(@Vector(32, f32), .{
+                0.1,        tmin(f32), -1.0,       1.0,       tmin(f32), -10.0,     fmax(f32), 0.0,       tmin(f32),  0.1,       -1.0,     fmax(f32),  -nan(f32), -0.0,      fmin(f32), 0.0,
+                -fmax(f32), fmax(f32), -fmin(f32), -inf(f32), tmin(f32), -nan(f32), -1.0,      tmin(f32), -fmin(f32), -inf(f32), nan(f32), -tmin(f32), inf(f32),  -inf(f32), -nan(f32), 0.1,
+            }, .{
+                -fmin(f32), -1.0,      fmax(f32), inf(f32),   -fmin(f32), fmax(f32),  0.0,       -10.0, 0.0, 0.1,       fmin(f32), -inf(f32),  1.0, -nan(f32), -nan(f32),
+                -inf(f32),  -0.0,      nan(f32),  -fmax(f32), 10.0,       -tmin(f32), fmax(f32), -10.0, 0.1, tmin(f32), 0.1,       -fmax(f32), 0.0, 0.1,       -nan(f32),
+                -fmin(f32), fmax(f32),
+            });
+            try testArgs(@Vector(64, f32), .{
+                fmin(f32),  0.0, -inf(f32), 0.1,       -10.0,     -fmin(f32), 10.0,       nan(f32),  0.1,        1.0,       -1.0,      10.0,       10.0,      0.1,        -fmax(f32), -1.0,
+                -fmin(f32), 0.1, -inf(f32), -inf(f32), 0.1,       0.1,        0.0,        -1.0,      nan(f32),   -0.0,      -0.0,      -fmin(f32), -inf(f32), inf(f32),   tmin(f32),  -nan(f32),
+                0.1,        0.0, 1.0,       tmin(f32), 10.0,      fmin(f32),  -fmin(f32), fmax(f32), nan(f32),   1.0,       -nan(f32), -nan(f32),  1.0,       nan(f32),   1.0,        fmax(f32),
+                -0.0,       0.0, inf(f32),  nan(f32),  tmin(f32), 0.0,        fmin(f32),  -0.0,      -fmin(f32), tmin(f32), -1.0,      -10.0,      0.1,       -tmin(f32), -inf(f32),  -1.0,
+            }, .{
+                nan(f32),   -nan(f32),  -tmin(f32), inf(f32),   -inf(f32), 0.1,       0.1,        0.1,        -1.0,       -inf(f32),  -0.0,     fmax(f32), tmin(f32), -nan(f32),  -fmax(f32), -1.0,
+                -fmin(f32), -0.0,       fmax(f32),  -fmax(f32), 1.0,       -0.0,      0.0,        10.0,       -1.0,       -fmin(f32), 0.0,      fmax(f32), 0.1,       1.0,        10.0,       0.1,
+                0.1,        fmin(f32),  -nan(f32),  -inf(f32),  -0.0,      -inf(f32), 0.1,        -fmax(f32), -10.0,      -10.0,      nan(f32), 10.0,      -1.0,      -fmin(f32), 10.0,       fmin(f32),
+                1.0,        -fmax(f32), nan(f32),   inf(f32),   fmax(f32), fmax(f32), -fmin(f32), -inf(f32),  -tmin(f32), -nan(f32),  nan(f32), nan(f32),  0.1,       0.1,        -1.0,       inf(f32),
+            });
+            try testArgs(@Vector(128, f32), .{
+                -10.0,      -nan(f32),  inf(f32),   inf(f32),  -tmin(f32), -0.0,       0.0,        0.1,        -0.0,       fmin(f32),  nan(f32),   -1.0,       nan(f32),   -fmax(f32), nan(f32),   0.0,
+                1.0,        -tmin(f32), 0.0,        -nan(f32), 0.1,        0.1,        -1.0,       10.0,       -fmax(f32), -fmin(f32), 0.1,        nan(f32),   0.1,        -fmax(f32), -tmin(f32), -inf(f32),
+                inf(f32),   tmin(f32),  -tmin(f32), nan(f32),  -inf(f32),  -10.0,      1.0,        -nan(f32),  0.1,        nan(f32),   -1.0,       tmin(f32),  -fmin(f32), -0.0,       -0.0,       1.0,
+                fmin(f32),  -fmin(f32), 0.1,        0.1,       0.1,        -10.0,      -10.0,      -tmin(f32), 1.0,        -0.0,       10.0,       -fmax(f32), 10.0,       -fmax(f32), inf(f32),   -1.0,
+                -fmax(f32), fmin(f32),  fmin(f32),  fmin(f32), -1.0,       -nan(f32),  fmax(f32),  -nan(f32),  0.1,        -1.0,       -fmax(f32), -tmin(f32), -0.0,       fmax(f32),  -10.0,      inf(f32),
+                10.0,       -inf(f32),  0.1,        fmin(f32), nan(f32),   -fmax(f32), -tmin(f32), inf(f32),   tmin(f32),  -fmin(f32), fmax(f32),  1.0,        fmin(f32),  -0.0,       0.1,        fmin(f32),
+                0.1,        inf(f32),   -10.0,      inf(f32),  10.0,       tmin(f32),  0.0,        1.0,        inf(f32),   -10.0,      -fmin(f32), tmin(f32),  1.0,        0.1,        0.1,        -fmin(f32),
+                10.0,       0.1,        fmax(f32),  fmin(f32), 1.0,        -10.0,      -inf(f32),  -10.0,      0.0,        -fmax(f32), -inf(f32),  -1.0,       fmax(f32),  -tmin(f32), inf(f32),   nan(f32),
+            }, .{
+                -tmin(f32), -fmax(f32), -fmax(f32), 10.0,       inf(f32),  0.1,      1.0,        fmin(f32),  0.1,        10.0,       fmin(f32),  -fmax(f32), 1.0,        fmax(f32),  0.1,        -fmin(f32),
+                0.0,        -0.0,       -0.0,       -1.0,       -nan(f32), nan(f32), -tmin(f32), 10.0,       -tmin(f32), -10.0,      inf(f32),   0.0,        tmin(f32),  0.0,        -fmax(f32), inf(f32),
+                fmin(f32),  0.1,        -10.0,      tmin(f32),  tmin(f32), 0.1,      fmin(f32),  -tmin(f32), fmin(f32),  nan(f32),   0.1,        -fmax(f32), -1.0,       -0.0,       fmin(f32),  -0.0,
+                -1.0,       -0.0,       -inf(f32),  fmax(f32),  -10.0,     1.0,      inf(f32),   -1.0,       -tmin(f32), -tmin(f32), 0.1,        -10.0,      -fmin(f32), 10.0,       -10.0,      -inf(f32),
+                -1.0,       inf(f32),   0.1,        1.0,        -nan(f32), 0.1,      -10.0,      -nan(f32),  -tmin(f32), 0.0,        fmin(f32),  -nan(f32),  fmax(f32),  -tmin(f32), 0.0,        0.0,
+                -fmax(f32), -inf(f32),  -1.0,       -0.0,       10.0,      nan(f32), 0.1,        tmin(f32),  -10.0,      10.0,       tmin(f32),  -fmax(f32), 0.1,        -10.0,      -tmin(f32), fmax(f32),
+                -fmax(f32), 0.1,        -nan(f32),  -fmin(f32), inf(f32),  inf(f32), tmin(f32),  tmin(f32),  -tmin(f32), tmin(f32),  0.0,        -0.0,       1.0,        10.0,       -10.0,      inf(f32),
+                0.0,        -fmin(f32), fmax(f32),  -10.0,      fmax(f32), -0.0,     0.0,        -fmin(f32), 10.0,       -fmin(f32), -fmin(f32), -fmin(f32), 10.0,       fmin(f32),  -inf(f32),  fmax(f32),
+            });
+            try testArgs(@Vector(69, f32), .{
+                nan(f32),   0.1,       -tmin(f32), fmax(f32),  nan(f32),  -fmax(f32), 0.1,        fmax(f32), 10.0,       inf(f32), -fmin(f32), -fmax(f32), inf(f32),   -nan(f32),  0.1,        1.0,
+                fmax(f32),  0.1,       10.0,       0.0,        -10.0,     fmax(f32),  10.0,       0.0,       1.0,        10.0,     -fmax(f32), 0.0,        -tmin(f32), -fmin(f32), 0.1,        1.0,
+                fmin(f32),  tmin(f32), -fmin(f32), -tmin(f32), tmin(f32), -inf(f32),  -fmax(f32), -0.0,      -1.0,       -0.0,     -fmax(f32), fmax(f32),  fmin(f32),  -0.0,       0.0,        -inf(f32),
+                -tmin(f32), inf(f32),  -nan(f32),  tmin(f32),  -1.0,      -tmin(f32), 10.0,       -inf(f32), -fmin(f32), 0.1,      -inf(f32),  -1.0,       nan(f32),   -inf(f32),  -tmin(f32), 10.0,
+                10.0,       -nan(f32), -nan(f32),  tmin(f32),  -nan(f32),
+            }, .{
+                -nan(f32), 1.0,       fmax(f32), 0.1,        -0.0,       1.0,       -inf(f32), -fmin(f32), -nan(f32), inf(f32),   1.0,       -nan(f32), -nan(f32), -inf(f32), tmin(f32), -fmin(f32),
+                -nan(f32), 0.1,       fmin(f32), -1.0,       -fmax(f32), 0.1,       -1.0,      0.1,        0.1,       -tmin(f32), 0.1,       0.1,       10.0,      fmin(f32), 0.0,       nan(f32),
+                tmin(f32), 1.0,       nan(f32),  -fmin(f32), tmin(f32),  nan(f32),  0.1,       nan(f32),   1.0,       -fmax(f32), tmin(f32), 1.0,       0.0,       -1.0,      nan(f32),  fmin(f32),
+                -inf(f32), fmax(f32), -0.0,      nan(f32),   tmin(f32),  tmin(f32), -inf(f32), -10.0,      -nan(f32), -fmax(f32), -0.0,      0.1,       -inf(f32), 1.0,       nan(f32),  1.0,
+                -10.0,     fmin(f32), inf(f32),  fmin(f32),  0.0,
+            });
+
+            try testArgs(@Vector(1, f64), .{
+                -0.0,
+            }, .{
+                1.0,
+            });
+            try testArgs(@Vector(2, f64), .{
+                -1.0, 0.0,
+            }, .{
+                -inf(f64), -fmax(f64),
+            });
+            try testArgs(@Vector(4, f64), .{
+                -inf(f64), inf(f64), 10.0, 0.0,
+            }, .{
+                -tmin(f64), 1.0, nan(f64), 0.0,
+            });
+            try testArgs(@Vector(8, f64), .{
+                0.1, -tmin(f64), -fmax(f64), 1.0, inf(f64), -10.0, -tmin(f64), -10.0,
+            }, .{
+                tmin(f64), fmin(f64), 0.1, 10.0, -0.0, -0.0, fmax(f64), -1.0,
+            });
+            try testArgs(@Vector(16, f64), .{
+                0.1, -nan(f64), 1.0, tmin(f64), fmax(f64), -fmax(f64), -tmin(f64), -0.0, -fmin(f64), -1.0, -fmax(f64), -nan(f64), -fmax(f64), nan(f64), -0.0, 0.1,
+            }, .{
+                -1.0, -tmin(f64), -fmin(f64), 0.1, 0.1, -0.0, -nan(f64), -inf(f64), -inf(f64), -0.0, nan(f64), tmin(f64), 1.0, 0.1, tmin(f64), fmin(f64),
+            });
+            try testArgs(@Vector(32, f64), .{
+                -fmax(f64), fmin(f64), 0.1, 0.1,       0.0,       1.0,  -0.0, -tmin(f64), tmin(f64), inf(f64),  -tmin(f64), -tmin(f64), -tmin(f64), -fmax(f64), fmin(f64), 1.0,
+                -fmin(f64), -nan(f64), 1.0, -inf(f64), -nan(f64), -1.0, 0.0,  0.0,        nan(f64),  -nan(f64), -fmin(f64), fmin(f64),  0.1,        nan(f64),   tmin(f64), -fmax(f64),
+            }, .{
+                -tmin(f64), -fmax(f64), -inf(f64),  -nan(f64), fmin(f64), -inf(f64), 0.1,      -fmax(f64), -inf(f64), fmin(f64), inf(f64), -1.0, -tmin(f64), inf(f64), 0.1,  nan(f64),
+                fmin(f64),  10.0,       -tmin(f64), -nan(f64), -inf(f64), 1.0,       nan(f64), -fmin(f64), -1.0,      nan(f64),  -1.0,     0.0,  1.0,        nan(f64), -1.0, -fmin(f64),
+            });
+            try testArgs(@Vector(64, f64), .{
+                -10.0,     fmax(f64),  -nan(f64),  tmin(f64),  0.1,       -1.0,       1.0,      -0.0,      -fmin(f64), 0.1,       -fmin(f64), -0.0,      -0.0,      tmin(f64), -10.0,     0.1,
+                -10.0,     -fmax(f64), -10.0,      -fmin(f64), 0.0,       -10.0,      nan(f64), 1.0,       inf(f64),   inf(f64),  -inf(f64),  tmin(f64), tmin(f64), 0.1,       -0.0,      0.1,
+                -0.0,      0.1,        -10.0,      10.0,       fmax(f64), -fmin(f64), 1.0,      fmax(f64), 1.0,        -10.0,     fmin(f64),  fmax(f64), -1.0,      -0.0,      -0.0,      fmax(f64),
+                -inf(f64), -inf(f64),  -tmin(f64), -fmax(f64), -nan(f64), tmin(f64),  -1.0,     0.0,       -inf(f64),  fmax(f64), nan(f64),   -inf(f64), fmin(f64), -nan(f64), -nan(f64), -10.0,
+            }, .{
+                nan(f64),  -1.0, 0.0,       -10.0,      -fmax(f64), -fmin(f64), -nan(f64),  -tmin(f64), 0.1,        -1.0,      -nan(f64),  -fmax(f64), 0.0,       0.0,      10.0,      inf(f64),
+                fmin(f64), 0.0,  -10.0,     1.0,        -tmin(f64), -inf(f64),  -fmax(f64), 0.0,        -fmin(f64), -1.0,      -fmin(f64), tmin(f64),  1.0,       -10.0,    fmin(f64), 0.1,
+                inf(f64),  -0.0, tmin(f64), -fmax(f64), -tmin(f64), -fmax(f64), fmin(f64),  -fmax(f64), 0.1,        1.0,       1.0,        0.0,        fmin(f64), nan(f64), -10.0,     tmin(f64),
+                inf(f64),  0.1,  1.0,       -nan(f64),  1.0,        -fmin(f64), fmax(f64),  inf(f64),   fmin(f64),  -inf(f64), -0.0,       0.0,        -1.0,      -0.0,     0.1,       0.1,
+            });
+            try testArgs(@Vector(128, f64), .{
+                nan(f64),   -fmin(f64), fmax(f64),  fmin(f64), -10.0,      nan(f64),  tmin(f64), fmax(f64),  inf(f64),   -nan(f64),  tmin(f64),  -nan(f64), -0.0,       fmin(f64),  fmax(f64),
+                -inf(f64),  inf(f64),   -1.0,       0.0,       0.1,        fmin(f64), 0.0,       0.1,        -1.0,       -inf(f64),  0.1,        fmax(f64), fmin(f64),  fmax(f64),  -fmax(f64),
+                fmin(f64),  inf(f64),   -fmin(f64), -10.0,     -0.0,       0.1,       nan(f64),  -fmax(f64), -fmax(f64), -1.0,       10.0,       10.0,      -1.0,       -inf(f64),  inf(f64),
+                -fmin(f64), 1.0,        -inf(f64),  -10.0,     0.1,        1.0,       10.0,      10.0,       tmin(f64),  nan(f64),   inf(f64),   0.0,       -1.0,       -10.0,      1.0,
+                -tmin(f64), -fmax(f64), -nan(f64),  10.0,      0.1,        tmin(f64), 0.0,       10.0,       0.1,        -tmin(f64), -tmin(f64), 1.0,       -fmax(f64), nan(f64),   -fmin(f64),
+                nan(f64),   10.0,       -1.0,       -0.0,      -tmin(f64), nan(f64),  10.0,      10.0,       -inf(f64),  0.1,        -nan(f64),  -10.0,     -tmin(f64), -fmax(f64), -fmax(f64),
+                inf(f64),   -inf(f64),  tmin(f64),  1.0,       -inf(f64),  -10.0,     inf(f64),  0.1,        -nan(f64),  -inf(f64),  fmax(f64),  0.1,       -inf(f64),  0.1,        1.0,
+                0.1,        0.1,        0.1,        inf(f64),  -inf(f64),  1.0,       10.0,      10.0,       nan(f64),   10.0,       -tmin(f64), 1.0,       -fmin(f64), -1.0,       -fmax(f64),
+                -fmin(f64), -fmin(f64), -1.0,       inf(f64),  nan(f64),   tmin(f64), 0.1,       -1.0,
+            }, .{
+                0.0,       0.0,        inf(f64),  -0.0,       0.1,        -nan(f64),  10.0,       -nan(f64), tmin(f64),  -10.0,      -0.0,      inf(f64),   -fmin(f64), 0.1,        fmax(f64),
+                nan(f64),  -tmin(f64), tmin(f64), 1.0,        0.1,        -10.0,      -nan(f64),  1.0,       inf(f64),   -10.0,      fmin(f64), 0.1,        10.0,       -10.0,      10.0,
+                -nan(f64), -nan(f64),  0.1,       0.0,        10.0,       -fmax(f64), -tmin(f64), tmin(f64), -1.0,       -tmin(f64), -10.0,     0.1,        -fmax(f64), 10.0,       nan(f64),
+                fmax(f64), -1.0,       -1.0,      -tmin(f64), fmax(f64),  -10.0,      0.1,        1.0,       fmin(f64),  inf(f64),   0.1,       tmin(f64),  0.1,        -fmax(f64), fmax(f64),
+                -10.0,     -fmax(f64), fmax(f64), tmin(f64),  -fmin(f64), inf(f64),   0.1,        -0.0,      fmax(f64),  tmin(f64),  0.1,       1.0,        -inf(f64),  1.0,        10.0,
+                0.1,       0.0,        -10.0,     -nan(f64),  10.0,       -fmin(f64), -tmin(f64), 10.0,      1.0,        -tmin(f64), -1.0,      -fmin(f64), -0.0,       -10.0,      0.1,
+                inf(f64),  -fmax(f64), 0.1,       tmin(f64),  -0.0,       fmax(f64),  0.0,        -nan(f64), -fmin(f64), fmax(f64),  -0.0,      nan(f64),   -inf(f64),  tmin(f64),  0.1,
+                inf(f64),  0.0,        10.0,      -fmax(f64), tmin(f64),  -0.0,       fmin(f64),  -nan(f64), -10.0,      -inf(f64),  nan(f64),  inf(f64),   -0.0,       10.0,       fmax(f64),
+                tmin(f64), -10.0,      -nan(f64), 10.0,       -inf(f64),  -fmax(f64), -inf(f64),  -1.0,
+            });
+            try testArgs(@Vector(69, f64), .{
+                inf(f64),   -0.0,      -fmax(f64), fmax(f64),  fmax(f64), 0.0,      fmin(f64), -nan(f64), 0.1,       0.1,       0.1,        -fmin(f64), inf(f64),   0.1,       fmax(f64),  nan(f64),
+                tmin(f64),  -10.0,     10.0,       -tmin(f64), -0.0,      nan(f64), -10.0,     fmin(f64), 0.0,       -0.0,      0.1,        inf(f64),   -tmin(f64), -nan(f64), inf(f64),   -nan(f64),
+                -inf(f64),  fmax(f64), 0.1,        -fmin(f64), 0.1,       -1.0,     fmin(f64), fmin(f64), fmin(f64), 10.0,      -fmin(f64), nan(f64),   0.0,        0.0,       10.0,       nan(f64),
+                -tmin(f64), tmin(f64), tmin(f64),  fmin(f64),  -0.0,      -1.0,     0.1,       1.0,       fmax(f64), tmin(f64), fmin(f64),  0.0,        -fmin(f64), fmin(f64), -tmin(f64), 0.0,
+                -nan(f64),  10.0,      -1.0,       0.1,        0.0,
+            }, .{
+                -10.0,      -0.0,       fmin(f64), -fmin(f64), nan(f64),  10.0,     -tmin(f64), -fmax(f64), 10.0,      0.1,      -fmin(f64), inf(f64),  -inf(f64),  -tmin(f64), 1.0,        tmin(f64),
+                -tmin(f64), -nan(f64),  fmax(f64), 0.0,        -1.0,      10.0,     inf(f64),   fmin(f64),  fmax(f64), 0.1,      0.1,        fmax(f64), -inf(f64),  0.1,        0.1,        fmin(f64),
+                0.1,        fmin(f64),  -10.0,     nan(f64),   0.0,       0.0,      fmax(f64),  -inf(f64),  tmin(f64), inf(f64), -tmin(f64), fmax(f64), -inf(f64),  -10.0,      -1.0,       fmin(f64),
+                0.1,        -nan(f64),  fmax(f64), -fmin(f64), fmax(f64), nan(f64), -0.0,       -fmax(f64), 10.0,      nan(f64), inf(f64),   -1.0,      -fmin(f64), nan(f64),   -fmin(f64), -0.0,
+                -nan(f64),  -fmin(f64), 0.1,       nan(f64),   0.1,
+            });
+
+            try testArgs(@Vector(1, f80), .{
+                -nan(f80),
+            }, .{
+                -1.0,
+            });
+            try testArgs(@Vector(2, f80), .{
+                -fmax(f80), -inf(f80),
+            }, .{
+                0.1, 10.0,
+            });
+            try testArgs(@Vector(4, f80), .{
+                -0.0, -inf(f80), 0.1, 10.0,
+            }, .{
+                -1.0, 0.0, 0.1, -10.0,
+            });
+            try testArgs(@Vector(8, f80), .{
+                1.0, -0.0, -inf(f80), 0.1, -inf(f80), fmin(f80), 0.0, 10.0,
+            }, .{
+                -0.0, -fmin(f80), fmin(f80), -nan(f80), nan(f80), inf(f80), fmin(f80), 10.0,
+            });
+            try testArgs(@Vector(16, f80), .{
+                10.0, inf(f80), -fmin(f80), 0.1, -tmin(f80), -0.0, -inf(f80), -1.0, -fmax(f80), -nan(f80), -tmin(f80), 10.0, 10.0, -inf(f80), -fmax(f80), fmax(f80),
+            }, .{
+                -inf(f80), nan(f80), -fmax(f80), fmin(f80), 1.0, 0.1, -inf(f80), nan(f80), 0.1, nan(f80), -inf(f80), nan(f80), tmin(f80), 0.1, -tmin(f80), -10.0,
+            });
+            try testArgs(@Vector(32, f80), .{
+                inf(f80),  -0.0, 0.1,      -0.0, 0.1,      -fmin(f80), -0.0,       fmax(f80), nan(f80),  -tmin(f80), nan(f80), -10.0,      0.0,       1.0,        10.0, -fmin(f80),
+                fmin(f80), 0.1,  inf(f80), -0.0, nan(f80), tmin(f80),  -tmin(f80), fmin(f80), tmin(f80), -0.0,       nan(f80), -fmax(f80), tmin(f80), -fmin(f80), 1.0,  tmin(f80),
+            }, .{
+                0.0, -10.0,    fmax(f80), -inf(f80),  0.1,       -inf(f80), inf(f80),   10.0, -1.0,  -10.0,     -fmin(f80), 0.0,  inf(f80),   1.0,        -nan(f80), 0.0,
+                0.1, nan(f80), 1.0,       -fmax(f80), fmin(f80), -inf(f80), -fmax(f80), 0.1,  -10.0, tmin(f80), fmax(f80),  -0.0, -fmin(f80), -fmin(f80), fmin(f80), -tmin(f80),
+            });
+            try testArgs(@Vector(64, f80), .{
+                -fmax(f80), 0.1,       -1.0,       1.0,        inf(f80),   0.1,       -10.0,     0.1,       fmin(f80), -fmin(f80), -10.0,     -fmax(f80), 0.0,        -10.0,     -1.0,       -nan(f80),
+                0.0,        0.1,       -1.0,       -tmin(f80), 1.0,        tmin(f80), fmax(f80), 0.0,       -10.0,     -tmin(f80), fmax(f80), -0.0,       0.1,        -inf(f80), -fmax(f80), -1.0,
+                -nan(f80),  tmin(f80), -tmin(f80), -0.0,       -0.0,       -1.0,      -0.0,      fmax(f80), inf(f80),  -nan(f80),  0.1,       -inf(f80),  -tmin(f80), nan(f80),  0.1,        10.0,
+                nan(f80),   -inf(f80), 0.1,        tmin(f80),  -fmin(f80), 10.0,      -10.0,     tmin(f80), fmin(f80), nan(f80),   0.1,       -nan(f80),  tmin(f80),  nan(f80),  fmax(f80),  -fmax(f80),
+            }, .{
+                -nan(f80), -fmax(f80), tmin(f80), -inf(f80),  -tmin(f80), fmin(f80), -nan(f80), -fmin(f80), fmax(f80), inf(f80), -0.0,      -1.0, 0.1,        -fmax(f80), 1.0,       -inf(f80),
+                0.0,       -nan(f80),  -10.0,     -1.0,       -nan(f80),  inf(f80),  1.0,       -nan(f80),  10.0,      inf(f80), tmin(f80), 0.1,  tmin(f80),  -tmin(f80), -inf(f80), -fmin(f80),
+                fmax(f80), fmax(f80),  0.1,       -tmin(f80), -nan(f80),  -1.0,      fmin(f80), -nan(f80),  -nan(f80), inf(f80), -1.0,      0.1,  -fmin(f80), -tmin(f80), 0.0,       -0.0,
+                0.1,       -fmin(f80), -inf(f80), -1.0,       -tmin(f80), 1.0,       -inf(f80), -0.0,       0.0,       1.0,      tmin(f80), 0.0,  0.1,        -nan(f80),  fmax(f80), 1.0,
+            });
+            try testArgs(@Vector(128, f80), .{
+                0.1,       -0.0,       0.1,        0.0,        fmin(f80),  -1.0,      1.0,       -inf(f80),  fmax(f80),  -fmin(f80), nan(f80),   10.0,       0.1,        0.1,        -fmin(f80), -inf(f80),
+                -1.0,      -inf(f80),  1.0,        -fmin(f80), inf(f80),   -nan(f80), 10.0,      inf(f80),   tmin(f80),  nan(f80),   -10.0,      inf(f80),   10.0,       inf(f80),   -10.0,      0.0,
+                -10.0,     fmin(f80),  -tmin(f80), 1.0,        -fmax(f80), nan(f80),  0.0,       fmax(f80),  0.1,        -1.0,       -fmin(f80), inf(f80),   -tmin(f80), nan(f80),   -tmin(f80), 10.0,
+                -10.0,     -tmin(f80), -1.0,       -tmin(f80), -fmax(f80), 10.0,      -1.0,      -inf(f80),  -nan(f80),  0.0,        1.0,        fmax(f80),  -tmin(f80), -fmin(f80), fmin(f80),  fmin(f80),
+                -10.0,     -fmax(f80), -tmin(f80), inf(f80),   1.0,        0.0,       tmin(f80), -nan(f80),  -fmin(f80), 0.1,        -nan(f80),  0.0,        0.1,        -10.0,      -0.0,       -nan(f80),
+                1.0,       10.0,       -10.0,      fmin(f80),  -nan(f80),  fmax(f80), -0.0,      1.0,        inf(f80),   1.0,        -fmin(f80), -fmin(f80), 0.0,        0.1,        inf(f80),   10.0,
+                tmin(f80), -1.0,       fmax(f80),  -0.0,       fmax(f80),  fmax(f80), 0.1,       -fmin(f80), -10.0,      1.0,        -fmin(f80), -fmax(f80), fmin(f80),  -fmax(f80), -0.0,       -1.0,
+                -nan(f80), -inf(f80),  nan(f80),   -fmax(f80), inf(f80),   -inf(f80), -nan(f80), fmin(f80),  nan(f80),   -1.0,       tmin(f80),  tmin(f80),  0.1,        10.0,       -tmin(f80), -nan(f80),
+            }, .{
+                -1.0,       -0.0,      0.0,        fmax(f80),  -1.0,       -0.0,       0.1,        tmin(f80),  -inf(f80),  10.0,       -0.0,       0.1,       -tmin(f80), -fmax(f80), tmin(f80), inf(f80),
+                0.1,        1.0,       tmin(f80),  nan(f80),   -fmax(f80), 10.0,       fmin(f80),  -1.0,       -fmax(f80), nan(f80),   -fmin(f80), 10.0,      -1.0,       tmin(f80),  inf(f80),  -0.0,
+                tmin(f80),  1.0,       0.0,        -fmin(f80), 0.0,        10.0,       -fmax(f80), -0.0,       -inf(f80),  fmin(f80),  -0.0,       -0.0,      -0.0,       -fmax(f80), 0.1,       fmax(f80),
+                -tmin(f80), tmin(f80), -fmax(f80), 10.0,       -fmax(f80), 0.1,        fmax(f80),  -10.0,      0.1,        1.0,        -1.0,       -1.0,      nan(f80),   -nan(f80),  10.0,      -nan(f80),
+                nan(f80),   -10.0,     -tmin(f80), fmin(f80),  -tmin(f80), -fmin(f80), tmin(f80),  -0.0,       0.1,        fmax(f80),  tmin(f80),  tmin(f80), nan(f80),   0.1,        10.0,      0.1,
+                inf(f80),   inf(f80),  1.0,        -inf(f80),  -fmax(f80), 0.0,        1.0,        -fmax(f80), fmax(f80),  nan(f80),   fmin(f80),  0.1,       -1.0,       1.0,        0.1,       -tmin(f80),
+                10.0,       0.1,       -fmax(f80), 0.0,        nan(f80),   -tmin(f80), 0.1,        fmax(f80),  fmax(f80),  0.1,        -1.0,       inf(f80),  nan(f80),   10.0,       fmax(f80), -nan(f80),
+                -10.0,      -1.0,      tmin(f80),  fmin(f80),  inf(f80),   fmax(f80),  -fmin(f80), fmin(f80),  -inf(f80),  -tmin(f80), 1.0,        nan(f80),  -fmin(f80), -fmin(f80), fmax(f80), 1.0,
+            });
+            try testArgs(@Vector(69, f80), .{
+                -10.0,      tmin(f80), 0.1,        -nan(f80), -inf(f80), -nan(f80), fmin(f80), -0.0,       10.0,  fmax(f80), -fmin(f80), 0.1,        -nan(f80),  inf(f80), 1.0,       -1.0,
+                inf(f80),   fmin(f80), -fmax(f80), 0.1,       nan(f80),  0.0,       0.0,       nan(f80),   -10.0, fmax(f80), fmin(f80),  -fmax(f80), 1.0,        0.1,      0.0,       -fmin(f80),
+                -tmin(f80), 0.0,       -10.0,      fmin(f80), 1.0,       10.0,      0.1,       nan(f80),   -10.0, fmax(f80), 0.1,        fmin(f80),  -inf(f80),  0.0,      tmin(f80), inf(f80),
+                fmax(f80),  1.0,       0.1,        nan(f80),  inf(f80),  tmin(f80), tmin(f80), -fmax(f80), 0.0,   fmin(f80), -inf(f80),  0.1,        -tmin(f80), 0.1,      -1.0,      0.1,
+                -fmax(f80), -1.0,      0.1,        -1.0,      fmax(f80),
+            }, .{
+                -1.0,      fmin(f80),  inf(f80),   -nan(f80), -0.0,       fmin(f80),  -0.0, nan(f80),  -fmax(f80), 0.1,        1.0,        -10.0,      -tmin(f80), -fmin(f80), 10.0,      inf(f80),
+                -10.0,     -tmin(f80), -fmin(f80), 10.0,      0.0,        -tmin(f80), 10.0, -10.0,     0.1,        0.1,        tmin(f80),  fmax(f80),  0.0,        0.1,        0.1,       -10.0,
+                fmin(f80), nan(f80),   -10.0,      -10.0,     -10.0,      0.0,        -0.0, 0.1,       fmin(f80),  fmin(f80),  -0.0,       -fmin(f80), -nan(f80),  -inf(f80),  0.0,       -inf(f80),
+                inf(f80),  fmax(f80),  -tmin(f80), inf(f80),  0.1,        -nan(f80),  0.1,  tmin(f80), -10.0,      -fmax(f80), -fmax(f80), inf(f80),   -nan(f80),  1.0,        -inf(f80), 10.0,
+                nan(f80),  10.0,       -10.0,      0.0,       -fmin(f80),
+            });
+
+            try testArgs(@Vector(1, f128), .{
+                -nan(f128),
+            }, .{
+                -0.0,
+            });
+            try testArgs(@Vector(2, f128), .{
+                0.0, -inf(f128),
+            }, .{
+                0.1, -fmin(f128),
+            });
+            try testArgs(@Vector(4, f128), .{
+                0.1, fmax(f128), 10.0, -fmax(f128),
+            }, .{
+                -tmin(f128), fmax(f128), -0.0, -0.0,
+            });
+            try testArgs(@Vector(8, f128), .{
+                10.0, -fmin(f128), 0.0, -inf(f128), 10.0, -0.0, -1.0, -fmin(f128),
+            }, .{
+                fmin(f128), tmin(f128), -1.0, -10.0, 0.0, -tmin(f128), 0.0, 0.1,
+            });
+            try testArgs(@Vector(16, f128), .{
+                -fmin(f128), -10.0, -fmin(f128), 0.1, -10.0, 1.0, -fmax(f128), tmin(f128), -nan(f128), -tmin(f128), 10.0, -inf(f128), -1.0, tmin(f128), -0.0, nan(f128),
+            }, .{
+                -fmax(f128), fmin(f128), inf(f128), tmin(f128), -10.0, 10.0, fmax(f128), 1.0, -inf(f128), -inf(f128), -fmax(f128), -nan(f128), 1.0, -inf(f128), tmin(f128), tmin(f128),
+            });
+            try testArgs(@Vector(32, f128), .{
+                -0.0,       -1.0, 1.0,        -fmax(f128), -fmax(f128), 0.1,         -fmin(f128), -fmin(f128), -1.0,       -tmin(f128), -0.0,       -fmax(f128), tmin(f128), inf(f128), 0.0,  fmax(f128),
+                -nan(f128), -0.0, -inf(f128), -1.0,        0.1,         -fmin(f128), tmin(f128),  -10.0,       fmax(f128), -nan(f128),  -nan(f128), -fmax(f128), 0.1,        inf(f128), -0.0, tmin(f128),
+            }, .{
+                -1.0,       -10.0,      -fmin(f128), -fmin(f128), inf(f128),  tmin(f128), nan(f128), 0.0,        -fmin(f128), 0.1, -nan(f128), 0.1, -0.0, tmin(f128), 1.0,         0.0,
+                fmin(f128), fmax(f128), -fmax(f128), -tmin(f128), fmin(f128), -0.0,       -1.0,      -nan(f128), -inf(f128),  1.0, nan(f128),  1.0, 0.1,  -0.0,       -fmax(f128), -10.0,
+            });
+            try testArgs(@Vector(64, f128), .{
+                -1.0,       -0.0,       nan(f128),   0.1,         -10.0,       0.0,         1.0,         1.0,       -inf(f128), fmin(f128),  fmax(f128), nan(f128),  -nan(f128), inf(f128),   -0.0,
+                0.1,        -inf(f128), -fmax(f128), 10.0,        -tmin(f128), -tmin(f128), -fmax(f128), 1.0,       0.1,        0.1,         nan(f128),  10.0,       1.0,        -tmin(f128), 10.0,
+                -nan(f128), fmax(f128), fmax(f128),  0.0,         fmax(f128),  inf(f128),   1.0,         -0.0,      0.1,        -tmin(f128), fmin(f128), fmax(f128), tmin(f128), inf(f128),   -10.0,
+                -1.0,       -1.0,       -1.0,        -inf(f128),  10.0,        -tmin(f128), nan(f128),   nan(f128), 0.1,        fmin(f128),  0.1,        tmin(f128), -10.0,      0.1,         10.0,
+                fmax(f128), fmax(f128), 0.1,         -fmax(f128),
+            }, .{
+                -0.0,      0.1,        -0.0,      -fmin(f128), 10.0, 0.0,        1.0,         -inf(f128), tmin(f128),  -1.0,      fmin(f128),  -nan(f128), -10.0,      0.1,        -10.0,      0.1,
+                0.1,       tmin(f128), nan(f128), -1.0,        0.0,  -10.0,      -10.0,       fmax(f128), -fmax(f128), inf(f128), -nan(f128),  0.1,        -nan(f128), 1.0,        fmax(f128), inf(f128),
+                nan(f128), fmin(f128), 10.0,      inf(f128),   0.0,  -inf(f128), 0.1,         0.1,        0.1,         -1.0,      0.1,         -10.0,      inf(f128),  -nan(f128), 0.1,        inf(f128),
+                inf(f128), inf(f128),  -10.0,     -tmin(f128), 0.1,  -inf(f128), -fmin(f128), 1.0,        -tmin(f128), 1.0,       -tmin(f128), -inf(f128), -0.0,       -nan(f128), -1.0,       -fmax(f128),
+            });
+            try testArgs(@Vector(128, f128), .{
+                -inf(f128),  tmin(f128),  -fmax(f128), 1.0,         fmin(f128),  -fmax(f128), -1.0,        0.1,         -fmax(f128), -fmin(f128), -10.0,       nan(f128),   0.1,        nan(f128),
+                inf(f128),   -1.0,        tmin(f128),  -inf(f128),  0.0,         fmax(f128),  tmin(f128),  -fmin(f128), fmin(f128),  -10.0,       -fmin(f128), -10.0,       1.0,        -nan(f128),
+                -inf(f128),  fmin(f128),  inf(f128),   -tmin(f128), 0.1,         0.0,         10.0,        1.0,         -tmin(f128), -tmin(f128), tmin(f128),  1.0,         fmin(f128), 0.1,
+                0.1,         0.1,         fmax(f128),  0.1,         inf(f128),   0.0,         fmin(f128),  -fmin(f128), 10.0,        10.0,        -10.0,       tmin(f128),  inf(f128),  inf(f128),
+                -fmin(f128), 0.0,         0.1,         -nan(f128),  0.1,         -inf(f128),  -nan(f128),  -1.0,        fmin(f128),  -0.0,        10.0,        -tmin(f128), 10.0,       1.0,
+                0.1,         -0.0,        -tmin(f128), 0.1,         -1.0,        -tmin(f128), -fmin(f128), tmin(f128),  0.1,         -tmin(f128), -nan(f128),  -10.0,       -inf(f128), 0.0,
+                0.1,         0.0,         -fmin(f128), 0.0,         10.0,        10.0,        tmin(f128),  inf(f128),   -nan(f128),  -inf(f128),  -1.0,        -fmin(f128), -10.0,      -fmin(f128),
+                -inf(f128),  -fmax(f128), tmin(f128),  tmin(f128),  -fmin(f128), 0.1,         fmin(f128),  fmin(f128),  -fmin(f128), nan(f128),   -1.0,        -0.0,        -0.0,       0.1,
+                fmax(f128),  0.0,         -fmax(f128), nan(f128),   nan(f128),   nan(f128),   nan(f128),   -nan(f128),  fmin(f128),  -inf(f128),  inf(f128),   -fmax(f128), -10.0,      fmin(f128),
+                0.1,         fmax(f128),
+            }, .{
+                0.0,         10.0,        0.1,         inf(f128),   -0.0,        -1.0,        nan(f128),  -10.0,       -inf(f128),  0.1,         -tmin(f128), 1.0,         inf(f128),   0.1,         -1.0,
+                10.0,        0.0,         1.0,         nan(f128),   tmin(f128),  fmax(f128),  10.0,       0.1,         0.1,         -fmin(f128), -inf(f128),  -nan(f128),  -fmin(f128), -0.0,        -inf(f128),
+                -nan(f128),  fmax(f128),  -fmin(f128), -tmin(f128), -fmin(f128), -fmax(f128), nan(f128),  fmin(f128),  -fmax(f128), fmax(f128),  1.0,         10.0,        -fmax(f128), nan(f128),   -fmax(f128),
+                -inf(f128),  nan(f128),   -nan(f128),  tmin(f128),  -1.0,        0.1,         0.1,        -1.0,        -nan(f128),  fmax(f128),  10.0,        -inf(f128),  10.0,        -0.0,        -1.0,
+                -0.0,        -tmin(f128), 10.0,        -1.0,        -fmax(f128), fmin(f128),  fmax(f128), tmin(f128),  10.0,        fmin(f128),  -nan(f128),  1.0,         -tmin(f128), -1.0,        fmax(f128),
+                1.0,         -tmin(f128), 0.1,         -nan(f128),  inf(f128),   0.1,         0.1,        fmax(f128),  -fmin(f128), fmin(f128),  -0.0,        fmax(f128),  -fmax(f128), -tmin(f128), tmin(f128),
+                nan(f128),   0.1,         tmin(f128),  -1.0,        fmin(f128),  -nan(f128),  fmax(f128), 1.0,         nan(f128),   -nan(f128),  inf(f128),   -fmin(f128), fmin(f128),  0.1,         10.0,
+                -tmin(f128), -10.0,       0.0,         0.1,         -fmin(f128), -0.0,        0.0,        -10.0,       fmax(f128),  nan(f128),   nan(f128),   -fmin(f128), -fmax(f128), 10.0,        0.0,
+                fmin(f128),  10.0,        -tmin(f128), -tmin(f128), 0.0,         -10.0,       1.0,        -fmin(f128),
+            });
+            try testArgs(@Vector(69, f128), .{
+                -1.0,       nan(f128),  0.1,        0.1,        0.1,        -1.0, -10.0,      inf(f128), -0.0,       inf(f128),  tmin(f128),  0.0,         -fmax(f128), -tmin(f128), -10.0,       -fmax(f128),
+                -0.0,       0.0,        nan(f128),  inf(f128),  1.0,        -1.0, 0.1,        -0.0,      1.0,        fmax(f128), -fmax(f128), 0.0,         inf(f128),   -inf(f128),  -tmin(f128), -inf(f128),
+                10.0,       fmin(f128), 10.0,       -10.0,      0.1,        1.0,  -0.0,       nan(f128), tmin(f128), inf(f128),  inf(f128),   -nan(f128),  -nan(f128),  1.0,         -tmin(f128), 0.0,
+                fmin(f128), fmax(f128), fmin(f128), -10.0,      nan(f128),  0.0,  -nan(f128), -0.0,      -nan(f128), 0.1,        -10.0,       -tmin(f128), fmax(f128),  1.0,         fmin(f128),  fmax(f128),
+                nan(f128),  -inf(f128), 1.0,        fmin(f128), -nan(f128),
+            }, .{
+                -inf(f128), fmax(f128), 0.0,        nan(f128),   -10.0,       tmin(f128),  nan(f128),  1.0,       10.0,        -fmin(f128), fmin(f128),  tmin(f128),  0.0,         -fmin(f128), -0.0,        fmin(f128),
+                inf(f128),  inf(f128),  fmin(f128), fmin(f128),  -tmin(f128), -fmax(f128), 10.0,       nan(f128), -0.0,        1.0,         10.0,        -10.0,       -inf(f128),  fmin(f128),  -fmax(f128), 0.1,
+                -1.0,       -nan(f128), -10.0,      tmin(f128),  inf(f128),   nan(f128),   0.0,        -10.0,     tmin(f128),  0.0,         -fmax(f128), -tmin(f128), 0.1,         0.1,         10.0,        0.1,
+                fmax(f128), 0.1,        0.0,        -fmin(f128), -inf(f128),  -inf(f128),  -nan(f128), 0.1,       -fmax(f128), fmax(f128),  -fmax(f128), -0.0,        -tmin(f128), -1.0,        nan(f128),   0.1,
+                -1.0,       -inf(f128), tmin(f128), inf(f128),   inf(f128),
+            });
+        }
     };
 }
 
@@ -3848,6 +4238,7 @@ test min {
     try t.testIntTypes();
     try t.testIntVectorTypes();
     try t.testFloatTypes();
+    try t.testFloatVectorTypes();
 }
 
 inline fn max(comptime Type: type, lhs: Type, rhs: Type) Type {
@@ -3858,4 +4249,5 @@ test max {
     try t.testIntTypes();
     try t.testIntVectorTypes();
     try t.testFloatTypes();
+    try t.testFloatVectorTypes();
 }