Commit d652dd0658

Jacob Young <jacobly0@users.noreply.github.com>
2025-01-21 03:18:56
x86_64: rewrite `@abs` for scalar floats
1 parent f1ce1af
Changed files (4)
src
test
behavior
x86_64
src/arch/x86_64/abi.zig
@@ -427,7 +427,7 @@ pub const zigcc = struct {
 
     const int_param_regs = gp_regs[0 .. volatile_gpr - 1];
     const x87_param_regs = x87_regs[0..volatile_x87];
-    const sse_param_regs = sse_avx_regs[0..volatile_sse];
+    const sse_param_regs = sse_avx_regs[0 .. volatile_sse / 2];
     const int_return_regs = gp_regs[0..volatile_gpr];
     const x87_return_regs = x87_regs[0..volatile_x87];
     const sse_return_regs = sse_avx_regs[0..volatile_gpr];
@@ -443,11 +443,11 @@ pub const SysV = struct {
     pub const caller_preserved_regs = [_]Register{ .rax, .rcx, .rdx, .rsi, .rdi, .r8, .r9, .r10, .r11 } ++ x87_regs ++ sse_avx_regs;
 
     pub const c_abi_int_param_regs = [_]Register{ .rdi, .rsi, .rdx, .rcx, .r8, .r9 };
-    pub const c_abi_x87_param_regs = x87_regs[0..0].*;
-    pub const c_abi_sse_param_regs = sse_avx_regs[0..8].*;
+    pub const c_abi_x87_param_regs = x87_regs[0..0];
+    pub const c_abi_sse_param_regs = sse_avx_regs[0..8];
     pub const c_abi_int_return_regs = [_]Register{ .rax, .rdx };
-    pub const c_abi_x87_return_regs = x87_regs[0..2].*;
-    pub const c_abi_sse_return_regs = sse_avx_regs[0..4].*;
+    pub const c_abi_x87_return_regs = x87_regs[0..2];
+    pub const c_abi_sse_return_regs = sse_avx_regs[0..4];
 };
 
 pub const Win64 = struct {
@@ -460,11 +460,11 @@ pub const Win64 = struct {
     pub const caller_preserved_regs = [_]Register{ .rax, .rcx, .rdx, .r8, .r9, .r10, .r11 } ++ x87_regs ++ sse_avx_regs;
 
     pub const c_abi_int_param_regs = [_]Register{ .rcx, .rdx, .r8, .r9 };
-    pub const c_abi_x87_param_regs = x87_regs[0..0].*;
-    pub const c_abi_sse_param_regs = sse_avx_regs[0..4].*;
+    pub const c_abi_x87_param_regs = x87_regs[0..0];
+    pub const c_abi_sse_param_regs = sse_avx_regs[0..4];
     pub const c_abi_int_return_regs = [_]Register{.rax};
-    pub const c_abi_x87_return_regs = x87_regs[0..0].*;
-    pub const c_abi_sse_return_regs = sse_avx_regs[0..1].*;
+    pub const c_abi_x87_return_regs = x87_regs[0..0];
+    pub const c_abi_sse_return_regs = sse_avx_regs[0..1];
 };
 
 pub fn getCalleePreservedRegs(cc: std.builtin.CallingConvention.Tag) []const Register {
@@ -497,17 +497,21 @@ pub fn getCAbiIntParamRegs(cc: std.builtin.CallingConvention.Tag) []const Regist
 pub fn getCAbiX87ParamRegs(cc: std.builtin.CallingConvention.Tag) []const Register {
     return switch (cc) {
         .auto => zigcc.x87_param_regs,
-        .x86_64_sysv => &SysV.c_abi_x87_param_regs,
-        .x86_64_win => &Win64.c_abi_x87_param_regs,
+        .x86_64_sysv => SysV.c_abi_x87_param_regs,
+        .x86_64_win => Win64.c_abi_x87_param_regs,
         else => unreachable,
     };
 }
 
-pub fn getCAbiSseParamRegs(cc: std.builtin.CallingConvention.Tag) []const Register {
+pub fn getCAbiSseParamRegs(cc: std.builtin.CallingConvention.Tag, target: *const std.Target) []const Register {
     return switch (cc) {
-        .auto => zigcc.sse_param_regs,
-        .x86_64_sysv => &SysV.c_abi_sse_param_regs,
-        .x86_64_win => &Win64.c_abi_sse_param_regs,
+        .auto => switch (target.cpu.arch) {
+            else => unreachable,
+            .x86 => zigcc.sse_param_regs[0 .. zigcc.sse_param_regs.len / 2],
+            .x86_64 => zigcc.sse_param_regs,
+        },
+        .x86_64_sysv => SysV.c_abi_sse_param_regs,
+        .x86_64_win => Win64.c_abi_sse_param_regs,
         else => unreachable,
     };
 }
@@ -524,8 +528,8 @@ pub fn getCAbiIntReturnRegs(cc: std.builtin.CallingConvention.Tag) []const Regis
 pub fn getCAbiX87ReturnRegs(cc: std.builtin.CallingConvention.Tag) []const Register {
     return switch (cc) {
         .auto => zigcc.x87_return_regs,
-        .x86_64_sysv => &SysV.c_abi_x87_return_regs,
-        .x86_64_win => &Win64.c_abi_x87_return_regs,
+        .x86_64_sysv => SysV.c_abi_x87_return_regs,
+        .x86_64_win => Win64.c_abi_x87_return_regs,
         else => unreachable,
     };
 }
@@ -533,8 +537,8 @@ pub fn getCAbiX87ReturnRegs(cc: std.builtin.CallingConvention.Tag) []const Regis
 pub fn getCAbiSseReturnRegs(cc: std.builtin.CallingConvention.Tag) []const Register {
     return switch (cc) {
         .auto => zigcc.sse_return_regs,
-        .x86_64_sysv => &SysV.c_abi_sse_return_regs,
-        .x86_64_win => &Win64.c_abi_sse_return_regs,
+        .x86_64_sysv => SysV.c_abi_sse_return_regs,
+        .x86_64_win => Win64.c_abi_sse_return_regs,
         else => unreachable,
     };
 }
src/arch/x86_64/CodeGen.zig
@@ -3257,7 +3257,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .{ .kind = .{ .umax_mem = .{ .ref = .src0 } } },
                         .unused,
                         .unused,
                         .unused,
@@ -3288,7 +3288,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .{ .kind = .{ .umax_mem = .{ .ref = .src0 } } },
                         .unused,
                         .unused,
                         .unused,
@@ -3319,7 +3319,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .{ .kind = .{ .umax_mem = .{ .ref = .src0 } } },
                         .unused,
                         .unused,
                         .unused,
@@ -3338,7 +3338,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .{ .kind = .{ .umax_mem = .{ .ref = .src0 } } },
                         .unused,
                         .unused,
                         .unused,
@@ -3369,7 +3369,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .{ .kind = .{ .umax_mem = .{ .ref = .src0 } } },
                         .unused,
                         .unused,
                         .unused,
@@ -3400,7 +3400,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .{ .kind = .{ .umax_mem = .{ .ref = .src0 } } },
                         .unused,
                         .unused,
                         .unused,
@@ -3988,7 +3988,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .{ .kind = .{ .umax_mem = .{ .ref = .src0 } } },
                         .unused,
                         .unused,
                         .unused,
@@ -4019,7 +4019,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .{ .kind = .{ .umax_mem = .{ .ref = .src0 } } },
                         .unused,
                         .unused,
                         .unused,
@@ -4050,7 +4050,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .{ .kind = .{ .umax_mem = .{ .ref = .src0 } } },
                         .unused,
                         .unused,
                         .unused,
@@ -4069,7 +4069,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .{ .kind = .{ .umax_mem = .{ .ref = .src0 } } },
                         .unused,
                         .unused,
                         .unused,
@@ -4100,7 +4100,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .{ .kind = .{ .umax_mem = .{ .ref = .src0 } } },
                         .unused,
                         .unused,
                         .unused,
@@ -4131,7 +4131,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     },
                     .extra_temps = .{
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .{ .kind = .{ .umax_mem = .{ .ref = .src0 } } },
                         .unused,
                         .unused,
                         .unused,
@@ -4151,7 +4151,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .{ .kind = .{ .umax_mem = .{ .ref = .src0 } } },
                         .unused,
                         .unused,
                     },
@@ -4173,7 +4173,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
                         .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
                         .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
-                        .{ .kind = .{ .umax_mem = .src0 } },
+                        .{ .kind = .{ .umax_mem = .{ .ref = .src0 } } },
                         .unused,
                         .unused,
                     },
@@ -8793,7 +8793,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
 
             .abs => |air_tag| if (use_old) try cg.airAbs(inst) else fallback: {
                 const ty_op = air_datas[@intFromEnum(inst)].ty_op;
-                if (ty_op.ty.toType().scalarType(zcu).isRuntimeFloat()) break :fallback try cg.airAbs(inst);
+                if (ty_op.ty.toType().isVector(zcu) and ty_op.ty.toType().childType(zcu).isRuntimeFloat()) break :fallback try cg.airAbs(inst);
                 var ops = try cg.tempsFromOperands(inst, .{ty_op.operand});
                 var res: [1]Temp = undefined;
                 cg.select(&res, &.{ty_op.ty.toType()}, &ops, comptime &.{ .{
@@ -8991,16 +8991,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .add, .tmp0p, .si(8), ._, ._ },
                         .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
                     } },
-                }, .{
-                    .required_features = .{ .mmx, .ssse3, null, null },
-                    .src_constraints = .{ .{ .scalar_int = .{ .of = .qword, .is = .byte } }, .any },
-                    .patterns = &.{
-                        .{ .src = .{ .mut_mm, .none } },
-                    },
-                    .dst_temps = .{.{ .ref = .src0 }},
-                    .each = .{ .once = &.{
-                        .{ ._, .p_b, .abs, .dst0q, .src0q, ._, ._ },
-                    } },
                 }, .{
                     .required_features = .{ .mmx, .ssse3, null, null },
                     .src_constraints = .{ .{ .scalar_int = .{ .of = .qword, .is = .byte } }, .any },
@@ -9008,20 +8998,10 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .mem, .none } },
                         .{ .src = .{ .to_mm, .none } },
                     },
-                    .dst_temps = .{.{ .rc = .sse }},
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .mmx } }},
                     .each = .{ .once = &.{
                         .{ ._, .p_b, .abs, .dst0q, .src0q, ._, ._ },
                     } },
-                }, .{
-                    .required_features = .{ .mmx, .ssse3, null, null },
-                    .src_constraints = .{ .{ .scalar_int = .{ .of = .qword, .is = .word } }, .any },
-                    .patterns = &.{
-                        .{ .src = .{ .mut_mm, .none } },
-                    },
-                    .dst_temps = .{.{ .ref = .src0 }},
-                    .each = .{ .once = &.{
-                        .{ ._, .p_w, .abs, .dst0q, .src0q, ._, ._ },
-                    } },
                 }, .{
                     .required_features = .{ .mmx, .ssse3, null, null },
                     .src_constraints = .{ .{ .scalar_int = .{ .of = .qword, .is = .word } }, .any },
@@ -9029,20 +9009,10 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .mem, .none } },
                         .{ .src = .{ .to_mm, .none } },
                     },
-                    .dst_temps = .{.{ .rc = .sse }},
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .mmx } }},
                     .each = .{ .once = &.{
                         .{ ._, .p_w, .abs, .dst0q, .src0q, ._, ._ },
                     } },
-                }, .{
-                    .required_features = .{ .mmx, .ssse3, null, null },
-                    .src_constraints = .{ .{ .scalar_int = .{ .of = .qword, .is = .dword } }, .any },
-                    .patterns = &.{
-                        .{ .src = .{ .mut_mm, .none } },
-                    },
-                    .dst_temps = .{.{ .ref = .src0 }},
-                    .each = .{ .once = &.{
-                        .{ ._, .p_d, .abs, .dst0q, .src0q, ._, ._ },
-                    } },
                 }, .{
                     .required_features = .{ .mmx, .ssse3, null, null },
                     .src_constraints = .{ .{ .scalar_int = .{ .of = .qword, .is = .dword } }, .any },
@@ -9050,196 +9020,106 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ .src = .{ .mem, .none } },
                         .{ .src = .{ .to_mm, .none } },
                     },
-                    .dst_temps = .{.{ .rc = .sse }},
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .mmx } }},
                     .each = .{ .once = &.{
                         .{ ._, .p_d, .abs, .dst0q, .src0q, ._, ._ },
                     } },
-                }, .{
-                    .required_features = .{ .ssse3, null, null, null },
-                    .src_constraints = .{ .{ .scalar_int = .{ .of = .xword, .is = .byte } }, .any },
-                    .patterns = &.{
-                        .{ .src = .{ .mut_xmm, .none } },
-                    },
-                    .dst_temps = .{.{ .ref = .src0 }},
-                    .each = .{ .once = &.{
-                        .{ ._, .p_b, .abs, .dst0x, .src0x, ._, ._ },
-                    } },
                 }, .{
                     .required_features = .{ .ssse3, null, null, null },
                     .src_constraints = .{ .{ .scalar_int = .{ .of = .xword, .is = .byte } }, .any },
                     .patterns = &.{
                         .{ .src = .{ .mem, .none } },
-                        .{ .src = .{ .to_xmm, .none } },
+                        .{ .src = .{ .to_sse, .none } },
                     },
-                    .dst_temps = .{.{ .rc = .sse }},
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .sse } }},
                     .each = .{ .once = &.{
                         .{ ._, .p_b, .abs, .dst0x, .src0x, ._, ._ },
                     } },
-                }, .{
-                    .required_features = .{ .ssse3, null, null, null },
-                    .src_constraints = .{ .{ .scalar_int = .{ .of = .xword, .is = .word } }, .any },
-                    .patterns = &.{
-                        .{ .src = .{ .mut_xmm, .none } },
-                    },
-                    .dst_temps = .{.{ .ref = .src0 }},
-                    .each = .{ .once = &.{
-                        .{ ._, .p_w, .abs, .dst0x, .src0x, ._, ._ },
-                    } },
                 }, .{
                     .required_features = .{ .ssse3, null, null, null },
                     .src_constraints = .{ .{ .scalar_int = .{ .of = .xword, .is = .word } }, .any },
                     .patterns = &.{
                         .{ .src = .{ .mem, .none } },
-                        .{ .src = .{ .to_xmm, .none } },
+                        .{ .src = .{ .to_sse, .none } },
                     },
-                    .dst_temps = .{.{ .rc = .sse }},
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .sse } }},
                     .each = .{ .once = &.{
                         .{ ._, .p_w, .abs, .dst0x, .src0x, ._, ._ },
                     } },
-                }, .{
-                    .required_features = .{ .ssse3, null, null, null },
-                    .src_constraints = .{ .{ .scalar_int = .{ .of = .xword, .is = .dword } }, .any },
-                    .patterns = &.{
-                        .{ .src = .{ .mut_xmm, .none } },
-                    },
-                    .dst_temps = .{.{ .ref = .src0 }},
-                    .each = .{ .once = &.{
-                        .{ ._, .p_d, .abs, .dst0x, .src0x, ._, ._ },
-                    } },
                 }, .{
                     .required_features = .{ .ssse3, null, null, null },
                     .src_constraints = .{ .{ .scalar_int = .{ .of = .xword, .is = .dword } }, .any },
                     .patterns = &.{
                         .{ .src = .{ .mem, .none } },
-                        .{ .src = .{ .to_xmm, .none } },
+                        .{ .src = .{ .to_sse, .none } },
                     },
-                    .dst_temps = .{.{ .rc = .sse }},
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .sse } }},
                     .each = .{ .once = &.{
                         .{ ._, .p_d, .abs, .dst0x, .src0x, ._, ._ },
                     } },
-                }, .{
-                    .required_features = .{ .avx, null, null, null },
-                    .src_constraints = .{ .{ .scalar_int = .{ .of = .xword, .is = .byte } }, .any },
-                    .patterns = &.{
-                        .{ .src = .{ .mut_xmm, .none } },
-                    },
-                    .dst_temps = .{.{ .ref = .src0 }},
-                    .each = .{ .once = &.{
-                        .{ ._, .vp_b, .abs, .dst0x, .src0x, ._, ._ },
-                    } },
                 }, .{
                     .required_features = .{ .avx, null, null, null },
                     .src_constraints = .{ .{ .scalar_int = .{ .of = .xword, .is = .byte } }, .any },
                     .patterns = &.{
                         .{ .src = .{ .mem, .none } },
-                        .{ .src = .{ .to_xmm, .none } },
+                        .{ .src = .{ .to_sse, .none } },
                     },
-                    .dst_temps = .{.{ .rc = .sse }},
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .sse } }},
                     .each = .{ .once = &.{
                         .{ ._, .vp_b, .abs, .dst0x, .src0x, ._, ._ },
                     } },
-                }, .{
-                    .required_features = .{ .avx, null, null, null },
-                    .src_constraints = .{ .{ .scalar_int = .{ .of = .xword, .is = .word } }, .any },
-                    .patterns = &.{
-                        .{ .src = .{ .mut_xmm, .none } },
-                    },
-                    .dst_temps = .{.{ .ref = .src0 }},
-                    .each = .{ .once = &.{
-                        .{ ._, .vp_w, .abs, .dst0x, .src0x, ._, ._ },
-                    } },
                 }, .{
                     .required_features = .{ .avx, null, null, null },
                     .src_constraints = .{ .{ .scalar_int = .{ .of = .xword, .is = .word } }, .any },
                     .patterns = &.{
                         .{ .src = .{ .mem, .none } },
-                        .{ .src = .{ .to_xmm, .none } },
+                        .{ .src = .{ .to_sse, .none } },
                     },
-                    .dst_temps = .{.{ .rc = .sse }},
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .sse } }},
                     .each = .{ .once = &.{
                         .{ ._, .vp_w, .abs, .dst0x, .src0x, ._, ._ },
                     } },
-                }, .{
-                    .required_features = .{ .avx, null, null, null },
-                    .src_constraints = .{ .{ .scalar_int = .{ .of = .xword, .is = .dword } }, .any },
-                    .patterns = &.{
-                        .{ .src = .{ .mut_xmm, .none } },
-                    },
-                    .dst_temps = .{.{ .ref = .src0 }},
-                    .each = .{ .once = &.{
-                        .{ ._, .vp_d, .abs, .dst0x, .src0x, ._, ._ },
-                    } },
                 }, .{
                     .required_features = .{ .avx, null, null, null },
                     .src_constraints = .{ .{ .scalar_int = .{ .of = .xword, .is = .dword } }, .any },
                     .patterns = &.{
                         .{ .src = .{ .mem, .none } },
-                        .{ .src = .{ .to_xmm, .none } },
+                        .{ .src = .{ .to_sse, .none } },
                     },
-                    .dst_temps = .{.{ .rc = .sse }},
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .sse } }},
                     .each = .{ .once = &.{
                         .{ ._, .vp_d, .abs, .dst0x, .src0x, ._, ._ },
                     } },
-                }, .{
-                    .required_features = .{ .avx2, null, null, null },
-                    .src_constraints = .{ .{ .scalar_int = .{ .of = .yword, .is = .byte } }, .any },
-                    .patterns = &.{
-                        .{ .src = .{ .mut_ymm, .none } },
-                    },
-                    .dst_temps = .{.{ .ref = .src0 }},
-                    .each = .{ .once = &.{
-                        .{ ._, .vp_b, .abs, .dst0y, .src0y, ._, ._ },
-                    } },
                 }, .{
                     .required_features = .{ .avx2, null, null, null },
                     .src_constraints = .{ .{ .scalar_int = .{ .of = .yword, .is = .byte } }, .any },
                     .patterns = &.{
                         .{ .src = .{ .mem, .none } },
-                        .{ .src = .{ .to_ymm, .none } },
+                        .{ .src = .{ .to_sse, .none } },
                     },
-                    .dst_temps = .{.{ .rc = .sse }},
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .sse } }},
                     .each = .{ .once = &.{
                         .{ ._, .vp_b, .abs, .dst0y, .src0y, ._, ._ },
                     } },
-                }, .{
-                    .required_features = .{ .avx2, null, null, null },
-                    .src_constraints = .{ .{ .scalar_int = .{ .of = .yword, .is = .word } }, .any },
-                    .patterns = &.{
-                        .{ .src = .{ .mut_ymm, .none } },
-                    },
-                    .dst_temps = .{.{ .ref = .src0 }},
-                    .each = .{ .once = &.{
-                        .{ ._, .vp_w, .abs, .dst0y, .src0y, ._, ._ },
-                    } },
                 }, .{
                     .required_features = .{ .avx2, null, null, null },
                     .src_constraints = .{ .{ .scalar_int = .{ .of = .yword, .is = .word } }, .any },
                     .patterns = &.{
                         .{ .src = .{ .mem, .none } },
-                        .{ .src = .{ .to_ymm, .none } },
+                        .{ .src = .{ .to_sse, .none } },
                     },
-                    .dst_temps = .{.{ .rc = .sse }},
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .sse } }},
                     .each = .{ .once = &.{
                         .{ ._, .vp_w, .abs, .dst0y, .src0y, ._, ._ },
                     } },
-                }, .{
-                    .required_features = .{ .avx2, null, null, null },
-                    .src_constraints = .{ .{ .scalar_int = .{ .of = .yword, .is = .dword } }, .any },
-                    .patterns = &.{
-                        .{ .src = .{ .mut_ymm, .none } },
-                    },
-                    .dst_temps = .{.{ .ref = .src0 }},
-                    .each = .{ .once = &.{
-                        .{ ._, .vp_d, .abs, .dst0y, .src0y, ._, ._ },
-                    } },
                 }, .{
                     .required_features = .{ .avx2, null, null, null },
                     .src_constraints = .{ .{ .scalar_int = .{ .of = .yword, .is = .dword } }, .any },
                     .patterns = &.{
                         .{ .src = .{ .mem, .none } },
-                        .{ .src = .{ .to_ymm, .none } },
+                        .{ .src = .{ .to_sse, .none } },
                     },
-                    .dst_temps = .{.{ .rc = .sse }},
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .sse } }},
                     .each = .{ .once = &.{
                         .{ ._, .vp_d, .abs, .dst0y, .src0y, ._, ._ },
                     } },
@@ -9807,6 +9687,266 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                         .{ ._, ._, .cmp, .tmp0d, .sa(.none, .add_src0_unaligned_size), ._, ._ },
                         .{ ._, ._b, .j, .@"0b", ._, ._, ._ },
                     } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{ .{ .scalar_exact_float = .{ .of = .xword, .is = .dword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_sse, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .smax_mem = .{ .ref = .src0, .vectorize = true } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, ._ps, .@"and", .dst0x, .lea(.xword, .tmp0), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse2, null, null, null },
+                    .src_constraints = .{ .{ .scalar_exact_float = .{ .of = .xword, .is = .qword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_sse, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .smax_mem = .{ .ref = .src0, .vectorize = true } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, ._pd, .@"and", .dst0x, .lea(.xword, .tmp0), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{ .{ .scalar_exact_float = .{ .of = .xword, .is = .qword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_sse, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .smax_mem = .{ .ref = .src0, .vectorize = true } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, ._ps, .@"and", .dst0x, .lea(.xword, .tmp0), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .{ .scalar_exact_float = .{ .of = .xword, .is = .dword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_sse, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .smax_mem = .{ .ref = .src0, .vectorize = true } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .sse } }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, .v_ps, .@"and", .dst0x, .src0x, .lea(.xword, .tmp0), ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .{ .scalar_exact_float = .{ .of = .xword, .is = .qword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_sse, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .smax_mem = .{ .ref = .src0, .vectorize = true } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .sse } }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, .v_pd, .@"and", .dst0x, .src0x, .lea(.xword, .tmp0), ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .{ .scalar_exact_float = .{ .of = .yword, .is = .dword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_sse, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .smax_mem = .{ .ref = .src0, .vectorize = true } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .sse } }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, .v_ps, .@"and", .dst0y, .src0y, .lea(.yword, .tmp0), ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .{ .scalar_exact_float = .{ .of = .yword, .is = .qword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_sse, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .smax_mem = .{ .ref = .src0, .vectorize = true } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .sse } }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, .v_pd, .@"and", .dst0y, .src0y, .lea(.yword, .tmp0), ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .x87, null, null, null },
+                    .src_constraints = .{ .{ .scalar_exact_float = .{ .of = .xword, .is = .tbyte } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_x87, .none } },
+                    },
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .x87 } }},
+                    .clobbers = .{ .st = 1 },
+                    .each = .{ .once = &.{
+                        .{ ._, .f_, .ld, .src0t, ._, ._, ._ },
+                        .{ ._, .f_, .abs, ._, ._, ._, ._ },
+                        .{ ._, .f_p, .st, .dst0t, ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx2, null, null, null },
+                    .src_constraints = .{ .{ .scalar_float = .{ .of = .xword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_sse, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .smax_mem = .{ .ref = .src0, .vectorize = true } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .sse } }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, .vp_, .@"and", .dst0x, .src0x, .lea(.xword, .tmp0), ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .{ .scalar_float = .{ .of = .xword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_sse, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .smax_mem = .{ .ref = .src0, .vectorize = true } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .sse } }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, .vp_, .@"and", .dst0x, .src0x, .lea(.xword, .tmp0), ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse2, null, null, null },
+                    .src_constraints = .{ .{ .scalar_float = .{ .of = .xword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_sse, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .smax_mem = .{ .ref = .src0, .vectorize = true } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, .p_, .@"and", .dst0x, .lea(.xword, .tmp0), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{ .{ .scalar_float = .{ .of = .xword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_sse, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .smax_mem = .{ .ref = .src0, .vectorize = true } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .ref = .src0 }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, ._ps, .@"and", .dst0x, .lea(.xword, .tmp0), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx2, null, null, null },
+                    .src_constraints = .{ .{ .scalar_float = .{ .of = .yword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_sse, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .smax_mem = .{ .ref = .src0, .vectorize = true } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .sse } }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, .vp_, .@"and", .dst0y, .src0y, .lea(.yword, .tmp0), ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .{ .scalar_float = .{ .of = .yword, .is = .xword } }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_sse, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .kind = .{ .smax_mem = .{ .ref = .src0, .vectorize = true } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{.{ .mut_reg = .{ .ref = .src0, .rc = .sse } }},
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ },
+                        .{ ._, .v_pd, .@"and", .dst0y, .src0y, .lea(.yword, .tmp0), ._ },
+                    } },
                 } }) catch |err| switch (err) {
                     error.SelectFailed => return cg.fail("failed to select {s} {} {}", .{
                         @tagName(air_tag),
@@ -21372,10 +21512,11 @@ fn airCmp(self: *CodeGen, inst: Air.Inst.Index, op: std.math.CompareOperator) !v
         switch (ty.zigTypeTag(zcu)) {
             .float => {
                 const float_bits = ty.floatBits(self.target.*);
-                if (switch (float_bits) {
-                    16 => !self.hasFeature(.f16c),
-                    32, 64 => false,
-                    80, 128 => true,
+                if (!switch (float_bits) {
+                    16 => self.hasFeature(.f16c),
+                    32 => self.hasFeature(.sse),
+                    64 => self.hasFeature(.sse2),
+                    80, 128 => false,
                     else => unreachable,
                 }) {
                     var callee_buf: ["__???f2".len]u8 = undefined;
@@ -23640,12 +23781,24 @@ const MoveStrategy = union(enum) {
                 assert(dst_reg != .st7);
                 try self.asmRegister(.{ .f_p, .st }, @enumFromInt(@intFromEnum(dst_reg) + 1));
             },
-            .insert_extract => |ie| try self.asmRegisterMemoryImmediate(
-                ie.insert,
-                dst_reg,
-                src_mem,
-                .u(0),
-            ),
+            .insert_extract => |ie| if (ie.insert[0] != .p_w or self.hasFeature(.sse2))
+                try self.asmRegisterMemoryImmediate(ie.insert, dst_reg, src_mem, .u(0))
+            else {
+                const tmp_frame_index = try self.allocFrameIndex(.init(.{
+                    .size = 16,
+                    .alignment = .@"16",
+                }));
+                const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
+                try self.asmRegisterMemory(.{ ._, .movzx }, tmp_reg.to32(), src_mem);
+                try self.asmMemoryRegister(.{ ._, .mov }, .{
+                    .base = .{ .frame = tmp_frame_index },
+                    .mod = .{ .rm = .{ .size = .word } },
+                }, tmp_reg.to16());
+                try self.asmRegisterMemory(.{ ._ps, .mova }, dst_reg.to128(), .{
+                    .base = .{ .frame = tmp_frame_index },
+                    .mod = .{ .rm = .{ .size = .xword } },
+                });
+            },
             .vex_insert_extract => |ie| try self.asmRegisterRegisterMemoryImmediate(
                 ie.insert,
                 dst_reg,
@@ -23678,7 +23831,7 @@ const MoveStrategy = union(enum) {
                     .mod = .{ .rm = .{ .size = .xword } },
                 }, src_reg.to128());
                 const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
-                try self.asmRegisterMemory(.{ ._, .mov }, tmp_reg.to16(), .{
+                try self.asmRegisterMemory(.{ ._, .movzx }, tmp_reg.to32(), .{
                     .base = .{ .frame = tmp_frame_index },
                     .mod = .{ .rm = .{ .size = .word } },
                 });
@@ -24122,7 +24275,18 @@ fn genSetReg(
                 ),
                 else => unreachable,
             },
-            .segment, .x87, .mmx, .sse => try self.genSetReg(dst_reg, ty, try self.genTypedValue(try pt.undefValue(ty)), opts),
+            .segment, .x87, .mmx, .sse => {
+                const full_ty = try pt.vectorType(.{
+                    .len = self.vectorSize(.float),
+                    .child = .u8_type,
+                });
+                try self.genSetReg(dst_reg, full_ty, try self.genTypedValue(
+                    .fromInterned(try pt.intern(.{ .aggregate = .{
+                        .ty = full_ty.toIntern(),
+                        .storage = .{ .repeated_elem = (try pt.intValue(.u8, 0xaa)).toIntern() },
+                    } })),
+                ), opts);
+            },
             .ip, .cr, .dr => unreachable,
         },
         .eflags => |cc| try self.asmSetccRegister(cc, dst_reg.to8()),
@@ -24171,13 +24335,19 @@ fn genSetReg(
                     registerAlias(dst_reg, @max(abi_size, 4)),
                     src_reg.to128(),
                 ) else {
+                    const frame_size = std.math.ceilPowerOfTwoAssert(u32, @max(abi_size, 4));
                     const frame_index = try self.allocFrameIndex(.init(.{
-                        .size = 4,
-                        .alignment = .@"4",
+                        .size = frame_size,
+                        .alignment = .fromNonzeroByteUnits(frame_size),
                     }));
-                    try self.asmMemoryRegister(.{ ._ss, .mov }, .{
+                    try self.asmMemoryRegister(switch (frame_size) {
+                        4 => .{ ._ss, .mov },
+                        8 => .{ ._ps, .movl },
+                        16 => .{ ._ps, .mov },
+                        else => unreachable,
+                    }, .{
                         .base = .{ .frame = frame_index },
-                        .mod = .{ .rm = .{ .size = .dword } },
+                        .mod = .{ .rm = .{ .size = .fromSize(frame_size) } },
                     }, src_reg.to128());
                     try self.asmRegisterMemory(.{ ._, .mov }, registerAlias(dst_reg, abi_size), .{
                         .base = .{ .frame = frame_index },
@@ -28572,7 +28742,7 @@ fn resolveCallingConventionValues(
                         arg_mcv_i += 1;
                     },
                     .sse, .float, .float_combine => {
-                        const param_sse_regs = abi.getCAbiSseParamRegs(cc);
+                        const param_sse_regs = abi.getCAbiSseParamRegs(cc, self.target);
                         const abi_size: u32 = @intCast(ty.abiSize(zcu));
                         const reg_size = @min(abi_size, self.vectorSize(.float));
                         var byte_offset: u32 = 0;
@@ -28670,7 +28840,7 @@ fn resolveCallingConventionValues(
 
             var param_gpr = abi.getCAbiIntParamRegs(cc);
             var param_x87 = abi.getCAbiX87ParamRegs(cc);
-            var param_sse = abi.getCAbiSseParamRegs(cc);
+            var param_sse = abi.getCAbiSseParamRegs(cc, self.target);
 
             // Return values
             result.return_value = if (ret_ty.isNoReturn(zcu))
@@ -29156,6 +29326,10 @@ fn intInfo(cg: *CodeGen, ty: Type) ?std.builtin.Type.Int {
     };
 }
 
+fn floatBits(cg: *CodeGen, ty: Type) ?u16 { // Bit width of `ty` when `ty.isRuntimeFloat()` holds; null for every other type.
+    return if (ty.isRuntimeFloat()) ty.floatBits(cg.target.*) else null; // the width is looked up against the codegen target
+}
+
 const Temp = struct {
     index: Air.Inst.Index,
 
@@ -30234,6 +30408,7 @@ const Select = struct {
         backward: ?Mir.Inst.Index,
         forward: [1]?Mir.Inst.Index,
     },
+    top: u3,
 
     fn emitLabel(s: *Select, label_index: Label) void {
         if (label_index == ._) return;
@@ -30269,6 +30444,92 @@ const Select = struct {
             },
             else => |e| return e,
         };
+        switch (mir_tag[0]) {
+            .f_ => switch (mir_tag[1]) {
+                .abs, .st => {},
+                .ld => s.top -%= 1,
+                else => {
+                    const fixes = @tagName(mir_tag[0]);
+                    const fixes_blank = std.mem.indexOfScalar(u8, fixes, '_').?;
+                    std.debug.panic("{s}: {s}{s}{s}\n", .{
+                        @src().fn_name,
+                        fixes[0..fixes_blank],
+                        @tagName(mir_tag[1]),
+                        fixes[fixes_blank + 1 ..],
+                    });
+                },
+            },
+            .f_p => switch (mir_tag[1]) {
+                .st => s.top +%= 1,
+                else => {
+                    const fixes = @tagName(mir_tag[0]);
+                    const fixes_blank = std.mem.indexOfScalar(u8, fixes, '_').?;
+                    std.debug.panic("{s}: {s}{s}{s}\n", .{
+                        @src().fn_name,
+                        fixes[0..fixes_blank],
+                        @tagName(mir_tag[1]),
+                        fixes[fixes_blank + 1 ..],
+                    });
+                },
+            },
+            .f_1,
+            => switch (mir_tag[1]) {
+                .ld => s.top -%= 1,
+                else => {
+                    const fixes = @tagName(mir_tag[0]);
+                    const fixes_blank = std.mem.indexOfScalar(u8, fixes, '_').?;
+                    std.debug.panic("{s}: {s}{s}{s}\n", .{
+                        @src().fn_name,
+                        fixes[0..fixes_blank],
+                        @tagName(mir_tag[1]),
+                        fixes[fixes_blank + 1 ..],
+                    });
+                },
+            },
+            .f_l2e,
+            .f_l2t,
+            .f_lg2,
+            .f_ln2,
+            .f_pi,
+            .f_z,
+            => switch (mir_tag[1]) {
+                .ld => s.top -%= 1,
+                else => unreachable,
+            },
+            .f_b,
+            .f_be,
+            .f_cw,
+            .f_e,
+            .f_env,
+            .f_nb,
+            .f_nbe,
+            .f_ne,
+            .f_nu,
+            .f_p1,
+            .f_pp,
+            .f_sw,
+            .f_u,
+            .fb_,
+            .fb_p,
+            .fi_,
+            .fi_p,
+            .fn_,
+            .fn_cw,
+            .fn_env,
+            .fn_sw,
+            => {},
+            .f_cstp => switch (mir_tag[1]) {
+                .de => s.top -%= 1,
+                .in => s.top +%= 1,
+                else => unreachable,
+            },
+            else => {},
+        }
+    }
+
+    fn lowerReg(s: *const Select, reg: Register) Register { // Returns `reg` unchanged unless its class is `.x87`, in which case the st(i) index is offset by `s.top`.
+        if (reg.class() != .x87) return reg; // every non-x87 register class passes through untouched
+        return @enumFromInt(@intFromEnum(Register.st0) + (@as(u3, @intCast(reg.enc())) -% s.top)); // st0 + ((enc - top) mod 8): the u3 wrapping subtract keeps the index in 0..7. NOTE(review): presumably rebases a register chosen before emitted x87 pushes/pops moved the stack top - confirm against the code that updates `s.top`.
+    }
 
     const Case = struct {
@@ -30278,7 +30539,7 @@ const Select = struct {
         patterns: []const Select.Pattern,
         extra_temps: [@intFromEnum(Select.Operand.Ref.dst0) - @intFromEnum(Select.Operand.Ref.tmp0)]TempSpec = @splat(.unused),
         dst_temps: [@intFromEnum(Select.Operand.Ref.src0) - @intFromEnum(Select.Operand.Ref.dst0)]TempSpec.Kind = @splat(.unused),
-        clobbers: struct { eflags: bool = false } = .{},
+        clobbers: struct { eflags: bool = false, st: u3 = 0 } = .{},
         each: union(enum) {
             once: []const Instruction,
         },
@@ -30305,6 +30566,8 @@ const Select = struct {
         scalar_signed_int: Memory.Size,
         scalar_unsigned_int: Memory.Size,
         scalar_remainder_int: struct { of: Memory.Size, is: Memory.Size },
+        scalar_float: struct { of: Memory.Size, is: Memory.Size },
+        scalar_exact_float: struct { of: Memory.Size, is: Memory.Size },
         multiple_scalar_int: struct { of: Memory.Size, is: Memory.Size },
         exact_int: u16,
         exact_signed_int: u16,
@@ -30338,23 +30601,23 @@ const Select = struct {
                     size.bitSize(cg.target) >= ty.vectorLen(zcu),
                 .vec => |size| ty.isVector(zcu) and ty.scalarType(zcu).toIntern() != .bool_type and
                     size.bitSize(cg.target) >= ty.abiSize(zcu),
-                .signed_int_vec => |size| ty.isVector(zcu) and size.bitSize(cg.target) >= 8 * ty.abiSize(zcu) and
+                .signed_int_vec => |size| ty.isVector(zcu) and @divExact(size.bitSize(cg.target), 8) >= ty.abiSize(zcu) and
                     if (cg.intInfo(ty.childType(zcu))) |int_info| int_info.signedness == .signed else false,
-                .signed_int_or_full_vec => |size| ty.isVector(zcu) and size.bitSize(cg.target) >= 8 * ty.abiSize(zcu) and
+                .signed_int_or_full_vec => |size| ty.isVector(zcu) and @divExact(size.bitSize(cg.target), 8) >= ty.abiSize(zcu) and
                     if (cg.intInfo(ty.childType(zcu))) |int_info| switch (int_info.signedness) {
                     .signed => true,
                     .unsigned => int_info.bits >= 8 and std.math.isPowerOfTwo(int_info.bits),
                 } else false,
-                .unsigned_int_vec => |size| ty.isVector(zcu) and size.bitSize(cg.target) >= 8 * ty.abiSize(zcu) and
+                .unsigned_int_vec => |size| ty.isVector(zcu) and @divExact(size.bitSize(cg.target), 8) >= ty.abiSize(zcu) and
                     if (cg.intInfo(ty.childType(zcu))) |int_info| int_info.signedness == .unsigned else false,
-                .size => |size| size.bitSize(cg.target) >= 8 * ty.abiSize(zcu),
-                .multiple_size => |size| size.bitSize(cg.target) % 8 * ty.abiSize(zcu) == 0,
+                .size => |size| @divExact(size.bitSize(cg.target), 8) >= ty.abiSize(zcu),
+                .multiple_size => |size| ty.abiSize(zcu) % @divExact(size.bitSize(cg.target), 8) == 0,
                 .int => |size| if (cg.intInfo(ty)) |int_info| size.bitSize(cg.target) >= int_info.bits else false,
                 .scalar_int_is => |size| if (cg.intInfo(ty.scalarType(zcu))) |int_info|
                     size.bitSize(cg.target) >= int_info.bits
                 else
                     false,
-                .scalar_int => |of_is| of_is.of.bitSize(cg.target) >= 8 * ty.abiSize(zcu) and
+                .scalar_int => |of_is| @divExact(of_is.of.bitSize(cg.target), 8) >= ty.abiSize(zcu) and
                     if (cg.intInfo(ty.scalarType(zcu))) |int_info| of_is.is.bitSize(cg.target) >= int_info.bits else false,
                 .scalar_signed_int => |size| if (cg.intInfo(ty.scalarType(zcu))) |int_info| switch (int_info.signedness) {
                     .signed => size.bitSize(cg.target) >= int_info.bits,
@@ -30364,12 +30627,16 @@ const Select = struct {
                     .signed => false,
                     .unsigned => size.bitSize(cg.target) >= int_info.bits,
                 } else false,
-                .multiple_scalar_int => |of_is| of_is.of.bitSize(cg.target) % 8 * ty.abiSize(zcu) == 0 and
+                .multiple_scalar_int => |of_is| ty.abiSize(zcu) % @divExact(of_is.of.bitSize(cg.target), 8) == 0 and
                     if (cg.intInfo(ty.scalarType(zcu))) |int_info| of_is.is.bitSize(cg.target) >= int_info.bits else false,
                 .scalar_remainder_int => |of_is| if (cg.intInfo(ty.scalarType(zcu))) |int_info|
                     of_is.is.bitSize(cg.target) >= (int_info.bits - 1) % of_is.of.bitSize(cg.target) + 1
                 else
                     false,
+                .scalar_float => |of_is| @divExact(of_is.of.bitSize(cg.target), 8) >= ty.abiSize(zcu) and
+                    if (cg.floatBits(ty.scalarType(zcu))) |float_bits| of_is.is.bitSize(cg.target) >= float_bits else false,
+                .scalar_exact_float => |of_is| @divExact(of_is.of.bitSize(cg.target), 8) >= ty.abiSize(zcu) and
+                    if (cg.floatBits(ty.scalarType(zcu))) |float_bits| of_is.is.bitSize(cg.target) == float_bits else false,
                 .exact_int => |bit_size| if (cg.intInfo(ty)) |int_info| bit_size == int_info.bits else false,
                 .exact_signed_int => |bit_size| if (cg.intInfo(ty)) |int_info| switch (int_info.signedness) {
                     .signed => bit_size == int_info.bits,
@@ -30452,10 +30719,18 @@ const Select = struct {
             to_gpr,
             mut_gpr,
             to_mut_gpr,
+            x87,
+            to_x87,
+            mut_x87,
+            to_mut_x87,
             mm,
             to_mm,
             mut_mm,
             to_mut_mm,
+            sse,
+            to_sse,
+            mut_sse,
+            to_mut_sse,
             xmm,
             to_xmm,
             mut_xmm,
@@ -30499,6 +30774,17 @@ const Select = struct {
                         else => false,
                     },
                     .to_gpr, .to_mut_gpr => temp.typeOf(cg).abiSize(cg.pt.zcu) <= 8,
+                    .x87 => switch (temp.tracking(cg).short) {
+                        .register => |reg| reg.class() == .x87,
+                        .register_offset => |reg_off| reg_off.reg.class() == .x87 and reg_off.off == 0,
+                        else => false,
+                    },
+                    .mut_x87 => temp.isMut(cg) and switch (temp.tracking(cg).short) {
+                        .register => |reg| reg.class() == .x87,
+                        .register_offset => |reg_off| reg_off.reg.class() == .x87 and reg_off.off == 0,
+                        else => false,
+                    },
+                    .to_x87, .to_mut_x87 => true,
                     .mm => temp.typeOf(cg).abiSize(cg.pt.zcu) == 8 and switch (temp.tracking(cg).short) {
                         .register => |reg| reg.class() == .mmx,
                         .register_offset => |reg_off| reg_off.reg.class() == .mmx and reg_off.off == 0,
@@ -30510,6 +30796,17 @@ const Select = struct {
                         else => false,
                     },
                     .to_mm, .to_mut_mm => temp.typeOf(cg).abiSize(cg.pt.zcu) == 8,
+                    .sse => switch (temp.tracking(cg).short) {
+                        .register => |reg| reg.class() == .sse,
+                        .register_offset => |reg_off| reg_off.reg.class() == .sse and reg_off.off == 0,
+                        else => false,
+                    },
+                    .mut_sse => temp.isMut(cg) and switch (temp.tracking(cg).short) {
+                        .register => |reg| reg.class() == .sse,
+                        .register_offset => |reg_off| reg_off.reg.class() == .sse and reg_off.off == 0,
+                        else => false,
+                    },
+                    .to_sse, .to_mut_sse => true,
                     .xmm => temp.typeOf(cg).abiSize(cg.pt.zcu) == 16 and switch (temp.tracking(cg).short) {
                         .register => |reg| reg.class() == .sse,
                         .register_offset => |reg_off| reg_off.reg.class() == .sse and reg_off.off == 0,
@@ -30542,10 +30839,12 @@ const Select = struct {
                     .mem, .to_mem, .mut_mem, .to_mut_mem => try temp.toBase(cg),
                     .gpr, .to_gpr => try temp.toRegClass(false, .general_purpose, cg),
                     .mut_gpr, .to_mut_gpr => try temp.toRegClass(true, .general_purpose, cg),
+                    .x87, .to_x87 => try temp.toRegClass(false, .x87, cg),
+                    .mut_x87, .to_mut_x87 => try temp.toRegClass(true, .x87, cg),
                     .mm, .to_mm => try temp.toRegClass(false, .mmx, cg),
                     .mut_mm, .to_mut_mm => try temp.toRegClass(true, .mmx, cg),
-                    .xmm, .to_xmm, .ymm, .to_ymm => try temp.toRegClass(false, .sse, cg),
-                    .mut_xmm, .to_mut_xmm, .mut_ymm, .to_mut_ymm => try temp.toRegClass(true, .sse, cg),
+                    .sse, .to_sse, .xmm, .to_xmm, .ymm, .to_ymm => try temp.toRegClass(false, .sse, cg),
+                    .mut_sse, .to_mut_sse, .mut_xmm, .to_mut_xmm, .mut_ymm, .to_mut_ymm => try temp.toRegClass(true, .sse, cg),
                 };
             }
         };
@@ -30565,12 +30864,15 @@ const Select = struct {
             rc: Register.Class,
             rc_mask: struct { rc: Register.Class, info: MaskInfo },
             mem,
-            smin_mem: Select.Operand.Ref,
-            smax_mem: Select.Operand.Ref,
-            umin_mem: Select.Operand.Ref,
-            umax_mem: Select.Operand.Ref,
+            smin_mem: ConstInfo,
+            smax_mem: ConstInfo,
+            umin_mem: ConstInfo,
+            umax_mem: ConstInfo,
             ref: Select.Operand.Ref,
             ref_mask: struct { ref: Select.Operand.Ref, info: MaskInfo },
+            mut_reg: struct { ref: Select.Operand.Ref, rc: Register.Class },
+
+            const ConstInfo = struct { ref: Select.Operand.Ref, vectorize: bool = false };
 
             fn finish(kind: Kind, temp: Temp, s: *const Select) void {
                 switch (kind) {
@@ -30590,13 +30892,13 @@ const Select = struct {
                 .rc => |rc| try cg.tempAllocReg(spec.type, regSetForRegClass(rc)),
                 .rc_mask => |rc_mask| try cg.tempAllocReg(spec.type, regSetForRegClass(rc_mask.rc)),
                 .mem => try cg.tempAllocMem(spec.type),
-                .smin_mem, .smax_mem, .umin_mem, .umax_mem => |ty_ref| {
+                .smin_mem, .smax_mem, .umin_mem, .umax_mem => |const_info| {
                     const pt = cg.pt;
                     const zcu = pt.zcu;
                     const ip = &zcu.intern_pool;
-                    const ty = ty_ref.deref(s).typeOf(s.cg);
-                    const vector_len, const scalar_ty: Type = switch (ip.indexToKey(ty.toIntern())) {
-                        else => .{ null, ty },
+                    const ty = const_info.ref.deref(s).typeOf(s.cg);
+                    const vector_len: ?u32, const scalar_ty: Type = switch (ip.indexToKey(ty.toIntern())) {
+                        else => .{ if (const_info.vectorize) 1 else null, ty },
                         .vector_type => |vector_type| .{ vector_type.len, .fromInterned(vector_type.child) },
                     };
                     const res_scalar_ty, const res_scalar_val: Value = res_scalar: switch (scalar_ty.toIntern()) {
@@ -30609,7 +30911,10 @@ const Select = struct {
                             }),
                         },
                         else => {
-                            const scalar_info = cg.intInfo(scalar_ty).?;
+                            const scalar_info: std.builtin.Type.Int = cg.intInfo(scalar_ty) orelse .{
+                                .signedness = .signed,
+                                .bits = cg.floatBits(scalar_ty).?,
+                            };
                             const scalar_int_ty = try pt.intType(scalar_info.signedness, scalar_info.bits);
                             if (scalar_info.bits <= 64) {
                                 const int_val: i64 = switch (spec.kind) {
@@ -30651,6 +30956,15 @@ const Select = struct {
                 },
                 .ref => |ref| ref.deref(s),
                 .ref_mask => |ref_mask| ref_mask.ref.deref(s),
+                .mut_reg => |ref_rc| {
+                    const temp = ref_rc.ref.deref(s);
+                    if (temp.isMut(cg)) switch (temp.tracking(cg).short) {
+                        .register => |reg| if (reg.class() == ref_rc.rc) return temp,
+                        .register_offset => |reg_off| if (reg_off.off == 0 and reg_off.reg.class() == ref_rc.rc) return temp,
+                        else => {},
+                    };
+                    return try cg.tempAllocReg(spec.type, regSetForRegClass(ref_rc.rc));
+                },
             };
         }
     };
@@ -30759,6 +31073,7 @@ const Select = struct {
                 const tmp0d: Sized = .{ .ref = .tmp0, .size = .dword };
                 const tmp0p: Sized = .{ .ref = .tmp0, .size = .ptr };
                 const tmp0q: Sized = .{ .ref = .tmp0, .size = .qword };
+                const tmp0t: Sized = .{ .ref = .tmp0, .size = .tbyte };
                 const tmp0x: Sized = .{ .ref = .tmp0, .size = .xword };
                 const tmp0y: Sized = .{ .ref = .tmp0, .size = .yword };
 
@@ -30768,6 +31083,7 @@ const Select = struct {
                 const tmp1d: Sized = .{ .ref = .tmp1, .size = .dword };
                 const tmp1p: Sized = .{ .ref = .tmp1, .size = .ptr };
                 const tmp1q: Sized = .{ .ref = .tmp1, .size = .qword };
+                const tmp1t: Sized = .{ .ref = .tmp1, .size = .tbyte };
                 const tmp1x: Sized = .{ .ref = .tmp1, .size = .xword };
                 const tmp1y: Sized = .{ .ref = .tmp1, .size = .yword };
 
@@ -30777,6 +31093,7 @@ const Select = struct {
                 const tmp2d: Sized = .{ .ref = .tmp2, .size = .dword };
                 const tmp2p: Sized = .{ .ref = .tmp2, .size = .ptr };
                 const tmp2q: Sized = .{ .ref = .tmp2, .size = .qword };
+                const tmp2t: Sized = .{ .ref = .tmp2, .size = .tbyte };
                 const tmp2x: Sized = .{ .ref = .tmp2, .size = .xword };
                 const tmp2y: Sized = .{ .ref = .tmp2, .size = .yword };
 
@@ -30786,6 +31103,7 @@ const Select = struct {
                 const tmp3d: Sized = .{ .ref = .tmp3, .size = .dword };
                 const tmp3p: Sized = .{ .ref = .tmp3, .size = .ptr };
                 const tmp3q: Sized = .{ .ref = .tmp3, .size = .qword };
+                const tmp3t: Sized = .{ .ref = .tmp3, .size = .tbyte };
                 const tmp3x: Sized = .{ .ref = .tmp3, .size = .xword };
                 const tmp3y: Sized = .{ .ref = .tmp3, .size = .yword };
 
@@ -30795,6 +31113,7 @@ const Select = struct {
                 const tmp4d: Sized = .{ .ref = .tmp4, .size = .dword };
                 const tmp4p: Sized = .{ .ref = .tmp4, .size = .ptr };
                 const tmp4q: Sized = .{ .ref = .tmp4, .size = .qword };
+                const tmp4t: Sized = .{ .ref = .tmp4, .size = .tbyte };
                 const tmp4x: Sized = .{ .ref = .tmp4, .size = .xword };
                 const tmp4y: Sized = .{ .ref = .tmp4, .size = .yword };
 
@@ -30804,6 +31123,7 @@ const Select = struct {
                 const tmp5d: Sized = .{ .ref = .tmp5, .size = .dword };
                 const tmp5p: Sized = .{ .ref = .tmp5, .size = .ptr };
                 const tmp5q: Sized = .{ .ref = .tmp5, .size = .qword };
+                const tmp5t: Sized = .{ .ref = .tmp5, .size = .tbyte };
                 const tmp5x: Sized = .{ .ref = .tmp5, .size = .xword };
                 const tmp5y: Sized = .{ .ref = .tmp5, .size = .yword };
 
@@ -30813,6 +31133,7 @@ const Select = struct {
                 const dst0d: Sized = .{ .ref = .dst0, .size = .dword };
                 const dst0p: Sized = .{ .ref = .dst0, .size = .ptr };
                 const dst0q: Sized = .{ .ref = .dst0, .size = .qword };
+                const dst0t: Sized = .{ .ref = .dst0, .size = .tbyte };
                 const dst0x: Sized = .{ .ref = .dst0, .size = .xword };
                 const dst0y: Sized = .{ .ref = .dst0, .size = .yword };
 
@@ -30822,6 +31143,7 @@ const Select = struct {
                 const src0d: Sized = .{ .ref = .src0, .size = .dword };
                 const src0p: Sized = .{ .ref = .src0, .size = .ptr };
                 const src0q: Sized = .{ .ref = .src0, .size = .qword };
+                const src0t: Sized = .{ .ref = .src0, .size = .tbyte };
                 const src0x: Sized = .{ .ref = .src0, .size = .xword };
                 const src0y: Sized = .{ .ref = .src0, .size = .yword };
 
@@ -30831,6 +31153,7 @@ const Select = struct {
                 const src1d: Sized = .{ .ref = .src1, .size = .dword };
                 const src1p: Sized = .{ .ref = .src1, .size = .ptr };
                 const src1q: Sized = .{ .ref = .src1, .size = .qword };
+                const src1t: Sized = .{ .ref = .src1, .size = .tbyte };
                 const src1x: Sized = .{ .ref = .src1, .size = .xword };
                 const src1y: Sized = .{ .ref = .src1, .size = .yword };
             };
@@ -30852,6 +31175,7 @@ const Select = struct {
         const tmp0d: Select.Operand = .{ .tag = .ref, .base = .tmp0d };
         const tmp0p: Select.Operand = .{ .tag = .ref, .base = .tmp0p };
         const tmp0q: Select.Operand = .{ .tag = .ref, .base = .tmp0q };
+        const tmp0t: Select.Operand = .{ .tag = .ref, .base = .tmp0t };
         const tmp0x: Select.Operand = .{ .tag = .ref, .base = .tmp0x };
         const tmp0y: Select.Operand = .{ .tag = .ref, .base = .tmp0y };
 
@@ -30860,6 +31184,7 @@ const Select = struct {
         const tmp1d: Select.Operand = .{ .tag = .ref, .base = .tmp1d };
         const tmp1p: Select.Operand = .{ .tag = .ref, .base = .tmp1p };
         const tmp1q: Select.Operand = .{ .tag = .ref, .base = .tmp1q };
+        const tmp1t: Select.Operand = .{ .tag = .ref, .base = .tmp1t };
         const tmp1x: Select.Operand = .{ .tag = .ref, .base = .tmp1x };
         const tmp1y: Select.Operand = .{ .tag = .ref, .base = .tmp1y };
 
@@ -30868,6 +31193,7 @@ const Select = struct {
         const tmp2d: Select.Operand = .{ .tag = .ref, .base = .tmp2d };
         const tmp2p: Select.Operand = .{ .tag = .ref, .base = .tmp2p };
         const tmp2q: Select.Operand = .{ .tag = .ref, .base = .tmp2q };
+        const tmp2t: Select.Operand = .{ .tag = .ref, .base = .tmp2t };
         const tmp2x: Select.Operand = .{ .tag = .ref, .base = .tmp2x };
         const tmp2y: Select.Operand = .{ .tag = .ref, .base = .tmp2y };
 
@@ -30876,6 +31202,7 @@ const Select = struct {
         const tmp3d: Select.Operand = .{ .tag = .ref, .base = .tmp3d };
         const tmp3p: Select.Operand = .{ .tag = .ref, .base = .tmp3p };
         const tmp3q: Select.Operand = .{ .tag = .ref, .base = .tmp3q };
+        const tmp3t: Select.Operand = .{ .tag = .ref, .base = .tmp3t };
         const tmp3x: Select.Operand = .{ .tag = .ref, .base = .tmp3x };
         const tmp3y: Select.Operand = .{ .tag = .ref, .base = .tmp3y };
 
@@ -30884,6 +31211,7 @@ const Select = struct {
         const tmp4d: Select.Operand = .{ .tag = .ref, .base = .tmp4d };
         const tmp4p: Select.Operand = .{ .tag = .ref, .base = .tmp4p };
         const tmp4q: Select.Operand = .{ .tag = .ref, .base = .tmp4q };
+        const tmp4t: Select.Operand = .{ .tag = .ref, .base = .tmp4t };
         const tmp4x: Select.Operand = .{ .tag = .ref, .base = .tmp4x };
         const tmp4y: Select.Operand = .{ .tag = .ref, .base = .tmp4y };
 
@@ -30892,6 +31220,7 @@ const Select = struct {
         const tmp5d: Select.Operand = .{ .tag = .ref, .base = .tmp5d };
         const tmp5p: Select.Operand = .{ .tag = .ref, .base = .tmp5p };
         const tmp5q: Select.Operand = .{ .tag = .ref, .base = .tmp5q };
+        const tmp5t: Select.Operand = .{ .tag = .ref, .base = .tmp5t };
         const tmp5x: Select.Operand = .{ .tag = .ref, .base = .tmp5x };
         const tmp5y: Select.Operand = .{ .tag = .ref, .base = .tmp5y };
 
@@ -30900,6 +31229,7 @@ const Select = struct {
         const dst0d: Select.Operand = .{ .tag = .ref, .base = .dst0d };
         const dst0p: Select.Operand = .{ .tag = .ref, .base = .dst0p };
         const dst0q: Select.Operand = .{ .tag = .ref, .base = .dst0q };
+        const dst0t: Select.Operand = .{ .tag = .ref, .base = .dst0t };
         const dst0x: Select.Operand = .{ .tag = .ref, .base = .dst0x };
         const dst0y: Select.Operand = .{ .tag = .ref, .base = .dst0y };
 
@@ -30908,6 +31238,7 @@ const Select = struct {
         const src0d: Select.Operand = .{ .tag = .ref, .base = .src0d };
         const src0p: Select.Operand = .{ .tag = .ref, .base = .src0p };
         const src0q: Select.Operand = .{ .tag = .ref, .base = .src0q };
+        const src0t: Select.Operand = .{ .tag = .ref, .base = .src0t };
         const src0x: Select.Operand = .{ .tag = .ref, .base = .src0x };
         const src0y: Select.Operand = .{ .tag = .ref, .base = .src0y };
 
@@ -30916,6 +31247,7 @@ const Select = struct {
         const src1d: Select.Operand = .{ .tag = .ref, .base = .src1d };
         const src1p: Select.Operand = .{ .tag = .ref, .base = .src1p };
         const src1q: Select.Operand = .{ .tag = .ref, .base = .src1q };
+        const src1t: Select.Operand = .{ .tag = .ref, .base = .src1t };
         const src1x: Select.Operand = .{ .tag = .ref, .base = .src1x };
         const src1y: Select.Operand = .{ .tag = .ref, .base = .src1y };
 
@@ -31150,7 +31482,7 @@ const Select = struct {
                         else => unreachable,
                     } },
                     else => |mcv| .{ .mem = try mcv.mem(s.cg, .{ .size = op.base.size }) },
-                    .register => |reg| .{ .reg = registerAlias(reg, @intCast(@divExact(op.base.size.bitSize(s.cg.target), 8))) },
+                    .register => |reg| .{ .reg = s.lowerReg(registerAlias(reg, @intCast(@divExact(op.base.size.bitSize(s.cg.target), 8)))) },
                 },
                 .simm => .{ .imm = .s(op.adjustedImm(i32, s)) },
                 .uimm => .{ .imm = .u(@bitCast(op.adjustedImm(i64, s))) },
@@ -31202,11 +31534,18 @@ fn select(
                 .cg = cg,
                 .temps = undefined,
                 .labels = @splat(.{ .forward = @splat(null), .backward = null }),
+                .top = 0,
             };
             const tmp_slots = s.temps[@intFromEnum(Select.Operand.Ref.tmp0)..@intFromEnum(Select.Operand.Ref.dst0)];
             const dst_slots = s.temps[@intFromEnum(Select.Operand.Ref.dst0)..@intFromEnum(Select.Operand.Ref.src0)];
             const src_slots = s.temps[@intFromEnum(Select.Operand.Ref.src0)..@intFromEnum(Select.Operand.Ref.none)];
 
+            for (0..case.clobbers.st -| 1) |i| {
+                const tracked_index: RegisterManager.TrackedIndex = @intCast(RegisterManager.indexOfKnownRegIntoTracked(.st6).? - i);
+                try cg.register_manager.getRegIndex(tracked_index, null);
+                _ = cg.register_manager.lockRegIndexAssumeUnused(tracked_index);
+            }
+
             @memcpy(src_slots[0..src_temps.len], src_temps);
             std.mem.swap(Temp, &src_slots[pattern.commute[0]], &src_slots[pattern.commute[1]]);
             for (tmp_slots, case.extra_temps) |*slot, spec| slot.* = try spec.create(&s) orelse continue;
@@ -31217,7 +31556,7 @@ fn select(
             @memcpy(src_slots[0..src_temps.len], src_temps);
             std.mem.swap(Temp, &src_slots[pattern.commute[0]], &src_slots[pattern.commute[1]]);
 
-            if (case.clobbers.eflags or case.each != .once) try cg.spillEflagsIfOccupied();
+            if (case.clobbers.eflags) try cg.spillEflagsIfOccupied();
 
             for (dst_temps, dst_tys, case.dst_temps[0..dst_temps.len]) |*dst_temp, dst_ty, dst_kind|
                 dst_temp.* = (try Select.TempSpec.create(.{ .type = dst_ty, .kind = dst_kind }, &s)).?;
@@ -31229,7 +31568,11 @@ fn select(
                     s.emitLabel(.@"0:");
                 },
             }
+            assert(s.top == 0);
 
+            for (0..case.clobbers.st -| 1) |i| cg.register_manager.unlockReg(.{
+                .tracked_index = @intCast(RegisterManager.indexOfKnownRegIntoTracked(.st6).? - i),
+            });
             for (dst_temps, case.dst_temps[0..dst_temps.len]) |dst_temp, dst_kind| dst_kind.finish(dst_temp, &s);
             for (case.extra_temps, tmp_slots) |spec, temp| if (spec.kind != .unused) try temp.die(cg);
             return;
src/arch/x86_64/Mir.zig
@@ -330,8 +330,8 @@ pub const Inst = struct {
         f_pi,
         /// Float ___ Pop Pop
         f_pp,
-        /// Float ___ stack-top pointer
-        f_stp,
+        /// Float ___ crement Stack-Top Pointer
+        f_cstp,
         /// Float ___ Status Word
         f_sw,
         /// Float ___ Unordered
@@ -555,6 +555,7 @@ pub const Inst = struct {
         /// Decimal adjust AL after subtraction
         da,
         /// Decrement by 1
+        /// Decrement stack-top pointer
         /// Decrement shadow stack pointer
         de,
         /// Unsigned division
@@ -587,6 +588,7 @@ pub const Inst = struct {
         /// Input from port
         /// Input from port to string
         /// Increment by 1
+        /// Increment stack-top pointer
         /// Increment shadow stack pointer
         in,
         /// Call to interrupt procedure
@@ -792,14 +794,10 @@ pub const Inst = struct {
         comi,
         /// Cosine
         cos,
-        /// Decrement stack-top pointer
-        decstp,
         /// Reverse divide
         divr,
         /// Free floating-point register
         free,
-        /// Increment stack-top pointer
-        incstp,
         /// Initialize floating-point unit
         init,
         /// Load binary coded decimal integer
test/behavior/x86_64/math.zig
@@ -1,22 +1,136 @@
+const builtin = @import("builtin");
+const inf = math.inf;
+const math = std.math;
+const max = math.floatMax;
+const min = math.floatMin;
+const nan = math.nan;
+const std = @import("std");
+const trueMin = math.floatTrueMin;
+
+const Gpr = switch (builtin.cpu.arch) { // integer type selected per target architecture: 32 bits on x86, 64 bits on x86_64
+    else => unreachable, // only the two x86 architectures are handled by this file
+    .x86 => u32,
+    .x86_64 => u64,
+};
+const Sse = if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx)) // byte vector: 32 lanes when AVX is enabled, 16 lanes otherwise
+    @Vector(32, u8)
+else
+    @Vector(16, u8);
+
+inline fn sign(rhs: anytype) bool { // thin wrapper that forwards `rhs` to `math.signbit`
+    return @call(.always_inline, math.signbit, .{rhs}); // `.always_inline` requests that the std helper be inlined at this call site
+}
+inline fn boolAnd(lhs: anytype, rhs: @TypeOf(lhs)) @TypeOf(lhs) { // logical AND for `bool`, and lane-wise AND for vectors of `bool`
+    switch (@typeInfo(@TypeOf(lhs))) {
+        .bool => return lhs and rhs,
+        .vector => |vector| switch (vector.child) {
+            bool => {
+                const Bits = @Vector(vector.len, u1); // view each bool lane as a u1 so the bitwise operator can be applied
+                const lhs_bits: Bits = @bitCast(lhs);
+                const rhs_bits: Bits = @bitCast(rhs);
+                return @bitCast(lhs_bits & rhs_bits);
+            },
+            else => {}, // vector of a non-bool element type: falls through to the compile error below
+        },
+        else => {}, // neither bool nor vector: falls through to the compile error below
+    }
+    @compileError("unsupported boolAnd type: " ++ @typeName(@TypeOf(lhs)));
+}
+inline fn boolOr(lhs: anytype, rhs: @TypeOf(lhs)) @TypeOf(lhs) { // logical OR for `bool`, and lane-wise OR for vectors of `bool`
+    switch (@typeInfo(@TypeOf(lhs))) {
+        .bool => return lhs or rhs,
+        .vector => |vector| switch (vector.child) {
+            bool => {
+                const Bits = @Vector(vector.len, u1); // view each bool lane as a u1 so the bitwise operator can be applied
+                const lhs_bits: Bits = @bitCast(lhs);
+                const rhs_bits: Bits = @bitCast(rhs);
+                return @bitCast(lhs_bits | rhs_bits);
+            },
+            else => {}, // vector of a non-bool element type: falls through to the compile error below
+        },
+        else => {}, // neither bool nor vector: falls through to the compile error below
+    }
+    @compileError("unsupported boolOr type: " ++ @typeName(@TypeOf(lhs)));
+}
+
+// noinline for a more helpful stack trace
+noinline fn checkExpected(expected: anytype, actual: @TypeOf(expected)) !void { // returns error.Unexpected when `actual` does not match `expected`
+    const info = @typeInfo(@TypeOf(expected));
+    const unexpected = switch (switch (info) { // dispatch on the scalar type: a vector's element type, otherwise the type itself
+        else => info,
+        .vector => |vector| @typeInfo(vector.child),
+    }) {
+        else => expected != actual,
+        .float => boolOr(boolAnd(expected != actual, boolOr(expected == expected, actual == actual)), sign(expected) != sign(actual)), // a value mismatch is ignored only when both sides are NaN (`x == x` is false for NaN); differing signs always count as unexpected
+    };
+    if (switch (info) {
+        else => unexpected,
+        .vector => @reduce(.Or, unexpected), // a single mismatching lane fails the whole vector
+    }) return error.Unexpected;
+}
+test checkExpected { // self-test of the comparison helper: NaN and signed-zero handling
+    if (checkExpected(nan(f32), nan(f32)) == error.Unexpected) return error.Unexpected; // two NaNs with the same sign must be accepted
+    if (checkExpected(nan(f32), -nan(f32)) != error.Unexpected) return error.Unexpected; // NaNs with opposite signs must be rejected
+    if (checkExpected(@as(f32, 0.0), @as(f32, 0.0)) == error.Unexpected) return error.Unexpected; // +0.0 matches +0.0
+    if (checkExpected(@as(f32, -0.0), @as(f32, -0.0)) == error.Unexpected) return error.Unexpected; // -0.0 matches -0.0
+    if (checkExpected(@as(f32, -0.0), @as(f32, 0.0)) != error.Unexpected) return error.Unexpected; // zeros of different sign must be rejected
+    if (checkExpected(@as(f32, 0.0), @as(f32, -0.0)) != error.Unexpected) return error.Unexpected; // same check with the operands swapped
+}
+
 fn Unary(comptime op: anytype) type {
     return struct {
-        fn testArgs(comptime Type: type, comptime imm_arg: Type) !void {
-            const expected = op(Type, imm_arg);
-            try struct {
-                fn checkExpected(actual: @TypeOf(expected)) !void {
-                    if (switch (@typeInfo(@TypeOf(expected))) {
-                        else => actual != expected,
-                        .vector => @reduce(.Or, actual != expected),
-                    }) return error.Unexpected;
-                }
-                noinline fn testArgKinds(mem_arg: Type) !void {
-                    var reg_arg = mem_arg;
-                    _ = .{&reg_arg};
-                    try checkExpected(op(Type, reg_arg));
-                    try checkExpected(op(Type, mem_arg));
-                    try checkExpected(op(Type, imm_arg));
-                }
-            }.testArgKinds(imm_arg);
+        // noinline so that `mem_arg` is on the stack
+        noinline fn testArgKinds(
+            _: Gpr, // NOTE(review): the eight unused Gpr and eight unused Sse parameters presumably occupy the register parameter slots so that the later runtime argument is passed in memory - confirm against the calling convention
+            _: Gpr,
+            _: Gpr,
+            _: Gpr,
+            _: Gpr,
+            _: Gpr,
+            _: Gpr,
+            _: Gpr,
+            _: Sse,
+            _: Sse,
+            _: Sse,
+            _: Sse,
+            _: Sse,
+            _: Sse,
+            _: Sse,
+            _: Sse,
+            comptime Type: type, // operand type under test
+            comptime imm_arg: Type, // operand value known at compile time
+            mem_arg: Type, // the same operand value supplied at runtime
+        ) !void {
+            const expected = comptime op(Type, imm_arg); // reference result, evaluated at compile time
+            var reg_arg = mem_arg; // runtime copy of the operand held in a local
+            _ = .{&reg_arg}; // referencing the address avoids the "never mutated" error for this `var`
+            try checkExpected(expected, op(Type, reg_arg)); // operand taken from the local copy
+            try checkExpected(expected, op(Type, mem_arg)); // operand taken from the runtime parameter
+            try checkExpected(expected, op(Type, imm_arg)); // operand taken from the comptime value
+        }
+        // noinline for a more helpful stack trace
+        noinline fn testArgs(comptime Type: type, comptime imm_arg: Type) !void { // runs `testArgKinds` for one operand value
+            try testArgKinds(
+                undefined, // sixteen placeholders for the unused `Gpr` and `Sse` parameters of `testArgKinds`
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                Type,
+                imm_arg, // bound to the comptime `imm_arg` parameter
+                imm_arg, // same value again, bound to the runtime `mem_arg` parameter
+            );
         }
         fn testIntTypes() !void {
             try testArgs(i1, -1);
@@ -381,6 +495,102 @@ fn Unary(comptime op: anytype) type {
             try testArgs(u1025, 1 << 1023);
             try testArgs(u1025, 1 << 1024);
         }
+        fn testFloatTypes() !void { // calls `testArgs` for f16, f32, f64, f80 and f128 with the same 18 inputs per type, ordered from -NaN up to +NaN
+            try testArgs(f16, -nan(f16)); // f16 inputs
+            try testArgs(f16, -inf(f16));
+            try testArgs(f16, -max(f16)); // `max` is this file's alias for `math.floatMax`
+            try testArgs(f16, -10.0);
+            try testArgs(f16, -1.0);
+            try testArgs(f16, -0.1);
+            try testArgs(f16, -min(f16)); // `min` is this file's alias for `math.floatMin`
+            try testArgs(f16, -trueMin(f16)); // `trueMin` is this file's alias for `math.floatTrueMin`
+            try testArgs(f16, -0.0);
+            try testArgs(f16, 0.0);
+            try testArgs(f16, trueMin(f16));
+            try testArgs(f16, min(f16));
+            try testArgs(f16, 0.1);
+            try testArgs(f16, 1.0);
+            try testArgs(f16, 10.0);
+            try testArgs(f16, max(f16));
+            try testArgs(f16, inf(f16));
+            try testArgs(f16, nan(f16));
+
+            try testArgs(f32, -nan(f32)); // f32 inputs
+            try testArgs(f32, -inf(f32));
+            try testArgs(f32, -max(f32));
+            try testArgs(f32, -10.0);
+            try testArgs(f32, -1.0);
+            try testArgs(f32, -0.1);
+            try testArgs(f32, -min(f32));
+            try testArgs(f32, -trueMin(f32));
+            try testArgs(f32, -0.0);
+            try testArgs(f32, 0.0);
+            try testArgs(f32, trueMin(f32));
+            try testArgs(f32, min(f32));
+            try testArgs(f32, 0.1);
+            try testArgs(f32, 1.0);
+            try testArgs(f32, 10.0);
+            try testArgs(f32, max(f32));
+            try testArgs(f32, inf(f32));
+            try testArgs(f32, nan(f32));
+
+            try testArgs(f64, -nan(f64)); // f64 inputs
+            try testArgs(f64, -inf(f64));
+            try testArgs(f64, -max(f64));
+            try testArgs(f64, -10.0);
+            try testArgs(f64, -1.0);
+            try testArgs(f64, -0.1);
+            try testArgs(f64, -min(f64));
+            try testArgs(f64, -trueMin(f64));
+            try testArgs(f64, -0.0);
+            try testArgs(f64, 0.0);
+            try testArgs(f64, trueMin(f64));
+            try testArgs(f64, min(f64));
+            try testArgs(f64, 0.1);
+            try testArgs(f64, 1.0);
+            try testArgs(f64, 10.0);
+            try testArgs(f64, max(f64));
+            try testArgs(f64, inf(f64));
+            try testArgs(f64, nan(f64));
+
+            try testArgs(f80, -nan(f80)); // f80 inputs
+            try testArgs(f80, -inf(f80));
+            try testArgs(f80, -max(f80));
+            try testArgs(f80, -10.0);
+            try testArgs(f80, -1.0);
+            try testArgs(f80, -0.1);
+            try testArgs(f80, -min(f80));
+            try testArgs(f80, -trueMin(f80));
+            try testArgs(f80, -0.0);
+            try testArgs(f80, 0.0);
+            try testArgs(f80, trueMin(f80));
+            try testArgs(f80, min(f80));
+            try testArgs(f80, 0.1);
+            try testArgs(f80, 1.0);
+            try testArgs(f80, 10.0);
+            try testArgs(f80, max(f80));
+            try testArgs(f80, inf(f80));
+            try testArgs(f80, nan(f80));
+
+            try testArgs(f128, -nan(f128)); // f128 inputs
+            try testArgs(f128, -inf(f128));
+            try testArgs(f128, -max(f128));
+            try testArgs(f128, -10.0);
+            try testArgs(f128, -1.0);
+            try testArgs(f128, -0.1);
+            try testArgs(f128, -min(f128));
+            try testArgs(f128, -trueMin(f128));
+            try testArgs(f128, -0.0);
+            try testArgs(f128, 0.0);
+            try testArgs(f128, trueMin(f128));
+            try testArgs(f128, min(f128));
+            try testArgs(f128, 0.1);
+            try testArgs(f128, 1.0);
+            try testArgs(f128, 10.0);
+            try testArgs(f128, max(f128));
+            try testArgs(f128, inf(f128));
+            try testArgs(f128, nan(f128));
+        }
         fn testIntVectorTypes() !void {
             try testArgs(@Vector(3, i1), .{ -1 << 0, -1, 0 });
             try testArgs(@Vector(3, u1), .{ 0, 1, 1 << 0 });
@@ -931,29 +1141,68 @@ fn Unary(comptime op: anytype) type {
 
 fn Binary(comptime op: anytype) type {
     return struct {
-        fn testArgs(comptime Type: type, comptime imm_lhs: Type, comptime imm_rhs: Type) !void {
-            const expected = op(Type, imm_lhs, imm_rhs);
-            try struct {
-                fn checkExpected(actual: @TypeOf(expected)) !void {
-                    if (switch (@typeInfo(@TypeOf(expected))) {
-                        else => actual != expected,
-                        .vector => @reduce(.Or, actual != expected),
-                    }) return error.Unexpected;
-                }
-                noinline fn testArgKinds(mem_lhs: Type, mem_rhs: Type) !void {
-                    var reg_lhs = mem_lhs;
-                    var reg_rhs = mem_rhs;
-                    _ = .{ &reg_lhs, &reg_rhs };
-                    try checkExpected(op(Type, reg_lhs, reg_rhs));
-                    try checkExpected(op(Type, reg_lhs, mem_rhs));
-                    try checkExpected(op(Type, reg_lhs, imm_rhs));
-                    try checkExpected(op(Type, mem_lhs, reg_rhs));
-                    try checkExpected(op(Type, mem_lhs, mem_rhs));
-                    try checkExpected(op(Type, mem_lhs, imm_rhs));
-                    try checkExpected(op(Type, imm_lhs, reg_rhs));
-                    try checkExpected(op(Type, imm_lhs, mem_rhs));
-                }
-            }.testArgKinds(imm_lhs, imm_rhs);
+        // noinline so that `mem_lhs` and `mem_rhs` are on the stack
+        noinline fn testArgKinds(
+            _: Gpr, // NOTE(review): the eight unused Gpr and eight unused Sse parameters presumably occupy the register parameter slots so that the later runtime arguments are passed in memory - confirm against the calling convention
+            _: Gpr,
+            _: Gpr,
+            _: Gpr,
+            _: Gpr,
+            _: Gpr,
+            _: Gpr,
+            _: Gpr,
+            _: Sse,
+            _: Sse,
+            _: Sse,
+            _: Sse,
+            _: Sse,
+            _: Sse,
+            _: Sse,
+            _: Sse,
+            comptime Type: type, // operand type under test
+            comptime imm_lhs: Type, // left operand known at compile time
+            mem_lhs: Type, // the same left operand supplied at runtime
+            comptime imm_rhs: Type, // right operand known at compile time
+            mem_rhs: Type, // the same right operand supplied at runtime
+        ) !void {
+            const expected = comptime op(Type, imm_lhs, imm_rhs); // reference result, evaluated at compile time
+            var reg_lhs = mem_lhs; // runtime copies of both operands held in locals
+            var reg_rhs = mem_rhs;
+            _ = .{ &reg_lhs, &reg_rhs }; // referencing the addresses avoids the "never mutated" error for these `var`s
+            try checkExpected(expected, op(Type, reg_lhs, reg_rhs)); // the eight calls cover every lhs/rhs pairing of local, parameter and comptime operands except comptime/comptime, which is `expected` itself
+            try checkExpected(expected, op(Type, reg_lhs, mem_rhs));
+            try checkExpected(expected, op(Type, reg_lhs, imm_rhs));
+            try checkExpected(expected, op(Type, mem_lhs, reg_rhs));
+            try checkExpected(expected, op(Type, mem_lhs, mem_rhs));
+            try checkExpected(expected, op(Type, mem_lhs, imm_rhs));
+            try checkExpected(expected, op(Type, imm_lhs, reg_rhs));
+            try checkExpected(expected, op(Type, imm_lhs, mem_rhs));
+        }
+        // noinline for a more helpful stack trace
+        noinline fn testArgs(comptime Type: type, comptime imm_lhs: Type, comptime imm_rhs: Type) !void { // runs `testArgKinds` for one pair of operand values
+            try testArgKinds(
+                undefined, // sixteen placeholders for the unused `Gpr` and `Sse` parameters of `testArgKinds`
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                undefined,
+                Type,
+                imm_lhs, // bound to the comptime `imm_lhs` parameter
+                imm_lhs, // same value again, bound to the runtime `mem_lhs` parameter
+                imm_rhs, // bound to the comptime `imm_rhs` parameter
+                imm_rhs, // same value again, bound to the runtime `mem_rhs` parameter
+            );
         }
         fn testIntTypes() !void {
             try testArgs(u8, 0xbb, 0x43);
@@ -1308,6 +1557,7 @@ inline fn abs(comptime Type: type, rhs: Type) @TypeOf(@abs(rhs)) {
 test abs {
     try Unary(abs).testIntTypes();
     try Unary(abs).testIntVectorTypes();
+    try Unary(abs).testFloatTypes(); // scalar float coverage (f16 through f128) added alongside the integer cases
 }
 
 inline fn clz(comptime Type: type, rhs: Type) @TypeOf(@clz(rhs)) {