Commit 2361468e23

Jacob Young <jacobly0@users.noreply.github.com>
2025-03-13 03:09:46
x86_64: rewrite scalar shifts
1 parent aff2be0
Changed files (5)
src
test
behavior
src/arch/x86_64/bits.zig
@@ -384,6 +384,7 @@ pub const Register = enum(u8) {
 
     pub const Class = enum {
         general_purpose,
+        gphi,
         segment,
         x87,
         mmx,
@@ -400,7 +401,7 @@ pub const Register = enum(u8) {
             @intFromEnum(Register.eax)  ... @intFromEnum(Register.r15d)  => .general_purpose,
             @intFromEnum(Register.ax)   ... @intFromEnum(Register.r15w)  => .general_purpose,
             @intFromEnum(Register.al)   ... @intFromEnum(Register.r15b)  => .general_purpose,
-            @intFromEnum(Register.ah)   ... @intFromEnum(Register.bh)    => .general_purpose,
+            @intFromEnum(Register.ah)   ... @intFromEnum(Register.bh)    => .gphi,
 
             @intFromEnum(Register.ymm0) ... @intFromEnum(Register.ymm15) => .sse,
             @intFromEnum(Register.xmm0) ... @intFromEnum(Register.xmm15) => .sse,
@@ -525,7 +526,6 @@ pub const Register = enum(u8) {
     }
 
     fn gpBase(reg: Register) u7 {
-        assert(reg.class() == .general_purpose);
         return switch (@intFromEnum(reg)) {
             // zig fmt: off
             @intFromEnum(Register.rax)  ... @intFromEnum(Register.r15)   => @intFromEnum(Register.rax),
@@ -577,7 +577,7 @@ pub const Register = enum(u8) {
     /// DWARF register encoding
     pub fn dwarfNum(reg: Register) u6 {
         return switch (reg.class()) {
-            .general_purpose => if (reg.isExtended())
+            .general_purpose, .gphi => if (reg.isExtended())
                 reg.enc()
             else
                 @as(u3, @truncate(@as(u24, 0o54673120) >> @as(u5, reg.enc()) * 3)),
src/arch/x86_64/CodeGen.zig
@@ -2418,7 +2418,7 @@ fn genBodyBlock(self: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
 }
 
 fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
-    @setEvalBranchQuota(13_800);
+    @setEvalBranchQuota(13_900);
     const pt = cg.pt;
     const zcu = pt.zcu;
     const ip = &zcu.intern_pool;
@@ -2454,9 +2454,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
         try cg.inst_tracking.ensureUnusedCapacity(cg.gpa, 1);
         switch (air_tags[@intFromEnum(inst)]) {
             // zig fmt: off
-            .shr, .shr_exact => try cg.airShlShrBinOp(inst),
-            .shl, .shl_exact => try cg.airShlShrBinOp(inst),
-
             .add_sat         => try cg.airAddSat(inst),
             .sub_sat         => try cg.airSubSat(inst),
             .mul_sat         => try cg.airMulSat(inst),
@@ -28416,6 +28413,947 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                 };
                 try res[0].finish(inst, &.{ bin_op.lhs, bin_op.rhs }, &ops, cg);
             },
+            .shr, .shr_exact => |air_tag| if (use_old) try cg.airShlShrBinOp(inst) else fallback: {
+                const bin_op = air_datas[@intFromEnum(inst)].bin_op;
+                if (cg.typeOf(bin_op.lhs).isVector(zcu)) break :fallback try cg.airShlShrBinOp(inst);
+                var ops = try cg.tempsFromOperands(inst, .{ bin_op.lhs, bin_op.rhs });
+                var res: [1]Temp = undefined;
+                cg.select(&res, &.{cg.typeOf(bin_op.lhs)}, &ops, comptime &.{ .{
+                    .src_constraints = .{ .{ .signed_int = .byte }, .{ .unsigned_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .imm8, .none } },
+                        .{ .src = .{ .to_mut_gpr, .imm8, .none } },
+                        .{ .src = .{ .mut_mem, .{ .to_reg = .cl }, .none } },
+                        .{ .src = .{ .to_mut_gpr, .{ .to_reg = .cl }, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .sa, .dst0b, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .unsigned_int = .byte }, .{ .unsigned_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .imm8, .none } },
+                        .{ .src = .{ .to_mut_gpr, .imm8, .none } },
+                        .{ .src = .{ .mut_mem, .{ .to_reg = .cl }, .none } },
+                        .{ .src = .{ .to_mut_gpr, .{ .to_reg = .cl }, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .sh, .dst0b, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .signed_int = .word }, .{ .exact_unsigned_int = 4 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .imm8, .none } },
+                        .{ .src = .{ .to_mut_gpr, .imm8, .none } },
+                        .{ .src = .{ .mut_mem, .{ .to_reg = .cl }, .none } },
+                        .{ .src = .{ .to_mut_gpr, .{ .to_reg = .cl }, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .sa, .dst0w, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .unsigned_int = .word }, .{ .exact_unsigned_int = 4 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .imm8, .none } },
+                        .{ .src = .{ .to_mut_gpr, .imm8, .none } },
+                        .{ .src = .{ .mut_mem, .{ .to_reg = .cl }, .none } },
+                        .{ .src = .{ .to_mut_gpr, .{ .to_reg = .cl }, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .sh, .dst0w, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .signed_int = .dword }, .{ .exact_unsigned_int = 5 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .imm8, .none } },
+                        .{ .src = .{ .to_mut_gpr, .imm8, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .sa, .dst0d, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .unsigned_int = .dword }, .{ .exact_unsigned_int = 5 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .imm8, .none } },
+                        .{ .src = .{ .to_mut_gpr, .imm8, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .sh, .dst0d, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .bmi2, null, null, null },
+                    .src_constraints = .{ .{ .signed_int = .dword }, .{ .exact_unsigned_int = 5 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .to_gpr, .none } },
+                        .{ .src = .{ .to_gpr, .to_gpr, .none } },
+                    },
+                    .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .general_purpose } }, .unused },
+                    .each = .{ .once = &.{
+                        .{ ._, ._rx, .sa, .dst0d, .src0d, .src1d, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .bmi2, null, null, null },
+                    .src_constraints = .{ .{ .unsigned_int = .dword }, .{ .exact_unsigned_int = 5 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .to_gpr, .none } },
+                        .{ .src = .{ .to_gpr, .to_gpr, .none } },
+                    },
+                    .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .general_purpose } }, .unused },
+                    .each = .{ .once = &.{
+                        .{ ._, ._rx, .sh, .dst0d, .src0d, .src1d, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .signed_int = .dword }, .{ .exact_unsigned_int = 5 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .{ .to_reg = .cl }, .none } },
+                        .{ .src = .{ .to_mut_gpr, .{ .to_reg = .cl }, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .sa, .dst0d, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .unsigned_int = .dword }, .{ .exact_unsigned_int = 5 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .{ .to_reg = .cl }, .none } },
+                        .{ .src = .{ .to_mut_gpr, .{ .to_reg = .cl }, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .sh, .dst0d, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .signed_int = .qword }, .{ .exact_unsigned_int = 6 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .imm8, .none } },
+                        .{ .src = .{ .to_mut_gpr, .imm8, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .sa, .dst0q, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .unsigned_int = .qword }, .{ .exact_unsigned_int = 6 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .imm8, .none } },
+                        .{ .src = .{ .to_mut_gpr, .imm8, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .sh, .dst0q, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .bmi2, null, null },
+                    .src_constraints = .{ .{ .signed_int = .qword }, .{ .exact_unsigned_int = 6 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .to_gpr, .none } },
+                        .{ .src = .{ .to_gpr, .to_gpr, .none } },
+                    },
+                    .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .general_purpose } }, .unused },
+                    .each = .{ .once = &.{
+                        .{ ._, ._rx, .sa, .dst0q, .src0q, .src1q, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .bmi2, null, null },
+                    .src_constraints = .{ .{ .unsigned_int = .qword }, .{ .exact_unsigned_int = 6 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .to_gpr, .none } },
+                        .{ .src = .{ .to_gpr, .to_gpr, .none } },
+                    },
+                    .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .general_purpose } }, .unused },
+                    .each = .{ .once = &.{
+                        .{ ._, ._rx, .sh, .dst0q, .src0q, .src1q, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .signed_int = .qword }, .{ .exact_unsigned_int = 6 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .{ .to_reg = .cl }, .none } },
+                        .{ .src = .{ .to_mut_gpr, .{ .to_reg = .cl }, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .sa, .dst0q, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .unsigned_int = .qword }, .{ .exact_unsigned_int = 6 }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .{ .to_reg = .cl }, .none } },
+                        .{ .src = .{ .to_mut_gpr, .{ .to_reg = .cl }, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .sh, .dst0q, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .slow_incdec, null, null },
+                    .src_constraints = .{
+                        .{ .remainder_signed_int = .{ .of = .qword, .is = .qword } },
+                        .{ .unsigned_int = .byte },
+                        .any,
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .{ .to_reg = .cl }, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{ .mem, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .src1b, ._, ._ },
+                        .{ ._, ._r, .sh, .tmp0d, .ui(6), ._, ._ },
+                        .{ ._, ._, .lea, .tmp1p, .leasia(.none, .@"8", .tmp0, .sub_src0_size), ._, ._ },
+                        .{ ._, ._, .not, .tmp0p, ._, ._, ._ },
+                        .{ ._, ._, .lea, .tmp2p, .memsia(.dst0, .@"8", .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .memia(.src0q, .tmp1, .add_size), ._, ._ },
+                        .{ ._, ._mp, .j, .@"1f", ._, ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp4q, .memia(.src0q, .tmp1, .add_size), ._, ._ },
+                        .{ ._, ._rd, .sh, .tmp3q, .tmp4q, .src1b, ._ },
+                        .{ ._, ._, .mov, .leai(.tmp2q, .tmp1), .tmp3q, ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .tmp4q, ._, ._ },
+                        .{ .@"1:", ._, .add, .tmp1p, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._r, .sa, .tmp3q, .src1b, ._, ._ },
+                        .{ .@"0:", ._, .mov, .memsia(.dst0q, .@"8", .tmp0, .add_size), .tmp3q, ._, ._ },
+                        .{ ._, ._r, .sa, .tmp3q, .ui(63), ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{
+                        .{ .remainder_signed_int = .{ .of = .qword, .is = .qword } },
+                        .{ .unsigned_int = .byte },
+                        .any,
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .{ .to_reg = .cl }, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{ .mem, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .src1b, ._, ._ },
+                        .{ ._, ._r, .sh, .tmp0d, .ui(6), ._, ._ },
+                        .{ ._, ._, .lea, .tmp1p, .leasia(.none, .@"8", .tmp0, .sub_src0_size), ._, ._ },
+                        .{ ._, ._, .not, .tmp0p, ._, ._, ._ },
+                        .{ ._, ._, .lea, .tmp2p, .memsia(.dst0, .@"8", .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .memia(.src0q, .tmp1, .add_size), ._, ._ },
+                        .{ ._, ._mp, .j, .@"1f", ._, ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp4q, .memia(.src0q, .tmp1, .add_size), ._, ._ },
+                        .{ ._, ._rd, .sh, .tmp3q, .tmp4q, .src1b, ._ },
+                        .{ ._, ._, .mov, .leai(.tmp2q, .tmp1), .tmp3q, ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .tmp4q, ._, ._ },
+                        .{ .@"1:", ._, .add, .tmp1p, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._r, .sa, .tmp3q, .src1b, ._, ._ },
+                        .{ .@"0:", ._, .mov, .memsia(.dst0q, .@"8", .tmp0, .add_size), .tmp3q, ._, ._ },
+                        .{ ._, ._r, .sa, .tmp3q, .ui(63), ._, ._ },
+                        .{ ._, ._c, .in, .tmp0p, ._, ._, ._ },
+                        .{ ._, ._nz, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .slow_incdec, null, null },
+                    .src_constraints = .{
+                        .{ .remainder_unsigned_int = .{ .of = .qword, .is = .qword } },
+                        .{ .unsigned_int = .byte },
+                        .any,
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .{ .to_reg = .cl }, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{ .mem, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .src1b, ._, ._ },
+                        .{ ._, ._r, .sh, .tmp0d, .ui(6), ._, ._ },
+                        .{ ._, ._, .lea, .tmp1p, .leasia(.none, .@"8", .tmp0, .sub_src0_size), ._, ._ },
+                        .{ ._, ._, .not, .tmp0p, ._, ._, ._ },
+                        .{ ._, ._, .lea, .tmp2p, .memsia(.dst0, .@"8", .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .memia(.src0q, .tmp1, .add_size), ._, ._ },
+                        .{ ._, ._mp, .j, .@"1f", ._, ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp4q, .memia(.src0q, .tmp1, .add_size), ._, ._ },
+                        .{ ._, ._rd, .sh, .tmp3q, .tmp4q, .src1b, ._ },
+                        .{ ._, ._, .mov, .leai(.tmp2q, .tmp1), .tmp3q, ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .tmp4q, ._, ._ },
+                        .{ .@"1:", ._, .add, .tmp1p, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._r, .sh, .tmp3q, .src1b, ._, ._ },
+                        .{ .@"0:", ._, .mov, .memsia(.dst0q, .@"8", .tmp0, .add_size), .tmp3q, ._, ._ },
+                        .{ ._, ._, .xor, .tmp3d, .tmp3d, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{
+                        .{ .remainder_unsigned_int = .{ .of = .qword, .is = .qword } },
+                        .{ .unsigned_int = .byte },
+                        .any,
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .{ .to_reg = .cl }, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{ .mem, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .src1b, ._, ._ },
+                        .{ ._, ._r, .sh, .tmp0d, .ui(6), ._, ._ },
+                        .{ ._, ._, .lea, .tmp1p, .leasia(.none, .@"8", .tmp0, .sub_src0_size), ._, ._ },
+                        .{ ._, ._, .not, .tmp0p, ._, ._, ._ },
+                        .{ ._, ._, .lea, .tmp2p, .memsia(.dst0, .@"8", .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .memia(.src0q, .tmp1, .add_size), ._, ._ },
+                        .{ ._, ._mp, .j, .@"1f", ._, ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp4q, .memia(.src0q, .tmp1, .add_size), ._, ._ },
+                        .{ ._, ._rd, .sh, .tmp3q, .tmp4q, .src1b, ._ },
+                        .{ ._, ._, .mov, .leai(.tmp2q, .tmp1), .tmp3q, ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .tmp4q, ._, ._ },
+                        .{ .@"1:", ._, .add, .tmp1p, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._r, .sh, .tmp3q, .src1b, ._, ._ },
+                        .{ .@"0:", ._, .mov, .memsia(.dst0q, .@"8", .tmp0, .add_size), .tmp3q, ._, ._ },
+                        .{ ._, ._, .xor, .tmp3d, .tmp3d, ._, ._ },
+                        .{ ._, ._c, .in, .tmp0p, ._, ._, ._ },
+                        .{ ._, ._nz, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .slow_incdec, null, null },
+                    .src_constraints = .{
+                        .{ .remainder_signed_int = .{ .of = .qword, .is = .qword } },
+                        .{ .unsigned_int = .word },
+                        .any,
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .{ .to_reg = .cx }, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{ .mem, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .src1w, ._, ._ },
+                        .{ ._, ._r, .sh, .tmp0d, .ui(6), ._, ._ },
+                        .{ ._, ._, .lea, .tmp1p, .leasia(.none, .@"8", .tmp0, .sub_src0_size), ._, ._ },
+                        .{ ._, ._, .not, .tmp0p, ._, ._, ._ },
+                        .{ ._, ._, .lea, .tmp2p, .memsia(.dst0, .@"8", .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .memia(.src0q, .tmp1, .add_size), ._, ._ },
+                        .{ ._, ._mp, .j, .@"1f", ._, ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp4q, .memia(.src0q, .tmp1, .add_size), ._, ._ },
+                        .{ ._, ._rd, .sh, .tmp3q, .tmp4q, .src1b, ._ },
+                        .{ ._, ._, .mov, .leai(.tmp2q, .tmp1), .tmp3q, ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .tmp4q, ._, ._ },
+                        .{ .@"1:", ._, .add, .tmp1p, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._r, .sa, .tmp3q, .src1b, ._, ._ },
+                        .{ .@"0:", ._, .mov, .memsia(.dst0q, .@"8", .tmp0, .add_size), .tmp3q, ._, ._ },
+                        .{ ._, ._r, .sa, .tmp3q, .ui(63), ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{
+                        .{ .remainder_signed_int = .{ .of = .qword, .is = .qword } },
+                        .{ .unsigned_int = .word },
+                        .any,
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .{ .to_reg = .cx }, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{ .mem, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .src1w, ._, ._ },
+                        .{ ._, ._r, .sh, .tmp0d, .ui(6), ._, ._ },
+                        .{ ._, ._, .lea, .tmp1p, .leasia(.none, .@"8", .tmp0, .sub_src0_size), ._, ._ },
+                        .{ ._, ._, .not, .tmp0p, ._, ._, ._ },
+                        .{ ._, ._, .lea, .tmp2p, .memsia(.dst0, .@"8", .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .memia(.src0q, .tmp1, .add_size), ._, ._ },
+                        .{ ._, ._mp, .j, .@"1f", ._, ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp4q, .memia(.src0q, .tmp1, .add_size), ._, ._ },
+                        .{ ._, ._rd, .sh, .tmp3q, .tmp4q, .src1b, ._ },
+                        .{ ._, ._, .mov, .leai(.tmp2q, .tmp1), .tmp3q, ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .tmp4q, ._, ._ },
+                        .{ .@"1:", ._, .add, .tmp1p, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._r, .sa, .tmp3q, .src1b, ._, ._ },
+                        .{ .@"0:", ._, .mov, .memsia(.dst0q, .@"8", .tmp0, .add_size), .tmp3q, ._, ._ },
+                        .{ ._, ._r, .sa, .tmp3q, .ui(63), ._, ._ },
+                        .{ ._, ._c, .in, .tmp0p, ._, ._, ._ },
+                        .{ ._, ._nz, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .slow_incdec, null, null },
+                    .src_constraints = .{
+                        .{ .remainder_unsigned_int = .{ .of = .qword, .is = .qword } },
+                        .{ .unsigned_int = .word },
+                        .any,
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .{ .to_reg = .cx }, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{ .mem, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .src1w, ._, ._ },
+                        .{ ._, ._r, .sh, .tmp0d, .ui(6), ._, ._ },
+                        .{ ._, ._, .lea, .tmp1p, .leasia(.none, .@"8", .tmp0, .sub_src0_size), ._, ._ },
+                        .{ ._, ._, .not, .tmp0p, ._, ._, ._ },
+                        .{ ._, ._, .lea, .tmp2p, .memsia(.dst0, .@"8", .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .memia(.src0q, .tmp1, .add_size), ._, ._ },
+                        .{ ._, ._mp, .j, .@"1f", ._, ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp4q, .memia(.src0q, .tmp1, .add_size), ._, ._ },
+                        .{ ._, ._rd, .sh, .tmp3q, .tmp4q, .src1b, ._ },
+                        .{ ._, ._, .mov, .leai(.tmp2q, .tmp1), .tmp3q, ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .tmp4q, ._, ._ },
+                        .{ .@"1:", ._, .add, .tmp1p, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._r, .sh, .tmp3q, .src1b, ._, ._ },
+                        .{ .@"0:", ._, .mov, .memsia(.dst0q, .@"8", .tmp0, .add_size), .tmp3q, ._, ._ },
+                        .{ ._, ._, .xor, .tmp3d, .tmp3d, ._, ._ },
+                        .{ ._, ._, .add, .tmp0p, .si(1), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{
+                        .{ .remainder_unsigned_int = .{ .of = .qword, .is = .qword } },
+                        .{ .unsigned_int = .word },
+                        .any,
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .{ .to_reg = .cx }, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{ .mem, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .src1w, ._, ._ },
+                        .{ ._, ._r, .sh, .tmp0d, .ui(6), ._, ._ },
+                        .{ ._, ._, .lea, .tmp1p, .leasia(.none, .@"8", .tmp0, .sub_src0_size), ._, ._ },
+                        .{ ._, ._, .not, .tmp0p, ._, ._, ._ },
+                        .{ ._, ._, .lea, .tmp2p, .memsia(.dst0, .@"8", .tmp0, .add_size), ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .memia(.src0q, .tmp1, .add_size), ._, ._ },
+                        .{ ._, ._mp, .j, .@"1f", ._, ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp4q, .memia(.src0q, .tmp1, .add_size), ._, ._ },
+                        .{ ._, ._rd, .sh, .tmp3q, .tmp4q, .src1b, ._ },
+                        .{ ._, ._, .mov, .leai(.tmp2q, .tmp1), .tmp3q, ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .tmp4q, ._, ._ },
+                        .{ .@"1:", ._, .add, .tmp1p, .si(8), ._, ._ },
+                        .{ ._, ._nc, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._r, .sh, .tmp3q, .src1b, ._, ._ },
+                        .{ .@"0:", ._, .mov, .memsia(.dst0q, .@"8", .tmp0, .add_size), .tmp3q, ._, ._ },
+                        .{ ._, ._, .xor, .tmp3d, .tmp3d, ._, ._ },
+                        .{ ._, ._c, .in, .tmp0p, ._, ._, ._ },
+                        .{ ._, ._nz, .j, .@"0b", ._, ._, ._ },
+                    } },
+                } }) catch |err| switch (err) {
+                    error.SelectFailed => return cg.fail("failed to select {s} {} {} {} {}", .{
+                        @tagName(air_tag),
+                        cg.typeOf(bin_op.lhs).fmt(pt),
+                        cg.typeOf(bin_op.rhs).fmt(pt),
+                        ops[0].tracking(cg),
+                        ops[1].tracking(cg),
+                    }),
+                    else => |e| return e,
+                };
+                try res[0].finish(inst, &.{ bin_op.lhs, bin_op.rhs }, &ops, cg);
+            },
+            .shl, .shl_exact => |air_tag| if (use_old) try cg.airShlShrBinOp(inst) else fallback: { // left shifts: legacy lowering when `use_old`, otherwise the selection table below
+                const bin_op = air_datas[@intFromEnum(inst)].bin_op;
+                if (cg.typeOf(bin_op.lhs).isVector(zcu)) break :fallback try cg.airShlShrBinOp(inst); // vector lhs is not covered by the table; reuse the legacy lowering
+                var ops = try cg.tempsFromOperands(inst, .{ bin_op.lhs, bin_op.rhs }); // ops[0] = value being shifted, ops[1] = shift count
+                var res: [1]Temp = undefined;
+                cg.select(&res, &.{cg.typeOf(bin_op.lhs)}, &ops, comptime &.{ .{ // result has the lhs type; each entry is one candidate lowering
+                    .src_constraints = .{ .{ .signed_int = .byte }, .{ .unsigned_int = .byte }, .any }, // signed byte lhs
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .imm8, .none } }, // count is either an 8-bit immediate ...
+                        .{ .src = .{ .to_mut_gpr, .imm8, .none } },
+                        .{ .src = .{ .mut_mem, .{ .to_reg = .cl }, .none } }, // ... or is placed in cl
+                        .{ .src = .{ .to_mut_gpr, .{ .to_reg = .cl }, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused }, // shifted in place: the result reuses src0
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._l, .sa, .dst0b, .src1b, ._, ._ }, // NOTE(review): `_l` + `sa` presumably spells `sal` -- confirm against the Mir mnemonic fixes
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .unsigned_int = .byte }, .{ .unsigned_int = .byte }, .any }, // unsigned byte lhs: same shape, `sh` instead of `sa`
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .imm8, .none } },
+                        .{ .src = .{ .to_mut_gpr, .imm8, .none } },
+                        .{ .src = .{ .mut_mem, .{ .to_reg = .cl }, .none } },
+                        .{ .src = .{ .to_mut_gpr, .{ .to_reg = .cl }, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._l, .sh, .dst0b, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .signed_int = .word }, .{ .exact_unsigned_int = 4 }, .any }, // signed word lhs; count type presumably exactly 4 bits wide -- confirm
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .imm8, .none } },
+                        .{ .src = .{ .to_mut_gpr, .imm8, .none } },
+                        .{ .src = .{ .mut_mem, .{ .to_reg = .cl }, .none } },
+                        .{ .src = .{ .to_mut_gpr, .{ .to_reg = .cl }, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._l, .sa, .dst0w, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .unsigned_int = .word }, .{ .exact_unsigned_int = 4 }, .any }, // unsigned word lhs
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .imm8, .none } },
+                        .{ .src = .{ .to_mut_gpr, .imm8, .none } },
+                        .{ .src = .{ .mut_mem, .{ .to_reg = .cl }, .none } },
+                        .{ .src = .{ .to_mut_gpr, .{ .to_reg = .cl }, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._l, .sh, .dst0w, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .signed_int = .dword }, .{ .exact_unsigned_int = 5 }, .any }, // signed dword lhs, immediate count only; the cl form is listed after the bmi2 entry
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .imm8, .none } },
+                        .{ .src = .{ .to_mut_gpr, .imm8, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._l, .sa, .dst0d, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .unsigned_int = .dword }, .{ .exact_unsigned_int = 5 }, .any }, // unsigned dword lhs, immediate count only
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .imm8, .none } },
+                        .{ .src = .{ .to_mut_gpr, .imm8, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._l, .sh, .dst0d, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .bmi2, null, null, null }, // only selectable when the bmi2 feature is available
+                    .src_constraints = .{ .{ .int = .dword }, .{ .exact_unsigned_int = 5 }, .any }, // either signedness of dword lhs
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .to_gpr, .none } }, // lhs need not be mutable and the count may sit in any gpr
+                        .{ .src = .{ .to_gpr, .to_gpr, .none } },
+                    },
+                    .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .general_purpose } }, .unused }, // no .clobbers entry here, unlike the neighbouring cases
+                    .each = .{ .once = &.{
+                        .{ ._, ._lx, .sh, .dst0d, .src0d, .src1d, ._ }, // three-operand form: dst, value, count (presumably `shlx`)
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .signed_int = .dword }, .{ .exact_unsigned_int = 5 }, .any }, // signed dword lhs, count in cl
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .{ .to_reg = .cl }, .none } },
+                        .{ .src = .{ .to_mut_gpr, .{ .to_reg = .cl }, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._l, .sa, .dst0d, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .unsigned_int = .dword }, .{ .exact_unsigned_int = 5 }, .any }, // unsigned dword lhs, count in cl
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .{ .to_reg = .cl }, .none } },
+                        .{ .src = .{ .to_mut_gpr, .{ .to_reg = .cl }, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._l, .sh, .dst0d, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null }, // the qword cases mirror the dword ones but require 64bit
+                    .src_constraints = .{ .{ .signed_int = .qword }, .{ .exact_unsigned_int = 6 }, .any }, // signed qword lhs, immediate count
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .imm8, .none } },
+                        .{ .src = .{ .to_mut_gpr, .imm8, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._l, .sa, .dst0q, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .unsigned_int = .qword }, .{ .exact_unsigned_int = 6 }, .any }, // unsigned qword lhs, immediate count
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .imm8, .none } },
+                        .{ .src = .{ .to_mut_gpr, .imm8, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._l, .sh, .dst0q, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .bmi2, null, null },
+                    .src_constraints = .{ .{ .int = .qword }, .{ .exact_unsigned_int = 6 }, .any }, // qword counterpart of the bmi2 dword entry
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .to_gpr, .none } },
+                        .{ .src = .{ .to_gpr, .to_gpr, .none } },
+                    },
+                    .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .general_purpose } }, .unused },
+                    .each = .{ .once = &.{
+                        .{ ._, ._lx, .sh, .dst0q, .src0q, .src1q, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .signed_int = .qword }, .{ .exact_unsigned_int = 6 }, .any }, // signed qword lhs, count in cl
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .{ .to_reg = .cl }, .none } },
+                        .{ .src = .{ .to_mut_gpr, .{ .to_reg = .cl }, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._l, .sa, .dst0q, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .unsigned_int = .qword }, .{ .exact_unsigned_int = 6 }, .any }, // unsigned qword lhs, count in cl
+                    .patterns = &.{
+                        .{ .src = .{ .mut_mem, .{ .to_reg = .cl }, .none } },
+                        .{ .src = .{ .to_mut_gpr, .{ .to_reg = .cl }, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._l, .sh, .dst0q, .src1b, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .slow_incdec, null, null }, // limb-by-limb shift; this variant counts with `sub ..., 1` for .slow_incdec targets
+                    .src_constraints = .{
+                        .{ .remainder_int = .{ .of = .qword, .is = .qword } }, // NOTE(review): presumably an integer whose size is a whole number of qwords -- confirm
+                        .{ .unsigned_int = .byte },
+                        .any,
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .{ .to_reg = .cl }, .none } }, // lhs in memory, count in cl
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp0: count, then count >> 6
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, // tmp1: source limb index, counted down
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, // tmp2: destination base address
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp3: limb currently being shifted
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, // tmp4: next lower source limb
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{ .mem, .unused }, // result is built in a separate memory destination
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .src1b, ._, ._ }, // tmp0 = zero-extended shift count
+                        .{ ._, ._, .mov, .tmp1d, .sia(-1, .src0, .add_size_div_8), ._, ._ }, // NOTE(review): presumably (size of src0 / 8) - 1, the top limb index -- confirm `sia` semantics
+                        .{ ._, ._r, .sh, .tmp0d, .ui(6), ._, ._ }, // tmp0 = count >> 6, i.e. count / 64
+                        .{ ._, ._, .sub, .tmp1d, .tmp0d, ._, ._ }, // tmp1 -= tmp0
+                        .{ ._, ._, .lea, .tmp2p, .memsid(.dst0, .@"8", .tmp0, 8), ._, ._ }, // presumably tmp2 = &dst0 + 8 * tmp0 + 8 -- confirm `memsid` operand order
+                        .{ ._, ._, .mov, .tmp3q, .memsi(.src0q, .@"8", .tmp1), ._, ._ }, // load the first source limb, indexed by tmp1
+                        .{ ._, ._mp, .j, .@"1f", ._, ._, ._ }, // enter the loop at its decrement/test
+                        .{ .@"0:", ._, .mov, .tmp4q, .memsi(.src0q, .@"8", .tmp1), ._, ._ }, // tmp4 = next lower source limb
+                        .{ ._, ._ld, .sh, .tmp3q, .tmp4q, .src1b, ._ }, // double-width shift of tmp3 with bits taken from tmp4 (presumably `shld`)
+                        .{ ._, ._, .mov, .leasi(.tmp2q, .@"8", .tmp1), .tmp3q, ._, ._ }, // store the combined limb relative to tmp2
+                        .{ ._, ._, .mov, .tmp3q, .tmp4q, ._, ._ }, // the lower limb becomes the current one
+                        .{ .@"1:", ._, .sub, .tmp1d, .si(1), ._, ._ },
+                        .{ ._, ._ae, .j, .@"0b", ._, ._, ._ }, // keep looping until the subtraction borrows
+                        .{ ._, ._l, .sh, .tmp3q, .src1b, ._, ._ }, // the last loaded limb has nothing below it: plain shift
+                        .{ .@"0:", ._, .mov, .memsi(.dst0q, .@"8", .tmp0), .tmp3q, ._, ._ }, // store at dst0 indexed by tmp0
+                        .{ ._, ._, .xor, .tmp3d, .tmp3d, ._, ._ }, // every further store writes zero
+                        .{ ._, ._, .sub, .tmp0d, .si(1), ._, ._ },
+                        .{ ._, ._ae, .j, .@"0b", ._, ._, ._ }, // repeat for tmp0 down to 0
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null }, // same shape as the previous entry, counting with `_c` + `de` and a sign test
+                    .src_constraints = .{
+                        .{ .remainder_int = .{ .of = .qword, .is = .qword } },
+                        .{ .unsigned_int = .byte },
+                        .any,
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .{ .to_reg = .cl }, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{ .mem, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .src1b, ._, ._ },
+                        .{ ._, ._, .mov, .tmp1d, .sia(-1, .src0, .add_size_div_8), ._, ._ },
+                        .{ ._, ._r, .sh, .tmp0d, .ui(6), ._, ._ },
+                        .{ ._, ._, .sub, .tmp1d, .tmp0d, ._, ._ },
+                        .{ ._, ._, .lea, .tmp2p, .memsid(.dst0, .@"8", .tmp0, 8), ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .memsi(.src0q, .@"8", .tmp1), ._, ._ },
+                        .{ ._, ._mp, .j, .@"1f", ._, ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp4q, .memsi(.src0q, .@"8", .tmp1), ._, ._ },
+                        .{ ._, ._ld, .sh, .tmp3q, .tmp4q, .src1b, ._ },
+                        .{ ._, ._, .mov, .leasi(.tmp2q, .@"8", .tmp1), .tmp3q, ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .tmp4q, ._, ._ },
+                        .{ .@"1:", ._c, .de, .tmp1d, ._, ._, ._ }, // decrement form replaces `sub ..., 1`
+                        .{ ._, ._ns, .j, .@"0b", ._, ._, ._ }, // loop while the index has not gone negative
+                        .{ ._, ._l, .sh, .tmp3q, .src1b, ._, ._ },
+                        .{ .@"0:", ._, .mov, .memsi(.dst0q, .@"8", .tmp0), .tmp3q, ._, ._ },
+                        .{ ._, ._, .xor, .tmp3d, .tmp3d, ._, ._ },
+                        .{ ._, ._c, .de, .tmp0d, ._, ._, ._ },
+                        .{ ._, ._ns, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .slow_incdec, null, null }, // word-sized count variant of the .slow_incdec entry
+                    .src_constraints = .{
+                        .{ .remainder_int = .{ .of = .qword, .is = .qword } },
+                        .{ .unsigned_int = .word },
+                        .any,
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .{ .to_reg = .cx }, .none } }, // count is placed in cx instead of cl
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{ .mem, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .src1w, ._, ._ }, // zero-extend the 16-bit count
+                        .{ ._, ._, .mov, .tmp1d, .sia(-1, .src0, .add_size_div_8), ._, ._ },
+                        .{ ._, ._r, .sh, .tmp0d, .ui(6), ._, ._ },
+                        .{ ._, ._, .sub, .tmp1d, .tmp0d, ._, ._ },
+                        .{ ._, ._, .lea, .tmp2p, .memsid(.dst0, .@"8", .tmp0, 8), ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .memsi(.src0q, .@"8", .tmp1), ._, ._ },
+                        .{ ._, ._mp, .j, .@"1f", ._, ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp4q, .memsi(.src0q, .@"8", .tmp1), ._, ._ },
+                        .{ ._, ._ld, .sh, .tmp3q, .tmp4q, .src1b, ._ }, // the shift instructions still take the count as src1b
+                        .{ ._, ._, .mov, .leasi(.tmp2q, .@"8", .tmp1), .tmp3q, ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .tmp4q, ._, ._ },
+                        .{ .@"1:", ._, .sub, .tmp1d, .si(1), ._, ._ },
+                        .{ ._, ._ae, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._l, .sh, .tmp3q, .src1b, ._, ._ },
+                        .{ .@"0:", ._, .mov, .memsi(.dst0q, .@"8", .tmp0), .tmp3q, ._, ._ },
+                        .{ ._, ._, .xor, .tmp3d, .tmp3d, ._, ._ },
+                        .{ ._, ._, .sub, .tmp0d, .si(1), ._, ._ },
+                        .{ ._, ._ae, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null }, // word-sized count variant using the decrement form
+                    .src_constraints = .{
+                        .{ .remainder_int = .{ .of = .qword, .is = .qword } },
+                        .{ .unsigned_int = .word },
+                        .any,
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .{ .to_reg = .cx }, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{ .mem, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .src1w, ._, ._ },
+                        .{ ._, ._, .mov, .tmp1d, .sia(-1, .src0, .add_size_div_8), ._, ._ },
+                        .{ ._, ._r, .sh, .tmp0d, .ui(6), ._, ._ },
+                        .{ ._, ._, .sub, .tmp1d, .tmp0d, ._, ._ },
+                        .{ ._, ._, .lea, .tmp2p, .memsid(.dst0, .@"8", .tmp0, 8), ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .memsi(.src0q, .@"8", .tmp1), ._, ._ },
+                        .{ ._, ._mp, .j, .@"1f", ._, ._, ._ },
+                        .{ .@"0:", ._, .mov, .tmp4q, .memsi(.src0q, .@"8", .tmp1), ._, ._ },
+                        .{ ._, ._ld, .sh, .tmp3q, .tmp4q, .src1b, ._ },
+                        .{ ._, ._, .mov, .leasi(.tmp2q, .@"8", .tmp1), .tmp3q, ._, ._ },
+                        .{ ._, ._, .mov, .tmp3q, .tmp4q, ._, ._ },
+                        .{ .@"1:", ._c, .de, .tmp1d, ._, ._, ._ },
+                        .{ ._, ._ns, .j, .@"0b", ._, ._, ._ },
+                        .{ ._, ._l, .sh, .tmp3q, .src1b, ._, ._ },
+                        .{ .@"0:", ._, .mov, .memsi(.dst0q, .@"8", .tmp0), .tmp3q, ._, ._ },
+                        .{ ._, ._, .xor, .tmp3d, .tmp3d, ._, ._ },
+                        .{ ._, ._c, .de, .tmp0d, ._, ._, ._ },
+                        .{ ._, ._ns, .j, .@"0b", ._, ._, ._ },
+                    } },
+                } }) catch |err| switch (err) { // SelectFailed becomes a codegen failure naming the tag, operand types and operand tracking
+                    error.SelectFailed => return cg.fail("failed to select {s} {} {} {} {}", .{
+                        @tagName(air_tag),
+                        cg.typeOf(bin_op.lhs).fmt(pt),
+                        cg.typeOf(bin_op.rhs).fmt(pt),
+                        ops[0].tracking(cg),
+                        ops[1].tracking(cg),
+                    }),
+                    else => |e| return e, // any other error propagates unchanged
+                };
+                switch (air_tag) { // post-processing differs between the two tags handled by this prong
+                    else => unreachable,
+                    .shl => res[0].wrapInt(cg) catch |err| switch (err) { // only plain .shl runs wrapInt on the result
+                        error.SelectFailed => return cg.fail("failed to select wrap {} {} {} {}", .{
+                            cg.typeOf(bin_op.lhs).fmt(pt),
+                            cg.typeOf(bin_op.rhs).fmt(pt),
+                            ops[0].tracking(cg),
+                            ops[1].tracking(cg),
+                        }),
+                        else => |e| return e,
+                    },
+                    .shl_exact => {}, // the result is used as produced by the selected case
+                }
+                try res[0].finish(inst, &.{ bin_op.lhs, bin_op.rhs }, &ops, cg); // hand the result and both operand temps back to the code generator
+            },
             .not => |air_tag| if (use_old) try cg.airUnOp(inst, air_tag) else {
                 const ty_op = air_datas[@intFromEnum(inst)].ty_op;
                 var ops = try cg.tempsFromOperands(inst, .{ty_op.operand});
@@ -85342,7 +86280,7 @@ fn regClassForType(self: *CodeGen, ty: Type) Register.Class {
 fn regSetForRegClass(rc: Register.Class) RegisterManager.RegisterBitSet {
     return switch (rc) {
         .general_purpose => abi.RegisterClass.gp,
-        .segment, .ip, .cr, .dr => unreachable,
+        .gphi, .segment, .ip, .cr, .dr => unreachable,
         .x87 => abi.RegisterClass.x87,
         .mmx => @panic("TODO"),
         .sse => abi.RegisterClass.sse,
@@ -97763,7 +98701,7 @@ fn moveStrategy(cg: *CodeGen, ty: Type, class: Register.Class, aligned: bool) !M
     const pt = cg.pt;
     const zcu = pt.zcu;
     switch (class) {
-        .general_purpose, .segment => return .{ .load_store = .{ ._, .mov } },
+        .general_purpose, .gphi, .segment => return .{ .load_store = .{ ._, .mov } },
         .x87 => return .load_store_x87,
         .mmx => {},
         .sse => switch (ty.zigTypeTag(zcu)) {
@@ -98239,7 +99177,7 @@ fn genSetReg(
         .reserved_frame,
         => unreachable,
         .undef => if (opts.safety) switch (dst_reg.class()) {
-            .general_purpose => switch (abi_size) {
+            .general_purpose, .gphi => switch (abi_size) {
                 1 => try self.asmRegisterImmediate(.{ ._, .mov }, dst_reg.to8(), .u(0xaa)),
                 2 => try self.asmRegisterImmediate(.{ ._, .mov }, dst_reg.to16(), .u(0xaaaa)),
                 3...4 => try self.asmRegisterImmediate(
@@ -98296,8 +99234,8 @@ fn genSetReg(
             }
         },
         .register => |src_reg| if (dst_reg.id() != src_reg.id()) switch (dst_reg.class()) {
-            .general_purpose => switch (src_reg.class()) {
-                .general_purpose => try self.asmRegisterRegister(
+            .general_purpose, .gphi => switch (src_reg.class()) {
+                .general_purpose, .gphi => try self.asmRegisterRegister(
                     .{ ._, .mov },
                     dst_alias,
                     registerAlias(src_reg, abi_size),
@@ -98341,13 +99279,13 @@ fn genSetReg(
                 .{ ._, .mov },
                 dst_reg,
                 switch (src_reg.class()) {
-                    .general_purpose, .segment => registerAlias(src_reg, abi_size),
+                    .general_purpose, .gphi, .segment => registerAlias(src_reg, abi_size),
                     .x87, .mmx, .ip, .cr, .dr => unreachable,
                     .sse => try self.copyToTmpRegister(ty, src_mcv),
                 },
             ),
             .x87 => switch (src_reg.class()) {
-                .general_purpose, .segment => unreachable,
+                .general_purpose, .gphi, .segment => unreachable,
                 .x87 => switch (src_reg) {
                     .st0 => try self.asmRegister(.{ .f_, .st }, dst_reg),
                     .st1, .st2, .st3, .st4, .st5, .st6 => switch (dst_reg) {
@@ -98376,7 +99314,7 @@ fn genSetReg(
             },
             .mmx => unreachable,
             .sse => switch (src_reg.class()) {
-                .general_purpose => if (self.hasFeature(.sse2)) try self.asmRegisterRegister(
+                .general_purpose, .gphi => if (self.hasFeature(.sse2)) try self.asmRegisterRegister(
                     switch (abi_size) {
                         1...4 => if (self.hasFeature(.avx)) .{ .v_d, .mov } else .{ ._d, .mov },
                         5...8 => if (self.hasFeature(.avx)) .{ .v_q, .mov } else .{ ._q, .mov },
@@ -98602,7 +99540,7 @@ fn genSetReg(
                         } },
                     }),
                 .load_symbol => |sym_off| switch (dst_reg.class()) {
-                    .general_purpose => {
+                    .general_purpose, .gphi => {
                         assert(sym_off.off == 0);
                         try self.asmRegisterMemory(.{ ._, .mov }, dst_alias, .{
                             .base = .{ .reloc = sym_off.sym_index },
@@ -98617,7 +99555,7 @@ fn genSetReg(
                     .x87, .sse => {},
                 },
                 .load_direct => |sym_index| switch (dst_reg.class()) {
-                    .general_purpose => {
+                    .general_purpose, .gphi => {
                         _ = try self.addInst(.{
                             .tag = .mov,
                             .ops = .direct_reloc,
@@ -98781,7 +99719,7 @@ fn genSetMem(
             };
             const src_alias = registerAlias(src_reg, abi_size);
             const src_size: u32 = @intCast(switch (src_alias.class()) {
-                .general_purpose, .segment, .x87, .ip, .cr, .dr => @divExact(src_alias.bitSize(), 8),
+                .general_purpose, .gphi, .segment, .x87, .ip, .cr, .dr => @divExact(src_alias.bitSize(), 8),
                 .mmx, .sse => abi_size,
             });
             const src_align: InternPool.Alignment = .fromNonzeroByteUnits(
@@ -103089,7 +104027,7 @@ fn resolveCallingConventionValues(
                 const ret_gpr = abi.getCAbiIntReturnRegs(cc);
                 const ret_size: u31 = @intCast(ret_ty.abiSize(zcu));
                 if (abi.zigcc.return_in_regs) switch (self.regClassForType(ret_ty)) {
-                    .general_purpose => if (ret_size <= @as(u4, switch (self.target.cpu.arch) {
+                    .general_purpose, .gphi => if (ret_size <= @as(u4, switch (self.target.cpu.arch) {
                         else => unreachable,
                         .x86 => 4,
                         .x86_64 => 8,
@@ -103119,7 +104057,7 @@ fn resolveCallingConventionValues(
                 }
                 const param_size: u31 = @intCast(param_ty.abiSize(zcu));
                 if (abi.zigcc.params_in_regs) switch (self.regClassForType(param_ty)) {
-                    .general_purpose => if (param_gpr.len >= 1 and param_size <= @as(u4, switch (self.target.cpu.arch) {
+                    .general_purpose, .gphi => if (param_gpr.len >= 1 and param_size <= @as(u4, switch (self.target.cpu.arch) {
                         else => unreachable,
                         .x86 => 4,
                         .x86_64 => 8,
@@ -103192,10 +104130,9 @@ fn parseRegName(name: []const u8) ?Register {
 
 /// Returns register wide enough to hold at least `size_bytes`.
 fn registerAlias(reg: Register, size_bytes: u32) Register {
+    if (size_bytes == 0) unreachable; // should be comptime-known
     return switch (reg.class()) {
-        .general_purpose => if (size_bytes == 0)
-            unreachable // should be comptime-known
-        else if (size_bytes <= 1)
+        .general_purpose => if (size_bytes <= 1)
             reg.to8()
         else if (size_bytes <= 2)
             reg.to16()
@@ -103205,6 +104142,16 @@ fn registerAlias(reg: Register, size_bytes: u32) Register {
             reg.to64()
         else
             unreachable,
+        .gphi => if (size_bytes <= 1)
+            reg
+        else if (size_bytes <= 2)
+            reg.to16()
+        else if (size_bytes <= 4)
+            reg.to32()
+        else if (size_bytes <= 8)
+            reg.to64()
+        else
+            unreachable,
         .segment => if (size_bytes <= 2)
             reg
         else
@@ -104566,6 +105513,7 @@ const Temp = struct {
             .required_features = .{ .@"64bit", .bmi2, null, null },
             .src_constraints = .{ .{ .unsigned_int = .qword }, .any, .any },
             .patterns = &.{
+                .{ .src = .{ .mem, .none, .none } },
                 .{ .src = .{ .to_gpr, .none, .none } },
             },
             .extra_temps = .{
@@ -104594,6 +105542,19 @@ const Temp = struct {
                 .{ .src = .{ .mut_mem, .none, .none } },
                 .{ .src = .{ .to_mut_gpr, .none, .none } },
             },
+            .extra_temps = .{
+                .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                .unused,
+                .unused,
+                .unused,
+                .unused,
+                .unused,
+                .unused,
+                .unused,
+                .unused,
+                .unused,
+                .unused,
+            },
             .dst_temps = .{ .{ .ref = .src0 }, .unused },
             .clobbers = .{ .eflags = true },
             .each = .{ .once = &.{
@@ -108625,7 +109586,7 @@ const Temp = struct {
     ) InnerError!void {
         const tomb_bits = cg.liveness.getTombBits(inst);
         for (0.., op_refs, op_temps) |op_index, op_ref, op_temp| {
-            if (op_temp.index != temp.index) try op_temp.die(cg);
+            if (op_temp.index != temp.index and op_temp.tracking(cg).short != .dead) try op_temp.die(cg);
             if (tomb_bits & @as(Liveness.Bpi, 1) << @intCast(op_index) == 0) continue;
             if (cg.reused_operands.isSet(op_index)) continue;
             try cg.processDeath(op_ref.toIndexAllowNone() orelse continue);
@@ -110197,7 +111158,10 @@ const Select = struct {
             }
 
             fn valueOf(ref: Ref, s: *const Select) MCValue {
-                return s.temps[@intFromEnum(ref)].tracking(s.cg).short;
+                return switch (ref) { // resolve `ref` to the MCValue currently tracked for it
+                    .none => .none, // answered directly, without indexing `s.temps`
+                    else => s.temps[@intFromEnum(ref)].tracking(s.cg).short, // `short` value of the tracking entry of the temp at index `@intFromEnum(ref)`
+                };
             }
         };
 
@@ -110742,42 +111706,39 @@ const Select = struct {
                 .lea => .{ .mem = .{
                     .base = switch (op.flags.base.ref.valueOf(s)) {
                         else => unreachable,
+                        .none => .none,
                         .register => |base_reg| .{ .reg = registerAlias(base_reg, @divExact(s.cg.target.ptrBitWidth(), 8)) },
                         .register_offset => |base_reg_off| .{ .reg = registerAlias(base_reg_off.reg, @divExact(s.cg.target.ptrBitWidth(), 8)) },
                         .lea_symbol => |base_sym_off| .{ .reloc = base_sym_off.sym_index },
                     },
                     .mod = .{ .rm = .{
                         .size = op.flags.base.size,
-                        .index = switch (op.flags.index.ref) {
-                            else => |index_ref| switch (index_ref.valueOf(s)) {
-                                else => unreachable,
-                                .register => |index_reg| registerAlias(index_reg, @divExact(s.cg.target.ptrBitWidth(), 8)),
-                                .register_offset => |index_reg_off| registerAlias(index_reg_off.reg, @divExact(s.cg.target.ptrBitWidth(), 8)),
-                            },
+                        .index = switch (op.flags.index.ref.valueOf(s)) {
+                            else => unreachable,
                             .none => .none,
+                            .register => |index_reg| registerAlias(index_reg, @divExact(s.cg.target.ptrBitWidth(), 8)),
+                            .register_offset => |index_reg_off| registerAlias(index_reg_off.reg, @divExact(s.cg.target.ptrBitWidth(), 8)),
                         },
                         .scale = op.flags.index.scale,
                         .disp = op.adjustedImm(i32, s) + switch (op.flags.base.ref.valueOf(s)) {
                             else => unreachable,
-                            .register => 0,
+                            .none, .register => 0,
+                            .register_offset => |base_reg_off| base_reg_off.off,
+                            .lea_symbol => |base_sym_off| base_sym_off.off,
+                        } + switch (op.flags.index.ref.valueOf(s)) {
+                            else => unreachable,
+                            .none, .register => 0,
                             .register_offset => |base_reg_off| base_reg_off.off,
                             .lea_symbol => |base_sym_off| base_sym_off.off,
-                        } + switch (op.flags.index.ref) {
-                            else => |index_ref| switch (index_ref.valueOf(s)) {
-                                else => unreachable,
-                                .register => 0,
-                                .register_offset => |base_reg_off| base_reg_off.off,
-                                .lea_symbol => |base_sym_off| base_sym_off.off,
-                            },
-                            .none => 0,
                         },
                     } },
                 } },
                 .mem => .{ .mem = try op.flags.base.ref.valueOf(s).mem(s.cg, .{
                     .size = op.flags.base.size,
-                    .index = switch (op.flags.index.ref) {
-                        else => |index_ref| registerAlias(index_ref.valueOf(s).register, @divExact(s.cg.target.ptrBitWidth(), 8)),
+                    .index = switch (op.flags.index.ref.valueOf(s)) {
+                        else => unreachable,
                         .none => .none,
+                        .register => |index_reg| registerAlias(index_reg, @divExact(s.cg.target.ptrBitWidth(), 8)),
                     },
                     .scale = op.flags.index.scale,
                     .disp = op.adjustedImm(i32, s),
src/arch/x86_64/Encoding.zig
@@ -592,6 +592,7 @@ pub const Op = enum {
                         else => unreachable,
                     },
                 },
+                .gphi => .r8,
                 .segment => .sreg,
                 .x87 => switch (reg) {
                     .st0 => .st0,
test/behavior/x86_64/binary.zig
@@ -1,4 +1,5 @@
 const AddOneBit = math.AddOneBit;
+const cast = math.cast;
 const checkExpected = math.checkExpected;
 const Compare = math.Compare;
 const DoubleBits = math.DoubleBits;
@@ -6,6 +7,7 @@ const fmax = math.fmax;
 const fmin = math.fmin;
 const Gpr = math.Gpr;
 const inf = math.inf;
+const Log2Int = math.Log2Int;
 const math = @import("math.zig");
 const nan = math.nan;
 const Scalar = math.Scalar;
@@ -5582,6 +5584,28 @@ test mod {
     try test_mod.testFloatVectors();
 }
 
+inline fn max(comptime Type: type, lhs: Type, rhs: Type) Type { // wrapper around the `@max` builtin; handed to `binary` by `test max`
+    return @max(lhs, rhs);
+}
+test max {
+    const test_max = binary(max, .{}); // NOTE(review): `binary` is defined outside this hunk; presumably it builds the shared two-operand test harness
+    try test_max.testInts(); // going by the method names: ints, int vectors, floats and float vectors are all exercised
+    try test_max.testIntVectors();
+    try test_max.testFloats();
+    try test_max.testFloatVectors();
+}
+
+inline fn min(comptime Type: type, lhs: Type, rhs: Type) Type { // wrapper around the `@min` builtin; handed to `binary` by `test min`
+    return @min(lhs, rhs);
+}
+test min {
+    const test_min = binary(min, .{}); // NOTE(review): `binary` is defined outside this hunk; presumably it builds the shared two-operand test harness
+    try test_min.testInts(); // going by the method names: ints, int vectors, floats and float vectors are all exercised
+    try test_min.testIntVectors();
+    try test_min.testFloats();
+    try test_min.testFloatVectors();
+}
+
 inline fn equal(comptime Type: type, lhs: Type, rhs: Type) @TypeOf(lhs == rhs) {
     return lhs == rhs;
 }
@@ -5654,35 +5678,55 @@ test bitOr {
     try test_bit_or.testIntVectors();
 }
 
-inline fn bitXor(comptime Type: type, lhs: Type, rhs: Type) @TypeOf(lhs ^ rhs) {
-    return lhs ^ rhs;
+inline fn shr(comptime Type: type, lhs: Type, rhs: Type) Type { // `lhs >> rhs`, with `rhs` first folded into a shift amount of type `Log2Int(Type)`
+    const bit_cast_rhs: @Type(.{ .int = .{ .signedness = .unsigned, .bits = @bitSizeOf(Type) } }) = @bitCast(rhs); // reinterpret `rhs` as an unsigned integer of the same bit width
+    const truncate_rhs: Log2Int(Type) = @truncate(bit_cast_rhs); // keep only the low bits that fit `Log2Int(Type)`
+    return lhs >> if (comptime cast(Log2Int(Type), @bitSizeOf(Type))) |bits| truncate_rhs % bits else truncate_rhs; // when the bit width itself fits in `Log2Int(Type)`, also reduce the amount modulo the bit width
 }
-test bitXor {
-    const test_bit_xor = binary(bitXor, .{});
-    try test_bit_xor.testInts();
-    try test_bit_xor.testIntVectors();
+test shr {
+    const test_shr = binary(shr, .{}); // NOTE(review): `binary` is defined outside this hunk; presumably the shared two-operand test harness
+    try test_shr.testInts(); // only `testInts` is invoked for this operation
 }
 
-inline fn min(comptime Type: type, lhs: Type, rhs: Type) Type {
-    return @min(lhs, rhs);
+inline fn shrExact(comptime Type: type, lhs: Type, rhs: Type) Type { // `@shrExact` on operands adjusted so the exactness requirement holds
+    const bit_cast_rhs: @Type(.{ .int = .{ .signedness = .unsigned, .bits = @bitSizeOf(Type) } }) = @bitCast(rhs); // reinterpret `rhs` as an unsigned integer of the same bit width
+    const truncate_rhs: Log2Int(Type) = @truncate(bit_cast_rhs); // keep only the low bits that fit `Log2Int(Type)`
+    const final_rhs = if (comptime cast(Log2Int(Type), @bitSizeOf(Type))) |bits| truncate_rhs % bits else truncate_rhs; // when the bit width itself fits in `Log2Int(Type)`, also reduce modulo the bit width
+    return @shrExact(lhs >> final_rhs << final_rhs, final_rhs); // shift right then back left to clear the low `final_rhs` bits, so only zeros are shifted out
 }
-test min {
-    const test_min = binary(min, .{});
-    try test_min.testInts();
-    try test_min.testIntVectors();
-    try test_min.testFloats();
-    try test_min.testFloatVectors();
+test shrExact {
+    const test_shr_exact = binary(shrExact, .{}); // NOTE(review): `binary` is defined outside this hunk; presumably the shared two-operand test harness
+    try test_shr_exact.testInts(); // only `testInts` is invoked for this operation
 }
 
-inline fn max(comptime Type: type, lhs: Type, rhs: Type) Type {
-    return @max(lhs, rhs);
+inline fn shl(comptime Type: type, lhs: Type, rhs: Type) Type { // `lhs << rhs`, with `rhs` first folded into a shift amount of type `Log2Int(Type)`
+    const bit_cast_rhs: @Type(.{ .int = .{ .signedness = .unsigned, .bits = @bitSizeOf(Type) } }) = @bitCast(rhs); // reinterpret `rhs` as an unsigned integer of the same bit width
+    const truncate_rhs: Log2Int(Type) = @truncate(bit_cast_rhs); // keep only the low bits that fit `Log2Int(Type)`
+    return lhs << if (comptime cast(Log2Int(Type), @bitSizeOf(Type))) |bits| truncate_rhs % bits else truncate_rhs; // when the bit width itself fits in `Log2Int(Type)`, also reduce the amount modulo the bit width
 }
-test max {
-    const test_max = binary(max, .{});
-    try test_max.testInts();
-    try test_max.testIntVectors();
-    try test_max.testFloats();
-    try test_max.testFloatVectors();
+test shl {
+    const test_shl = binary(shl, .{}); // NOTE(review): `binary` is defined outside this hunk; presumably the shared two-operand test harness
+    try test_shl.testInts(); // only `testInts` is invoked for this operation
+}
+
+inline fn shlExact(comptime Type: type, lhs: Type, rhs: Type) Type { // `@shlExact` on operands adjusted before the exact shift
+    const bit_cast_rhs: @Type(.{ .int = .{ .signedness = .unsigned, .bits = @bitSizeOf(Type) } }) = @bitCast(rhs); // reinterpret `rhs` as an unsigned integer of the same bit width
+    const truncate_rhs: Log2Int(Type) = @truncate(bit_cast_rhs); // keep only the low bits that fit `Log2Int(Type)`
+    const final_rhs = if (comptime cast(Log2Int(Type), @bitSizeOf(Type))) |bits| truncate_rhs % bits else truncate_rhs; // when the bit width itself fits in `Log2Int(Type)`, also reduce modulo the bit width
+    return @shlExact(lhs << final_rhs >> final_rhs, final_rhs); // `lhs` is shifted left then back right first; NOTE(review): presumably so the operand given to `@shlExact` cannot overflow - confirm
+}
+test shlExact {
+    const test_shl_exact = binary(shlExact, .{}); // NOTE(review): `binary` is defined outside this hunk; presumably the shared two-operand test harness
+    try test_shl_exact.testInts(); // only `testInts` is invoked for this operation
+}
+
+inline fn bitXor(comptime Type: type, lhs: Type, rhs: Type) @TypeOf(lhs ^ rhs) { // return type is whatever `lhs ^ rhs` yields for `Type`
+    return lhs ^ rhs; // plain `^` operator; handed to `binary` by `test bitXor`
+}
+test bitXor {
+    const test_bit_xor = binary(bitXor, .{}); // NOTE(review): `binary` is defined outside this hunk; presumably the shared two-operand test harness
+    try test_bit_xor.testInts(); // going by the method names: scalar ints and int vectors are exercised
+    try test_bit_xor.testIntVectors();
 }
 
 inline fn optionalsEqual(comptime Type: type, lhs: Type, rhs: Type) bool {
test/behavior/x86_64/math.zig
@@ -2,6 +2,7 @@ const builtin = @import("builtin");
 const math = std.math;
 const std = @import("std");
 
+pub const cast = math.cast;
 pub const fmax = math.floatMax;
 pub const fmin = math.floatMin;
 pub const imax = math.maxInt;