Commit dcc9fe322e

Jacob Young <jacobly0@users.noreply.github.com>
2025-02-15 09:45:08
x86_64: rewrite unsafe scalar int multiplication
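
The headline addition is the set of new `cg.select` cases for scalar integer multiplication, including a dedicated 128-bit (`.xword`) lowering and limb loops for wider integers. As a minimal sketch (not part of the commit; the helper name and test values are illustrative) of the limb arithmetic that the `.xword` case's mov/mul/imul/add rows perform, assuming wrapping ("unsafe", non-overflow-checked) semantics:

    const std = @import("std");

    /// Wrapping 128-bit multiply built from three 64-bit limb products,
    /// mirroring the mov/mul/imul/add rows of the new `.xword` case:
    ///   rdx:rax = lhs.lo * rhs.lo   (mul)
    ///   rdx    += lhs.lo * rhs.hi   (imul, add)
    ///   rdx    += lhs.hi * rhs.lo   (imul, add)
    fn mulWide128(lhs: u128, rhs: u128) u128 {
        const lhs_lo: u64 = @truncate(lhs);
        const lhs_hi: u64 = @truncate(lhs >> 64);
        const rhs_lo: u64 = @truncate(rhs);
        const rhs_hi: u64 = @truncate(rhs >> 64);

        const lo_full = @as(u128, lhs_lo) * rhs_lo; // 64x64 -> 128 bits (mul)
        var hi: u64 = @truncate(lo_full >> 64); // rdx after mul
        hi +%= lhs_lo *% rhs_hi; // imul by the high limb of rhs, add into rdx
        hi +%= lhs_hi *% rhs_lo; // imul by the high limb of lhs, add into rdx
        return (@as(u128, hi) << 64) | @as(u64, @truncate(lo_full));
    }

    test mulWide128 {
        const a: u128 = 0x0123456789abcdef_fedcba9876543210;
        const b: u128 = 0x0f1e2d3c4b5a6978_8796a5b4c3d2e1f0;
        try std.testing.expectEqual(a *% b, mulWide128(a, b));
    }

Because only the low 128 bits of the product are kept, the same bit pattern is correct for signed and unsigned operands, which is presumably why the constraint is simply `.{ .int = .xword }` rather than distinguishing signedness.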
1 parent 5db585f
Changed files (8)
src/arch/x86_64/CodeGen.zig
@@ -6036,10 +6036,377 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
             .sub_safe => unreachable,
             .mul, .mul_optimized => |air_tag| if (use_old) try cg.airMulDivBinOp(inst, .mul) else fallback: {
                 const bin_op = air_datas[@intFromEnum(inst)].bin_op;
-                if (cg.floatBits(cg.typeOf(bin_op.lhs).scalarType(zcu)) == null) break :fallback try cg.airMulDivBinOp(inst, .mul);
+                const ty = cg.typeOf(bin_op.lhs);
+                if (ty.isVector(zcu) and cg.floatBits(ty.childType(zcu)) == null) break :fallback try cg.airMulDivBinOp(inst, .mul);
                 var ops = try cg.tempsFromOperands(inst, .{ bin_op.lhs, bin_op.rhs });
                 var res: [1]Temp = undefined;
-                cg.select(&res, &.{cg.typeOf(bin_op.lhs)}, &ops, comptime &.{ .{
+                cg.select(&res, &.{ty}, &ops, comptime &.{ .{
+                    .src_constraints = .{ .{ .signed_int = .byte }, .{ .signed_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .{ .to_reg = .al }, .mem, .none } },
+                        .{ .src = .{ .mem, .{ .to_reg = .al }, .none }, .commute = .{ 0, 1 } },
+                        .{ .src = .{ .{ .to_reg = .al }, .to_gpr, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, .i_, .mul, .src1b, ._, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .unsigned_int = .byte }, .{ .unsigned_int = .byte }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .{ .to_reg = .al }, .mem, .none } },
+                        .{ .src = .{ .mem, .{ .to_reg = .al }, .none }, .commute = .{ 0, 1 } },
+                        .{ .src = .{ .{ .to_reg = .al }, .to_gpr, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mul, .src1b, ._, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .int = .word }, .{ .int = .word }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .imm16, .none } },
+                        .{ .src = .{ .imm16, .mem, .none }, .commute = .{ 0, 1 } },
+                        .{ .src = .{ .to_gpr, .imm16, .none } },
+                        .{ .src = .{ .imm16, .to_gpr, .none }, .commute = .{ 0, 1 } },
+                    },
+                    .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .general_purpose } }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, .i_, .mul, .dst0w, .src0w, .src1w, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .int = .word }, .{ .int = .word }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .mem, .none } },
+                        .{ .src = .{ .mem, .to_mut_gpr, .none }, .commute = .{ 0, 1 } },
+                        .{ .src = .{ .to_mut_gpr, .to_gpr, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, .i_, .mul, .dst0w, .src1w, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .int = .dword }, .{ .int = .dword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .imm32, .none } },
+                        .{ .src = .{ .imm32, .mem, .none }, .commute = .{ 0, 1 } },
+                        .{ .src = .{ .to_gpr, .imm32, .none } },
+                        .{ .src = .{ .imm32, .to_gpr, .none }, .commute = .{ 0, 1 } },
+                    },
+                    .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .general_purpose } }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, .i_, .mul, .dst0d, .src0d, .src1d, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .int = .dword }, .{ .int = .dword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .mem, .none } },
+                        .{ .src = .{ .mem, .to_mut_gpr, .none }, .commute = .{ 0, 1 } },
+                        .{ .src = .{ .to_mut_gpr, .to_gpr, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, .i_, .mul, .dst0d, .src1d, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .int = .qword }, .{ .int = .qword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .mem, .simm32, .none } },
+                        .{ .src = .{ .simm32, .mem, .none }, .commute = .{ 0, 1 } },
+                        .{ .src = .{ .to_gpr, .simm32, .none } },
+                        .{ .src = .{ .simm32, .to_gpr, .none }, .commute = .{ 0, 1 } },
+                    },
+                    .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .general_purpose } }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, .i_, .mul, .dst0q, .src0q, .src1q, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .int = .qword }, .{ .int = .qword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mut_gpr, .mem, .none } },
+                        .{ .src = .{ .mem, .to_mut_gpr, .none }, .commute = .{ 0, 1 } },
+                        .{ .src = .{ .to_mut_gpr, .to_gpr, .none } },
+                    },
+                    .dst_temps = .{ .{ .ref = .src0 }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, .i_, .mul, .dst0q, .src1q, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{ .{ .int = .xword }, .{ .int = .xword }, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u64, .kind = .{ .reg = .rax } },
+                        .{ .type = .u64, .kind = .{ .reg = .rdx } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{ .mem, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0q, .src0q, ._, ._ },
+                        .{ ._, ._, .mul, .src1q, ._, ._, ._ },
+                        .{ ._, ._, .mov, .dst0q, .tmp0q, ._, ._ },
+                        .{ ._, ._, .mov, .tmp0q, .src0q, ._, ._ },
+                        .{ ._, .i_, .mul, .tmp0q, .memd(.src1q, 8), ._, ._ },
+                        .{ ._, ._, .add, .tmp1q, .tmp0q, ._, ._ },
+                        .{ ._, ._, .mov, .tmp0q, .src1q, ._, ._ },
+                        .{ ._, .i_, .mul, .tmp0q, .memd(.src0q, 8), ._, ._ },
+                        .{ ._, ._, .add, .tmp1q, .tmp0q, ._, ._ },
+                        .{ ._, ._, .mov, .memd(.dst0q, 8), .tmp1q, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .bmi2, .adx, null },
+                    .src_constraints = .{
+                        .{ .remainder_int = .{ .of = .qword, .is = .qword } },
+                        .{ .remainder_int = .{ .of = .qword, .is = .qword } },
+                        .any,
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .reg = .rdx } },
+                        .{ .type = .isize, .kind = .{ .reg = .rcx } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{ .mem, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .sia(-8, .src0, .add_size), ._, ._ },
+                        .{ ._, ._, .lea, .tmp1p, .mem(.src1), ._, ._ },
+                        .{ .@"0:", ._, .xor, .tmp2d, .tmp2d, ._, ._ },
+                        .{ ._, ._, .@"or", .tmp2q, .memi(.src0q, .tmp0), ._, ._ },
+                        .{ ._, ._z, .j, .@"2f", ._, ._, ._ },
+                        .{ ._, ._, .lea, .tmp3p, .leaad(.tmp0, .sub_src0_size, 8), ._, ._ },
+                        .{ ._, ._, .xor, .tmp4d, .tmp4d, ._, ._ },
+                        .{ .@"1:", ._x, .mul, .tmp6q, .tmp5q, .leai(.tmp1q, .tmp3), ._ },
+                        .{ ._, ._x, .adc, .tmp5q, .tmp4q, ._, ._ },
+                        .{ ._, ._, .mov, .memiad(.dst0q, .tmp3, .add_size, -8), .tmp5q, ._, ._ },
+                        .{ ._, ._rcxz, .j, .@"1f", ._, ._, ._ },
+                        .{ ._, ._x, .ado, .tmp6q, .memia(.dst0q, .tmp3, .add_size), ._, ._ },
+                        .{ ._, ._, .mov, .tmp4q, .tmp6q, ._, ._ },
+                        .{ ._, ._, .lea, .tmp3p, .lead(.tmp3, 8), ._, ._ },
+                        .{ ._, ._mp, .j, .@"1b", ._, ._, ._ },
+                        .{ .@"2:", ._, .mov, .memi(.dst0q, .tmp0), .tmp2q, ._, ._ },
+                        .{ .@"1:", ._, .lea, .tmp1p, .lead(.tmp1, 8), ._, ._ },
+                        .{ ._, ._, .sub, .tmp0d, .si(8), ._, ._ },
+                        .{ ._, ._ae, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .bmi2, .slow_incdec, null },
+                    .src_constraints = .{
+                        .{ .remainder_int = .{ .of = .qword, .is = .qword } },
+                        .{ .remainder_int = .{ .of = .qword, .is = .qword } },
+                        .any,
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .reg = .rdx } },
+                        .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                    },
+                    .dst_temps = .{ .mem, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .sia(-1, .src0, .add_size_div_8), ._, ._ },
+                        .{ ._, ._, .lea, .tmp1p, .memd(.src1, 8), ._, ._ },
+                        .{ .@"0:", ._, .lea, .tmp2p, .leaa(.tmp0, .sub_src0_size_div_8), ._, ._ },
+                        .{ ._, ._, .xor, .tmp3d, .tmp3d, ._, ._ },
+                        .{ ._, ._, .xor, .tmp4d, .tmp4d, ._, ._ },
+                        .{ ._, ._, .xor, .tmp5d, .tmp5d, ._, ._ },
+                        .{ ._, ._, .@"or", .tmp3q, .memsi(.src0q, .@"8", .tmp0), ._, ._ },
+                        .{ ._, ._nz, .j, .@"2f", ._, ._, ._ },
+                        .{ ._, ._, .mov, .memsi(.dst0q, .@"8", .tmp0), .tmp3q, ._, ._ },
+                        .{ ._, ._mp, .j, .@"3f", ._, ._, ._ },
+                        .{ .@"1:", ._, .adc, .tmp7q, .memsia(.dst0q, .@"8", .tmp2, .add_size), ._, ._ },
+                        .{ ._, ._, .adc, .tmp4b, .si(0), ._, ._ },
+                        .{ ._, ._, .mov, .tmp5q, .tmp7q, ._, ._ },
+                        .{ ._, ._l, .sh, .tmp4b, .ui(4), ._, ._ },
+                        .{ .@"2:", ._x, .mul, .tmp7q, .tmp6q, .leasi(.tmp1q, .@"8", .tmp2), ._ },
+                        .{ ._, ._, .adc, .tmp6q, .tmp5q, ._, ._ },
+                        .{ ._, ._, .mov, .memsia(.dst0q, .@"8", .tmp2, .add_size), .tmp6q, ._, ._ },
+                        .{ ._, ._c, .in, .tmp2p, ._, ._, ._ },
+                        .{ ._, ._nz, .j, .@"1b", ._, ._, ._ },
+                        .{ .@"3:", ._, .lea, .tmp1p, .lead(.tmp1, 8), ._, ._ },
+                        .{ ._, ._, .sub, .tmp0d, .si(1), ._, ._ },
+                        .{ ._, ._ae, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .bmi2, null, null },
+                    .src_constraints = .{
+                        .{ .remainder_int = .{ .of = .qword, .is = .qword } },
+                        .{ .remainder_int = .{ .of = .qword, .is = .qword } },
+                        .any,
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .reg = .rdx } },
+                        .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                    },
+                    .dst_temps = .{ .mem, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .sia(-1, .src0, .add_size_div_8), ._, ._ },
+                        .{ ._, ._, .lea, .tmp1p, .memd(.src1, 8), ._, ._ },
+                        .{ .@"0:", ._, .lea, .tmp2p, .leaa(.tmp0, .sub_src0_size_div_8), ._, ._ },
+                        .{ ._, ._, .xor, .tmp3d, .tmp3d, ._, ._ },
+                        .{ ._, ._, .xor, .tmp4d, .tmp4d, ._, ._ },
+                        .{ ._, ._, .xor, .tmp5d, .tmp5d, ._, ._ },
+                        .{ ._, ._, .@"or", .tmp3q, .memsi(.src0q, .@"8", .tmp0), ._, ._ },
+                        .{ ._, ._nz, .j, .@"2f", ._, ._, ._ },
+                        .{ ._, ._, .mov, .memsi(.dst0q, .@"8", .tmp0), .tmp3q, ._, ._ },
+                        .{ ._, ._mp, .j, .@"3f", ._, ._, ._ },
+                        .{ .@"1:", ._, .adc, .tmp7q, .memsia(.dst0q, .@"8", .tmp2, .add_size), ._, ._ },
+                        .{ ._, ._, .adc, .tmp4b, .si(0), ._, ._ },
+                        .{ ._, ._, .mov, .tmp5q, .tmp7q, ._, ._ },
+                        .{ ._, ._l, .sh, .tmp4b, .ui(4), ._, ._ },
+                        .{ .@"2:", ._x, .mul, .tmp7q, .tmp6q, .leasi(.tmp1q, .@"8", .tmp2), ._ },
+                        .{ ._, ._, .adc, .tmp6q, .tmp5q, ._, ._ },
+                        .{ ._, ._, .mov, .memsia(.dst0q, .@"8", .tmp2, .add_size), .tmp6q, ._, ._ },
+                        .{ ._, ._c, .in, .tmp2p, ._, ._, ._ },
+                        .{ ._, ._nz, .j, .@"1b", ._, ._, ._ },
+                        .{ .@"3:", ._, .lea, .tmp1p, .lead(.tmp1, 8), ._, ._ },
+                        .{ ._, ._c, .de, .tmp0d, ._, ._, ._ },
+                        .{ ._, ._ns, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", .slow_incdec, null, null },
+                    .src_constraints = .{
+                        .{ .remainder_int = .{ .of = .qword, .is = .qword } },
+                        .{ .remainder_int = .{ .of = .qword, .is = .qword } },
+                        .any,
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .reg = .rax } },
+                        .{ .type = .u64, .kind = .{ .reg = .rdx } },
+                        .unused,
+                    },
+                    .dst_temps = .{ .mem, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .sia(-1, .src0, .add_size_div_8), ._, ._ },
+                        .{ ._, ._, .lea, .tmp1p, .memd(.src1, 8), ._, ._ },
+                        .{ .@"0:", ._, .lea, .tmp2p, .leaa(.tmp0, .sub_src0_size_div_8), ._, ._ },
+                        .{ ._, ._, .xor, .tmp3d, .tmp3d, ._, ._ },
+                        .{ ._, ._, .xor, .tmp4d, .tmp4d, ._, ._ },
+                        .{ ._, ._, .xor, .tmp5d, .tmp5d, ._, ._ },
+                        .{ ._, ._, .@"or", .tmp3q, .memsi(.src0q, .@"8", .tmp0), ._, ._ },
+                        .{ ._, ._nz, .j, .@"2f", ._, ._, ._ },
+                        .{ ._, ._, .mov, .memsi(.dst0q, .@"8", .tmp0), .tmp3q, ._, ._ },
+                        .{ ._, ._mp, .j, .@"3f", ._, ._, ._ },
+                        .{ .@"1:", ._, .adc, .tmp7q, .memsia(.dst0q, .@"8", .tmp2, .add_size), ._, ._ },
+                        .{ ._, ._, .adc, .tmp4b, .si(0), ._, ._ },
+                        .{ ._, ._, .mov, .tmp5q, .tmp7q, ._, ._ },
+                        .{ .@"2:", ._, .mov, .tmp6q, .tmp3q, ._, ._ },
+                        .{ ._, ._, .mul, .leasi(.tmp1q, .@"8", .tmp2), ._, ._, ._ },
+                        .{ ._, ._l, .sh, .tmp4b, .ui(4), ._, ._ },
+                        .{ ._, ._, .adc, .tmp6q, .tmp5q, ._, ._ },
+                        .{ ._, ._, .mov, .memsia(.dst0q, .@"8", .tmp2, .add_size), .tmp6q, ._, ._ },
+                        .{ ._, ._c, .in, .tmp2p, ._, ._, ._ },
+                        .{ ._, ._nz, .j, .@"1b", ._, ._, ._ },
+                        .{ .@"3:", ._, .lea, .tmp1p, .lead(.tmp1, 8), ._, ._ },
+                        .{ ._, ._, .sub, .tmp0d, .si(1), ._, ._ },
+                        .{ ._, ._ae, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .@"64bit", null, null, null },
+                    .src_constraints = .{
+                        .{ .remainder_int = .{ .of = .qword, .is = .qword } },
+                        .{ .remainder_int = .{ .of = .qword, .is = .qword } },
+                        .any,
+                    },
+                    .patterns = &.{
+                        .{ .src = .{ .to_mem, .to_mem, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .usize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .isize, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u64, .kind = .{ .reg = .rax } },
+                        .{ .type = .u64, .kind = .{ .reg = .rdx } },
+                        .unused,
+                    },
+                    .dst_temps = .{ .mem, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .sia(-1, .src0, .add_size_div_8), ._, ._ },
+                        .{ ._, ._, .lea, .tmp1p, .memd(.src1, 8), ._, ._ },
+                        .{ .@"0:", ._, .lea, .tmp2p, .leaa(.tmp0, .sub_src0_size_div_8), ._, ._ },
+                        .{ ._, ._, .xor, .tmp3d, .tmp3d, ._, ._ },
+                        .{ ._, ._, .xor, .tmp4d, .tmp4d, ._, ._ },
+                        .{ ._, ._, .xor, .tmp5d, .tmp5d, ._, ._ },
+                        .{ ._, ._, .@"or", .tmp3q, .memsi(.src0q, .@"8", .tmp0), ._, ._ },
+                        .{ ._, ._nz, .j, .@"2f", ._, ._, ._ },
+                        .{ ._, ._, .mov, .memsi(.dst0q, .@"8", .tmp0), .tmp3q, ._, ._ },
+                        .{ ._, ._mp, .j, .@"3f", ._, ._, ._ },
+                        .{ .@"1:", ._, .adc, .tmp7q, .memsia(.dst0q, .@"8", .tmp2, .add_size), ._, ._ },
+                        .{ ._, ._, .adc, .tmp4b, .si(0), ._, ._ },
+                        .{ ._, ._, .mov, .tmp5q, .tmp7q, ._, ._ },
+                        .{ .@"2:", ._, .mov, .tmp6q, .tmp3q, ._, ._ },
+                        .{ ._, ._, .mul, .leasi(.tmp1q, .@"8", .tmp2), ._, ._, ._ },
+                        .{ ._, ._l, .sh, .tmp4b, .ui(4), ._, ._ },
+                        .{ ._, ._, .adc, .tmp6q, .tmp5q, ._, ._ },
+                        .{ ._, ._, .mov, .memsia(.dst0q, .@"8", .tmp2, .add_size), .tmp6q, ._, ._ },
+                        .{ ._, ._c, .in, .tmp2p, ._, ._, ._ },
+                        .{ ._, ._nz, .j, .@"1b", ._, ._, ._ },
+                        .{ .@"3:", ._, .lea, .tmp1p, .lead(.tmp1, 8), ._, ._ },
+                        .{ ._, ._c, .de, .tmp0d, ._, ._, ._ },
+                        .{ ._, ._ns, .j, .@"0b", ._, ._, ._ },
+                    } },
+                }, .{
                     .required_features = .{ .f16c, null, null, null },
                     .src_constraints = .{
                         .{ .scalar_float = .{ .of = .word, .is = .word } },
@@ -6890,7 +7257,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                 } }) catch |err| switch (err) {
                     error.SelectFailed => return cg.fail("failed to select {s} {} {} {}", .{
                         @tagName(air_tag),
-                        cg.typeOf(bin_op.lhs).fmt(pt),
+                        ty.fmt(pt),
                         ops[0].tracking(cg),
                         ops[1].tracking(cg),
                     }),
@@ -92700,7 +93067,7 @@ const Select = struct {
         const mir_tag: Mir.Inst.FixedTag = .{ inst[1], inst[2] };
         pseudo: {
             switch (inst[0]) {
-                .@"0:", .@"1:", .@"2:" => |label| s.emitLabel(label),
+                .@"0:", .@"1:", .@"2:", .@"3:" => |label| s.emitLabel(label),
                 ._ => {},
                 .pseudo => break :pseudo,
             }
@@ -93578,7 +93945,7 @@ const Select = struct {
         Select.Operand,
         Select.Operand,
     };
-    const Label = enum { @"0:", @"1:", @"2:", @"_", pseudo };
+    const Label = enum { @"0:", @"1:", @"2:", @"3:", @"_", pseudo };
     const Operand = struct {
         flags: packed struct(u16) {
             tag: Tag,
@@ -93609,6 +93976,7 @@ const Select = struct {
                 ptr_size,
                 ptr_bit_size,
                 size,
+                src0_size,
                 delta_size,
                 delta_elem_size,
                 size_add_elem_size,
@@ -93641,6 +94009,8 @@ const Select = struct {
             const sub_size_div_8: Adjust = .{ .sign = .neg, .lhs = .size, .op = .div, .rhs = .@"8" };
             const sub_size_div_4: Adjust = .{ .sign = .neg, .lhs = .size, .op = .div, .rhs = .@"4" };
             const sub_size: Adjust = .{ .sign = .neg, .lhs = .size, .op = .mul, .rhs = .@"1" };
+            const sub_src0_size_div_8: Adjust = .{ .sign = .neg, .lhs = .src0_size, .op = .div, .rhs = .@"8" };
+            const sub_src0_size: Adjust = .{ .sign = .neg, .lhs = .src0_size, .op = .mul, .rhs = .@"1" };
             const add_delta_size_div_8: Adjust = .{ .sign = .pos, .lhs = .delta_size, .op = .div, .rhs = .@"8" };
             const add_delta_elem_size: Adjust = .{ .sign = .pos, .lhs = .delta_elem_size, .op = .mul, .rhs = .@"1" };
             const add_delta_elem_size_div_8: Adjust = .{ .sign = .pos, .lhs = .delta_elem_size, .op = .div, .rhs = .@"8" };
@@ -93882,6 +94252,8 @@ const Select = struct {
         const @"1f": Select.Operand = .{ .flags = .{ .tag = .forward_label }, .base = .{ .ref = .tmp1, .size = .none } };
         const @"2b": Select.Operand = .{ .flags = .{ .tag = .backward_label }, .base = .{ .ref = .tmp2, .size = .none } };
         const @"2f": Select.Operand = .{ .flags = .{ .tag = .forward_label }, .base = .{ .ref = .tmp2, .size = .none } };
+        const @"3b": Select.Operand = .{ .flags = .{ .tag = .backward_label }, .base = .{ .ref = .tmp3, .size = .none } };
+        const @"3f": Select.Operand = .{ .flags = .{ .tag = .forward_label }, .base = .{ .ref = .tmp3, .size = .none } };
 
         const tmp0b: Select.Operand = .{ .flags = .{ .tag = .ref }, .base = .tmp0b };
         const tmp0w: Select.Operand = .{ .flags = .{ .tag = .ref }, .base = .tmp0w };
@@ -94070,6 +94442,13 @@ const Select = struct {
                 .base = base,
             };
         }
+        fn leaad(base: Ref.Sized, adjust: Adjust, disp: i32) Select.Operand {
+            return .{
+                .flags = .{ .tag = .lea, .adjust = adjust },
+                .base = base,
+                .imm = disp,
+            };
+        }
         fn lead(base: Ref.Sized, disp: i32) Select.Operand {
             return .{
                 .flags = .{ .tag = .lea },
@@ -94226,6 +94605,7 @@ const Select = struct {
                 .ptr_size => @divExact(s.cg.target.ptrBitWidth(), 8),
                 .ptr_bit_size => s.cg.target.ptrBitWidth(),
                 .size => @intCast(op.base.ref.typeOf(s).abiSize(s.cg.pt.zcu)),
+                .src0_size => @intCast(Select.Operand.Ref.src0.typeOf(s).abiSize(s.cg.pt.zcu)),
                 .delta_size => @intCast(@as(SignedImm, @intCast(op.base.ref.typeOf(s).abiSize(s.cg.pt.zcu))) -
                     @as(SignedImm, @intCast(op.index.ref.typeOf(s).abiSize(s.cg.pt.zcu)))),
                 .delta_elem_size => @intCast(@as(SignedImm, @intCast(op.base.ref.typeOf(s).elemType2(s.cg.pt.zcu).abiSize(s.cg.pt.zcu))) -
src/arch/x86_64/Emit.zig
@@ -88,13 +88,32 @@ pub fn emitMir(emit: *Emit) Error!void {
                 lowered_relocs[0].lowered_inst_index == lowered_index) : ({
                 lowered_relocs = lowered_relocs[1..];
             }) switch (lowered_relocs[0].target) {
-                .inst => |target| try relocs.append(emit.lower.allocator, .{
-                    .source = start_offset,
-                    .source_offset = end_offset - 4,
-                    .target = target,
-                    .target_offset = lowered_relocs[0].off,
-                    .length = @intCast(end_offset - start_offset),
-                }),
+                .inst => |target| {
+                    const inst_length: u4 = @intCast(end_offset - start_offset);
+                    const reloc_offset, const reloc_length = reloc_offset_length: {
+                        var reloc_offset = inst_length;
+                        var op_index: usize = lowered_inst.ops.len;
+                        while (true) {
+                            op_index -= 1;
+                            const op = lowered_inst.encoding.data.ops[op_index];
+                            if (op == .none) continue;
+                            const enc_length: u4 = @intCast(
+                                std.math.divCeil(u7, @intCast(op.immBitSize()), 8) catch unreachable,
+                            );
+                            reloc_offset -= enc_length;
+                            if (op_index == lowered_relocs[0].op_index)
+                                break :reloc_offset_length .{ reloc_offset, enc_length };
+                        }
+                    };
+                    try relocs.append(emit.lower.allocator, .{
+                        .inst_offset = start_offset,
+                        .inst_length = inst_length,
+                        .source_offset = reloc_offset,
+                        .source_length = reloc_length,
+                        .target = target,
+                        .target_offset = lowered_relocs[0].off,
+                    });
+                },
                 .table => try table_relocs.append(emit.lower.allocator, .{
                     .source_offset = end_offset - 4,
                     .target_offset = lowered_relocs[0].off,
@@ -409,7 +428,7 @@ pub fn emitMir(emit: *Emit) Error!void {
                                     } } };
                                 },
                                 .pseudo_dbg_local_am => loc: {
-                                    const mem = emit.lower.mem(mir_inst.data.ax.payload);
+                                    const mem = emit.lower.mem(undefined, mir_inst.data.ax.payload);
                                     break :loc .{ mir_inst.data.ax.air_inst, .{ .plus = .{
                                         base: {
                                             loc_buf[0] = switch (mem.base()) {
@@ -466,15 +485,18 @@ pub fn emitMir(emit: *Emit) Error!void {
             }
         }
     }
-    {
-        // TODO this function currently assumes all relocs via JMP/CALL instructions are 32bit in size.
-        // This should be reversed like it is done in aarch64 MIR emit code: start with the smallest
-        // possible resolution, i.e., 8bit, and iteratively converge on the minimum required resolution
-        // until the entire decl is correctly emitted with all JMP/CALL instructions within range.
-        for (relocs.items) |reloc| {
-            const target = code_offset_mapping[reloc.target];
-            const disp = @as(i64, @intCast(target)) - @as(i64, @intCast(reloc.source + reloc.length)) + reloc.target_offset;
-            std.mem.writeInt(i32, emit.code.items[reloc.source_offset..][0..4], @intCast(disp), .little);
+    for (relocs.items) |reloc| {
+        const target = code_offset_mapping[reloc.target];
+        const disp = @as(i64, @intCast(target)) - @as(i64, @intCast(reloc.inst_offset + reloc.inst_length)) + reloc.target_offset;
+        const inst_bytes = emit.code.items[reloc.inst_offset..][0..reloc.inst_length];
+        switch (reloc.source_length) {
+            else => unreachable,
+            inline 1, 4 => |source_length| std.mem.writeInt(
+                @Type(.{ .int = .{ .signedness = .signed, .bits = @as(u16, 8) * source_length } }),
+                inst_bytes[reloc.source_offset..][0..source_length],
+                @intCast(disp),
+                .little,
+            ),
         }
     }
     if (emit.lower.mir.table.len > 0) {
@@ -511,15 +533,17 @@ fn fail(emit: *Emit, comptime format: []const u8, args: anytype) Error {
 
 const Reloc = struct {
     /// Offset of the instruction.
-    source: u32,
+    inst_offset: u32,
+    /// Length of the instruction.
+    inst_length: u4,
     /// Offset of the relocation within the instruction.
-    source_offset: u32,
+    source_offset: u4,
+    /// Length of the relocation.
+    source_length: u4,
     /// Target of the relocation.
     target: Mir.Inst.Index,
-    /// Offset from the target instruction.
+    /// Offset from the target.
     target_offset: i32,
-    /// Length of the instruction.
-    length: u5,
 };
 
 const TableReloc = struct {
src/arch/x86_64/Encoding.zig
@@ -304,20 +304,20 @@ pub const Mnemonic = enum {
     jnc, jne, jng, jnge, jnl, jnle, jno, jnp, jns, jnz, jo, jp, jpe, jpo, jrcxz, js, jz,
     lahf, lar, lea, leave, lfence, lgdt, lidt, lldt, lmsw, loop, loope, loopne,
     lods, lodsb, lodsd, lodsq, lodsw,
-    lsl, ltr, lzcnt,
+    lsl, ltr,
     mfence, mov, movbe,
     movs, movsb, movsd, movsq, movsw,
     movsx, movsxd, movzx, mul,
     neg, nop, not,
     @"or", out, outs, outsb, outsd, outsw,
-    pause, pop, popcnt, popf, popfd, popfq, push, pushfq,
+    pause, pop, popf, popfd, popfq, push, pushfq,
     rcl, rcr,
     rdfsbase, rdgsbase, rdmsr, rdpid, rdpkru, rdpmc, rdrand, rdseed, rdssd, rdssq, rdtsc, rdtscp,
-    ret, rol, ror, rorx, rsm,
-    sahf, sal, sar, sarx, sbb,
+    ret, rol, ror, rsm,
+    sahf, sal, sar, sbb,
     scas, scasb, scasd, scasq, scasw,
     senduipi, serialize,
-    shl, shld, shlx, shr, shrd, shrx,
+    shl, shld, shr, shrd,
     stac, stc, std, sti, str, stui,
     sub, swapgs, syscall, sysenter, sysexit, sysret,
     seta, setae, setb, setbe, setc, sete, setg, setge, setl, setle, setna, setnae,
@@ -433,6 +433,8 @@ pub const Mnemonic = enum {
     roundpd, roundps, roundsd, roundss,
     // SSE4.2
     crc32, pcmpgtq,
+    // ABM
+    lzcnt, popcnt,
     // PCLMUL
     pclmulqdq,
     // AES
@@ -440,7 +442,6 @@ pub const Mnemonic = enum {
     // SHA
     sha1rnds4, sha1nexte, sha1msg1, sha1msg2, sha256msg1, sha256msg2, sha256rnds2,
     // AVX
-    andn, bextr, blsi, blsmsk, blsr, bzhi, tzcnt,
     vaddpd, vaddps, vaddsd, vaddss, vaddsubpd, vaddsubps,
     vaesdec, vaesdeclast, vaesenc, vaesenclast, vaesimc, vaeskeygenassist,
     vandnpd, vandnps, vandpd, vandps,
@@ -506,6 +507,10 @@ pub const Mnemonic = enum {
     vtestpd, vtestps,
     vucomisd, vucomiss, vunpckhpd, vunpckhps, vunpcklpd, vunpcklps,
     vxorpd, vxorps,
+    // BMI
+    andn, bextr, blsi, blsmsk, blsr, tzcnt,
+    // BMI2
+    bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx,
     // F16C
     vcvtph2ps, vcvtps2ph,
     // FMA
src/arch/x86_64/encodings.zig
@@ -405,9 +405,9 @@ pub const table = [_]Entry{
     .{ .jb,    .d, &.{ .rel32 }, &.{ 0x0f, 0x82 }, 0, .none,  .none     },
     .{ .jbe,   .d, &.{ .rel32 }, &.{ 0x0f, 0x86 }, 0, .none,  .none     },
     .{ .jc,    .d, &.{ .rel32 }, &.{ 0x0f, 0x82 }, 0, .none,  .none     },
-    .{ .jcxz,  .d, &.{ .rel32 }, &.{ 0xe3       }, 0, .short, .@"32bit" },
-    .{ .jecxz, .d, &.{ .rel32 }, &.{ 0xe3       }, 0, .none,  .@"32bit" },
-    .{ .jrcxz, .d, &.{ .rel32 }, &.{ 0xe3       }, 0, .none,  .@"64bit" },
+    .{ .jcxz,  .d, &.{ .rel8  }, &.{ 0xe3       }, 0, .short, .@"32bit" },
+    .{ .jecxz, .d, &.{ .rel8  }, &.{ 0xe3       }, 0, .none,  .@"32bit" },
+    .{ .jrcxz, .d, &.{ .rel8  }, &.{ 0xe3       }, 0, .none,  .@"64bit" },
     .{ .je,    .d, &.{ .rel32 }, &.{ 0x0f, 0x84 }, 0, .none,  .none     },
     .{ .jg,    .d, &.{ .rel32 }, &.{ 0x0f, 0x8f }, 0, .none,  .none     },
     .{ .jge,   .d, &.{ .rel32 }, &.{ 0x0f, 0x8d }, 0, .none,  .none     },
@@ -477,10 +477,6 @@ pub const table = [_]Entry{
 
     .{ .ltr, .m, &.{ .rm16 }, &.{ 0x0f, 0x00 }, 3, .none, .none },
 
-    .{ .lzcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .short, .lzcnt },
-    .{ .lzcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .none,  .lzcnt },
-    .{ .lzcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .long,  .lzcnt },
-
     .{ .mfence, .z, &.{}, &.{ 0x0f, 0xae, 0xf0 }, 0, .none, .none },
 
     .{ .mov, .mr, &.{ .rm8,     .r8      }, &.{ 0x88 }, 0, .none,  .none },
@@ -630,10 +626,6 @@ pub const table = [_]Entry{
     .{ .pop, .m, &.{ .rm16 }, &.{ 0x8f }, 0, .short, .none },
     .{ .pop, .m, &.{ .rm64 }, &.{ 0x8f }, 0, .none,  .none },
 
-    .{ .popcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .short, .popcnt },
-    .{ .popcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .none,  .popcnt },
-    .{ .popcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .long,  .popcnt },
-
     .{ .popf,  .z, &.{}, &.{ 0x9d }, 0, .short, .none },
     .{ .popfd, .z, &.{}, &.{ 0x9d }, 0, .none,  .@"32bit" },
     .{ .popfq, .z, &.{}, &.{ 0x9d }, 0, .none,  .@"64bit" },
@@ -1738,6 +1730,15 @@ pub const table = [_]Entry{
 
     .{ .pcmpgtq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x37 }, 0, .none, .sse4_2 },
 
+    // ABM
+    .{ .lzcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .short, .lzcnt },
+    .{ .lzcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .none,  .lzcnt },
+    .{ .lzcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .long,  .lzcnt },
+
+    .{ .popcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .short, .popcnt },
+    .{ .popcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .none,  .popcnt },
+    .{ .popcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .long,  .popcnt },
+
     // PCLMUL
     .{ .pclmulqdq, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x44 }, 0, .none, .pclmul },
 
@@ -1771,38 +1772,6 @@ pub const table = [_]Entry{
     .{ .sha256msg2, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x38, 0xcd }, 0, .none, .sha },
 
     // AVX
-    .{ .andn, .rvm, &.{ .r32, .r32, .rm32 }, &.{ 0x0f, 0x38, 0xf2 }, 0, .vex_lz_w0, .bmi },
-    .{ .andn, .rvm, &.{ .r64, .r64, .rm64 }, &.{ 0x0f, 0x38, 0xf2 }, 0, .vex_lz_w1, .bmi },
-
-    .{ .bextr, .rmv, &.{ .r32, .rm32, .r32 }, &.{ 0x0f, 0x38, 0xf7 }, 0, .vex_lz_w0, .bmi },
-    .{ .bextr, .rmv, &.{ .r64, .rm64, .r64 }, &.{ 0x0f, 0x38, 0xf7 }, 0, .vex_lz_w1, .bmi },
-
-    .{ .blsi, .vm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x38, 0xf3 }, 3, .vex_lz_w0, .bmi },
-    .{ .blsi, .vm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x38, 0xf3 }, 3, .vex_lz_w1, .bmi },
-
-    .{ .blsmsk, .vm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x38, 0xf3 }, 2, .vex_lz_w0, .bmi },
-    .{ .blsmsk, .vm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x38, 0xf3 }, 2, .vex_lz_w1, .bmi },
-
-    .{ .blsr, .vm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x38, 0xf3 }, 1, .vex_lz_w0, .bmi },
-    .{ .blsr, .vm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x38, 0xf3 }, 1, .vex_lz_w1, .bmi },
-
-    .{ .bzhi, .rmv, &.{ .r32, .rm32, .r32 }, &.{ 0x0f, 0x38, 0xf5 }, 0, .vex_lz_w0, .bmi2 },
-    .{ .bzhi, .rmv, &.{ .r64, .rm64, .r64 }, &.{ 0x0f, 0x38, 0xf5 }, 0, .vex_lz_w1, .bmi2 },
-
-    .{ .rorx, .rmi, &.{ .r32, .rm32, .imm8 }, &.{ 0xf2, 0x0f, 0x3a }, 0, .vex_lz_w0, .bmi2 },
-    .{ .rorx, .rmi, &.{ .r64, .rm64, .imm8 }, &.{ 0xf2, 0x0f, 0x3a }, 0, .vex_lz_w1, .bmi2 },
-
-    .{ .sarx, .rmv, &.{ .r32, .rm32, .r32 }, &.{ 0xf3, 0x0f, 0x38, 0xf7 }, 0, .vex_lz_w0, .bmi2 },
-    .{ .shlx, .rmv, &.{ .r32, .rm32, .r32 }, &.{ 0x66, 0x0f, 0x38, 0xf7 }, 0, .vex_lz_w0, .bmi2 },
-    .{ .shrx, .rmv, &.{ .r32, .rm32, .r32 }, &.{ 0xf2, 0x0f, 0x38, 0xf7 }, 0, .vex_lz_w0, .bmi2 },
-    .{ .sarx, .rmv, &.{ .r64, .rm64, .r64 }, &.{ 0xf3, 0x0f, 0x38, 0xf7 }, 0, .vex_lz_w1, .bmi2 },
-    .{ .shlx, .rmv, &.{ .r64, .rm64, .r64 }, &.{ 0x66, 0x0f, 0x38, 0xf7 }, 0, .vex_lz_w1, .bmi2 },
-    .{ .shrx, .rmv, &.{ .r64, .rm64, .r64 }, &.{ 0xf2, 0x0f, 0x38, 0xf7 }, 0, .vex_lz_w1, .bmi2 },
-
-    .{ .tzcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .short, .bmi },
-    .{ .tzcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .none,  .bmi },
-    .{ .tzcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .long,  .bmi },
-
     .{ .vaddpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x58 }, 0, .vex_128_wig, .avx },
     .{ .vaddpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x58 }, 0, .vex_256_wig, .avx },
 
@@ -2307,6 +2276,49 @@ pub const table = [_]Entry{
     .{ .vxorps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x57 }, 0, .vex_128_wig, .avx },
     .{ .vxorps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x57 }, 0, .vex_256_wig, .avx },
 
+    // BMI
+    .{ .andn, .rvm, &.{ .r32, .r32, .rm32 }, &.{ 0x0f, 0x38, 0xf2 }, 0, .vex_lz_w0, .bmi },
+    .{ .andn, .rvm, &.{ .r64, .r64, .rm64 }, &.{ 0x0f, 0x38, 0xf2 }, 0, .vex_lz_w1, .bmi },
+
+    .{ .bextr, .rmv, &.{ .r32, .rm32, .r32 }, &.{ 0x0f, 0x38, 0xf7 }, 0, .vex_lz_w0, .bmi },
+    .{ .bextr, .rmv, &.{ .r64, .rm64, .r64 }, &.{ 0x0f, 0x38, 0xf7 }, 0, .vex_lz_w1, .bmi },
+
+    .{ .blsi, .vm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x38, 0xf3 }, 3, .vex_lz_w0, .bmi },
+    .{ .blsi, .vm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x38, 0xf3 }, 3, .vex_lz_w1, .bmi },
+
+    .{ .blsmsk, .vm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x38, 0xf3 }, 2, .vex_lz_w0, .bmi },
+    .{ .blsmsk, .vm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x38, 0xf3 }, 2, .vex_lz_w1, .bmi },
+
+    .{ .blsr, .vm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x38, 0xf3 }, 1, .vex_lz_w0, .bmi },
+    .{ .blsr, .vm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x38, 0xf3 }, 1, .vex_lz_w1, .bmi },
+
+    .{ .tzcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .short, .bmi },
+    .{ .tzcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .none,  .bmi },
+    .{ .tzcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .long,  .bmi },
+
+    // BMI2
+    .{ .bzhi, .rmv, &.{ .r32, .rm32, .r32 }, &.{ 0x0f, 0x38, 0xf5 }, 0, .vex_lz_w0, .bmi2 },
+    .{ .bzhi, .rmv, &.{ .r64, .rm64, .r64 }, &.{ 0x0f, 0x38, 0xf5 }, 0, .vex_lz_w1, .bmi2 },
+
+    .{ .mulx, .rvm, &.{ .r32, .r32, .rm32 }, &.{ 0xf2, 0x0f, 0x38, 0xf6 }, 0, .vex_lz_w0, .bmi2 },
+    .{ .mulx, .rvm, &.{ .r64, .r64, .rm64 }, &.{ 0xf2, 0x0f, 0x38, 0xf6 }, 0, .vex_lz_w1, .bmi2 },
+
+    .{ .pdep, .rvm, &.{ .r32, .r32, .rm32 }, &.{ 0xf2, 0x0f, 0x38, 0xf5 }, 0, .vex_lz_w0, .bmi2 },
+    .{ .pdep, .rvm, &.{ .r64, .r64, .rm64 }, &.{ 0xf2, 0x0f, 0x38, 0xf5 }, 0, .vex_lz_w1, .bmi2 },
+
+    .{ .pext, .rvm, &.{ .r32, .r32, .rm32 }, &.{ 0xf3, 0x0f, 0x38, 0xf5 }, 0, .vex_lz_w0, .bmi2 },
+    .{ .pext, .rvm, &.{ .r64, .r64, .rm64 }, &.{ 0xf3, 0x0f, 0x38, 0xf5 }, 0, .vex_lz_w1, .bmi2 },
+
+    .{ .rorx, .rmi, &.{ .r32, .rm32, .imm8 }, &.{ 0xf2, 0x0f, 0x3a }, 0, .vex_lz_w0, .bmi2 },
+    .{ .rorx, .rmi, &.{ .r64, .rm64, .imm8 }, &.{ 0xf2, 0x0f, 0x3a }, 0, .vex_lz_w1, .bmi2 },
+
+    .{ .sarx, .rmv, &.{ .r32, .rm32, .r32 }, &.{ 0xf3, 0x0f, 0x38, 0xf7 }, 0, .vex_lz_w0, .bmi2 },
+    .{ .shlx, .rmv, &.{ .r32, .rm32, .r32 }, &.{ 0x66, 0x0f, 0x38, 0xf7 }, 0, .vex_lz_w0, .bmi2 },
+    .{ .shrx, .rmv, &.{ .r32, .rm32, .r32 }, &.{ 0xf2, 0x0f, 0x38, 0xf7 }, 0, .vex_lz_w0, .bmi2 },
+    .{ .sarx, .rmv, &.{ .r64, .rm64, .r64 }, &.{ 0xf3, 0x0f, 0x38, 0xf7 }, 0, .vex_lz_w1, .bmi2 },
+    .{ .shlx, .rmv, &.{ .r64, .rm64, .r64 }, &.{ 0x66, 0x0f, 0x38, 0xf7 }, 0, .vex_lz_w1, .bmi2 },
+    .{ .shrx, .rmv, &.{ .r64, .rm64, .r64 }, &.{ 0xf2, 0x0f, 0x38, 0xf7 }, 0, .vex_lz_w1, .bmi2 },
+
     // F16C
     .{ .vcvtph2ps, .rm, &.{ .xmm, .xmm_m64  }, &.{ 0x66, 0x0f, 0x38, 0x13 }, 0, .vex_128_w0, .f16c },
     .{ .vcvtph2ps, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x13 }, 0, .vex_256_w0, .f16c },
src/arch/x86_64/Lower.zig
@@ -10,32 +10,38 @@ mir: Mir,
 cc: std.builtin.CallingConvention,
 err_msg: ?*Zcu.ErrorMsg = null,
 src_loc: Zcu.LazySrcLoc,
-result_insts_len: u8 = undefined,
-result_relocs_len: u8 = undefined,
-result_insts: [
-    @max(
-        1, // non-pseudo instructions
-        3, // (ELF only) TLS local dynamic (LD) sequence in PIC mode
-        2, // cmovcc: cmovcc \ cmovcc
-        3, // setcc: setcc \ setcc \ logicop
-        2, // jcc: jcc \ jcc
-        pseudo_probe_align_insts,
-        pseudo_probe_adjust_unrolled_max_insts,
-        pseudo_probe_adjust_setup_insts,
-        pseudo_probe_adjust_loop_insts,
-        abi.Win64.callee_preserved_regs.len * 2, // push_regs/pop_regs
-        abi.SysV.callee_preserved_regs.len * 2, // push_regs/pop_regs
-    )
-]Instruction = undefined,
-result_relocs: [
-    @max(
-        1, // jmp/jcc/call/mov/lea: jmp/jcc/call/mov/lea
-        2, // jcc: jcc \ jcc
-        2, // test \ jcc \ probe \ sub \ jmp
-        1, // probe \ sub \ jcc
-        3, // (ELF only) TLS local dynamic (LD) sequence in PIC mode
-    )
-]Reloc = undefined,
+result_insts_len: ResultInstIndex = undefined,
+result_insts: [max_result_insts]Instruction = undefined,
+result_relocs_len: ResultRelocIndex = undefined,
+result_relocs: [max_result_relocs]Reloc = undefined,
+
+const max_result_insts = @max(
+    1, // non-pseudo instructions
+    3, // (ELF only) TLS local dynamic (LD) sequence in PIC mode
+    2, // cmovcc: cmovcc \ cmovcc
+    3, // setcc: setcc \ setcc \ logicop
+    2, // jcc: jcc \ jcc
+    pseudo_probe_align_insts,
+    pseudo_probe_adjust_unrolled_max_insts,
+    pseudo_probe_adjust_setup_insts,
+    pseudo_probe_adjust_loop_insts,
+    abi.Win64.callee_preserved_regs.len * 2, // push_regs/pop_regs
+    abi.SysV.callee_preserved_regs.len * 2, // push_regs/pop_regs
+);
+const max_result_relocs = @max(
+    1, // jmp/jcc/call/mov/lea: jmp/jcc/call/mov/lea
+    2, // jcc: jcc \ jcc
+    2, // test \ jcc \ probe \ sub \ jmp
+    1, // probe \ sub \ jcc
+    3, // (ELF only) TLS local dynamic (LD) sequence in PIC mode
+);
+
+const ResultInstIndex = std.math.IntFittingRange(0, max_result_insts - 1);
+const ResultRelocIndex = std.math.IntFittingRange(0, max_result_relocs - 1);
+const InstOpIndex = std.math.IntFittingRange(
+    0,
+    @typeInfo(@FieldType(Instruction, "ops")).array.len - 1,
+);
 
 pub const pseudo_probe_align_insts = 5; // test \ jcc \ probe \ sub \ jmp
 pub const pseudo_probe_adjust_unrolled_max_insts =
@@ -51,7 +57,8 @@ pub const Error = error{
 };
 
 pub const Reloc = struct {
-    lowered_inst_index: u8,
+    lowered_inst_index: ResultInstIndex,
+    op_index: InstOpIndex,
     target: Target,
     off: i32,
 
@@ -114,11 +121,11 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
                 assert(inst.data.rx.fixes == ._);
                 try lower.emit(.none, .cmovnz, &.{
                     .{ .reg = inst.data.rx.r1 },
-                    .{ .mem = lower.mem(inst.data.rx.payload) },
+                    .{ .mem = lower.mem(1, inst.data.rx.payload) },
                 });
                 try lower.emit(.none, .cmovp, &.{
                     .{ .reg = inst.data.rx.r1 },
-                    .{ .mem = lower.mem(inst.data.rx.payload) },
+                    .{ .mem = lower.mem(1, inst.data.rx.payload) },
                 });
             },
             .pseudo_set_z_and_np_r => {
@@ -137,13 +144,13 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
             .pseudo_set_z_and_np_m => {
                 assert(inst.data.rx.fixes == ._);
                 try lower.emit(.none, .setz, &.{
-                    .{ .mem = lower.mem(inst.data.rx.payload) },
+                    .{ .mem = lower.mem(0, inst.data.rx.payload) },
                 });
                 try lower.emit(.none, .setnp, &.{
                     .{ .reg = inst.data.rx.r1 },
                 });
                 try lower.emit(.none, .@"and", &.{
-                    .{ .mem = lower.mem(inst.data.rx.payload) },
+                    .{ .mem = lower.mem(0, inst.data.rx.payload) },
                     .{ .reg = inst.data.rx.r1 },
                 });
             },
@@ -163,32 +170,32 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
             .pseudo_set_nz_or_p_m => {
                 assert(inst.data.rx.fixes == ._);
                 try lower.emit(.none, .setnz, &.{
-                    .{ .mem = lower.mem(inst.data.rx.payload) },
+                    .{ .mem = lower.mem(0, inst.data.rx.payload) },
                 });
                 try lower.emit(.none, .setp, &.{
                     .{ .reg = inst.data.rx.r1 },
                 });
                 try lower.emit(.none, .@"or", &.{
-                    .{ .mem = lower.mem(inst.data.rx.payload) },
+                    .{ .mem = lower.mem(0, inst.data.rx.payload) },
                     .{ .reg = inst.data.rx.r1 },
                 });
             },
             .pseudo_j_z_and_np_inst => {
                 assert(inst.data.inst.fixes == ._);
                 try lower.emit(.none, .jnz, &.{
-                    .{ .imm = lower.reloc(.{ .inst = index + 1 }, 0) },
+                    .{ .imm = lower.reloc(0, .{ .inst = index + 1 }, 0) },
                 });
                 try lower.emit(.none, .jnp, &.{
-                    .{ .imm = lower.reloc(.{ .inst = inst.data.inst.inst }, 0) },
+                    .{ .imm = lower.reloc(0, .{ .inst = inst.data.inst.inst }, 0) },
                 });
             },
             .pseudo_j_nz_or_p_inst => {
                 assert(inst.data.inst.fixes == ._);
                 try lower.emit(.none, .jnz, &.{
-                    .{ .imm = lower.reloc(.{ .inst = inst.data.inst.inst }, 0) },
+                    .{ .imm = lower.reloc(0, .{ .inst = inst.data.inst.inst }, 0) },
                 });
                 try lower.emit(.none, .jp, &.{
-                    .{ .imm = lower.reloc(.{ .inst = inst.data.inst.inst }, 0) },
+                    .{ .imm = lower.reloc(0, .{ .inst = inst.data.inst.inst }, 0) },
                 });
             },
 
@@ -198,7 +205,7 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
                     .{ .imm = .s(@bitCast(inst.data.ri.i)) },
                 });
                 try lower.emit(.none, .jz, &.{
-                    .{ .imm = lower.reloc(.{ .inst = index + 1 }, 0) },
+                    .{ .imm = lower.reloc(0, .{ .inst = index + 1 }, 0) },
                 });
                 try lower.emit(.none, .lea, &.{
                     .{ .reg = inst.data.ri.r1 },
@@ -214,7 +221,7 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
                     .{ .reg = inst.data.ri.r1.to32() },
                 });
                 try lower.emit(.none, .jmp, &.{
-                    .{ .imm = lower.reloc(.{ .inst = index }, 0) },
+                    .{ .imm = lower.reloc(0, .{ .inst = index }, 0) },
                 });
                 assert(lower.result_insts_len == pseudo_probe_align_insts);
             },
@@ -260,7 +267,7 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
                     .{ .imm = .s(page_size) },
                 });
                 try lower.emit(.none, .jae, &.{
-                    .{ .imm = lower.reloc(.{ .inst = index }, 0) },
+                    .{ .imm = lower.reloc(0, .{ .inst = index }, 0) },
                 });
                 assert(lower.result_insts_len == pseudo_probe_adjust_loop_insts);
             },
@@ -382,21 +389,22 @@ pub fn imm(lower: *const Lower, ops: Mir.Inst.Ops, i: u32) Immediate {
     };
 }
 
-pub fn mem(lower: *Lower, payload: u32) Memory {
+pub fn mem(lower: *Lower, op_index: InstOpIndex, payload: u32) Memory {
     var m = lower.mir.resolveFrameLoc(lower.mir.extraData(Mir.Memory, payload).data).decode();
     switch (m) {
         .sib => |*sib| switch (sib.base) {
             else => {},
-            .table => sib.disp = lower.reloc(.table, sib.disp).signed,
+            .table => sib.disp = lower.reloc(op_index, .table, sib.disp).signed,
         },
         else => {},
     }
     return m;
 }
 
-fn reloc(lower: *Lower, target: Reloc.Target, off: i32) Immediate {
+fn reloc(lower: *Lower, op_index: InstOpIndex, target: Reloc.Target, off: i32) Immediate {
     lower.result_relocs[lower.result_relocs_len] = .{
         .lowered_inst_index = lower.result_insts_len,
+        .op_index = op_index,
         .target = target,
         .off = off,
     };
@@ -409,7 +417,7 @@ fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand)
     var emit_mnemonic = mnemonic;
     var emit_ops_storage: [4]Operand = undefined;
     const emit_ops = emit_ops_storage[0..ops.len];
-    for (emit_ops, ops) |*emit_op, op| {
+    for (emit_ops, ops, 0..) |*emit_op, op, op_index| {
         emit_op.* = switch (op) {
             else => op,
             .mem => |mem_op| switch (mem_op.base()) {
@@ -428,20 +436,20 @@ fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand)
                             if (lower.pic) {
                                 // Here, we currently assume local dynamic TLS vars, and so
                                 // we emit LD model.
-                                _ = lower.reloc(.{ .linker_tlsld = sym_index }, 0);
+                                _ = lower.reloc(1, .{ .linker_tlsld = sym_index }, 0);
                                 lower.result_insts[lower.result_insts_len] = try .new(.none, .lea, &.{
                                     .{ .reg = .rdi },
                                     .{ .mem = Memory.initRip(.none, 0) },
                                 }, lower.target);
                                 lower.result_insts_len += 1;
-                                _ = lower.reloc(.{
+                                _ = lower.reloc(0, .{
                                     .linker_extern_fn = try elf_file.getGlobalSymbol("__tls_get_addr", null),
                                 }, 0);
                                 lower.result_insts[lower.result_insts_len] = try .new(.none, .call, &.{
                                     .{ .imm = .s(0) },
                                 }, lower.target);
                                 lower.result_insts_len += 1;
-                                _ = lower.reloc(.{ .linker_dtpoff = sym_index }, 0);
+                                _ = lower.reloc(@intCast(op_index), .{ .linker_dtpoff = sym_index }, 0);
                                 emit_mnemonic = .lea;
                                 break :op .{ .mem = Memory.initSib(.none, .{
                                     .base = .{ .reg = .rax },
@@ -454,7 +462,7 @@ fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand)
                                     .{ .mem = Memory.initSib(.qword, .{ .base = .{ .reg = .fs } }) },
                                 }, lower.target);
                                 lower.result_insts_len += 1;
-                                _ = lower.reloc(.{ .linker_reloc = sym_index }, 0);
+                                _ = lower.reloc(@intCast(op_index), .{ .linker_reloc = sym_index }, 0);
                                 emit_mnemonic = .lea;
                                 break :op .{ .mem = Memory.initSib(.none, .{
                                     .base = .{ .reg = .rax },
@@ -463,15 +471,17 @@ fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand)
                             }
                         }
 
-                        _ = lower.reloc(.{ .linker_reloc = sym_index }, 0);
                         if (lower.pic) switch (mnemonic) {
-                            .lea => if (elf_sym.flags.is_extern_ptr) {
+                            .lea => {
+                                _ = lower.reloc(@intCast(op_index), .{ .linker_reloc = sym_index }, 0);
+                                if (!elf_sym.flags.is_extern_ptr) break :op .{ .mem = Memory.initRip(.none, 0) };
                                 emit_mnemonic = .mov;
                                 break :op .{ .mem = Memory.initRip(.ptr, 0) };
-                            } else break :op .{ .mem = Memory.initRip(.none, 0) },
+                            },
                             .mov => {
                                 if (elf_sym.flags.is_extern_ptr) {
                                     const reg = ops[0].reg;
+                                    _ = lower.reloc(1, .{ .linker_reloc = sym_index }, 0);
                                     lower.result_insts[lower.result_insts_len] = try .new(.none, .mov, &.{
                                         .{ .reg = reg.to64() },
                                         .{ .mem = Memory.initRip(.qword, 0) },
@@ -481,10 +491,13 @@ fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand)
                                         .reg = reg.to64(),
                                     } }) };
                                 }
+                                _ = lower.reloc(@intCast(op_index), .{ .linker_reloc = sym_index }, 0);
                                 break :op .{ .mem = Memory.initRip(mem_op.sib.ptr_size, 0) };
                             },
                             else => unreachable,
-                        } else switch (mnemonic) {
+                        };
+                        _ = lower.reloc(@intCast(op_index), .{ .linker_reloc = sym_index }, 0);
+                        switch (mnemonic) {
                             .call => break :op .{ .mem = Memory.initSib(mem_op.sib.ptr_size, .{
                                 .base = .{ .reg = .ds },
                             }) },
@@ -502,7 +515,7 @@ fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand)
                         const macho_sym = zo.symbols.items[sym_index];
 
                         if (macho_sym.flags.tlv) {
-                            _ = lower.reloc(.{ .linker_reloc = sym_index }, 0);
+                            _ = lower.reloc(1, .{ .linker_reloc = sym_index }, 0);
                             lower.result_insts[lower.result_insts_len] = try .new(.none, .mov, &.{
                                 .{ .reg = .rdi },
                                 .{ .mem = Memory.initRip(.ptr, 0) },
@@ -516,15 +529,17 @@ fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand)
                             break :op .{ .reg = .rax };
                         }
 
-                        _ = lower.reloc(.{ .linker_reloc = sym_index }, 0);
                         break :op switch (mnemonic) {
-                            .lea => if (macho_sym.flags.is_extern_ptr) {
+                            .lea => {
+                                _ = lower.reloc(@intCast(op_index), .{ .linker_reloc = sym_index }, 0);
+                                if (!macho_sym.flags.is_extern_ptr) break :op .{ .mem = Memory.initRip(.none, 0) };
                                 emit_mnemonic = .mov;
                                 break :op .{ .mem = Memory.initRip(.ptr, 0) };
-                            } else break :op .{ .mem = Memory.initRip(.none, 0) },
+                            },
                             .mov => {
                                 if (macho_sym.flags.is_extern_ptr) {
                                     const reg = ops[0].reg;
+                                    _ = lower.reloc(1, .{ .linker_reloc = sym_index }, 0);
                                     lower.result_insts[lower.result_insts_len] = try .new(.none, .mov, &.{
                                         .{ .reg = reg.to64() },
                                         .{ .mem = Memory.initRip(.qword, 0) },
@@ -534,6 +549,7 @@ fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand)
                                         .reg = reg.to64(),
                                     } }) };
                                 }
+                                _ = lower.reloc(@intCast(op_index), .{ .linker_reloc = sym_index }, 0);
                                 break :op .{ .mem = Memory.initRip(mem_op.sib.ptr_size, 0) };
                             },
                             else => unreachable,
@@ -550,7 +566,7 @@ fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand)
 }
 
 fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
-    @setEvalBranchQuota(2_400);
+    @setEvalBranchQuota(2_500);
     const fixes = switch (inst.ops) {
         .none => inst.data.none.fixes,
         .inst => inst.data.inst.fixes,
@@ -595,7 +611,7 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
     }, switch (inst.ops) {
         .none => &.{},
         .inst => &.{
-            .{ .imm = lower.reloc(.{ .inst = inst.data.inst.inst }, 0) },
+            .{ .imm = lower.reloc(0, .{ .inst = inst.data.inst.inst }, 0) },
         },
         .i_s, .i_u => &.{
             .{ .imm = lower.imm(inst.ops, inst.data.i.i) },
@@ -642,10 +658,10 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
             .{ .imm = lower.imm(inst.ops, inst.data.rri.i) },
         },
         .m => &.{
-            .{ .mem = lower.mem(inst.data.x.payload) },
+            .{ .mem = lower.mem(0, inst.data.x.payload) },
         },
         .mi_s, .mi_u => &.{
-            .{ .mem = lower.mem(inst.data.x.payload + 1) },
+            .{ .mem = lower.mem(0, inst.data.x.payload + 1) },
             .{ .imm = lower.imm(
                 inst.ops,
                 lower.mir.extraData(Mir.Imm32, inst.data.x.payload).data.imm,
@@ -653,64 +669,64 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
         },
         .rm => &.{
             .{ .reg = inst.data.rx.r1 },
-            .{ .mem = lower.mem(inst.data.rx.payload) },
+            .{ .mem = lower.mem(1, inst.data.rx.payload) },
         },
         .rmr => &.{
             .{ .reg = inst.data.rrx.r1 },
-            .{ .mem = lower.mem(inst.data.rrx.payload) },
+            .{ .mem = lower.mem(1, inst.data.rrx.payload) },
             .{ .reg = inst.data.rrx.r2 },
         },
         .rmi => &.{
             .{ .reg = inst.data.rix.r1 },
-            .{ .mem = lower.mem(inst.data.rix.payload) },
+            .{ .mem = lower.mem(1, inst.data.rix.payload) },
             .{ .imm = lower.imm(inst.ops, inst.data.rix.i) },
         },
         .rmi_s, .rmi_u => &.{
             .{ .reg = inst.data.rx.r1 },
-            .{ .mem = lower.mem(inst.data.rx.payload + 1) },
+            .{ .mem = lower.mem(1, inst.data.rx.payload + 1) },
             .{ .imm = lower.imm(
                 inst.ops,
                 lower.mir.extraData(Mir.Imm32, inst.data.rx.payload).data.imm,
             ) },
         },
         .mr => &.{
-            .{ .mem = lower.mem(inst.data.rx.payload) },
+            .{ .mem = lower.mem(0, inst.data.rx.payload) },
             .{ .reg = inst.data.rx.r1 },
         },
         .mrr => &.{
-            .{ .mem = lower.mem(inst.data.rrx.payload) },
+            .{ .mem = lower.mem(0, inst.data.rrx.payload) },
             .{ .reg = inst.data.rrx.r1 },
             .{ .reg = inst.data.rrx.r2 },
         },
         .mri => &.{
-            .{ .mem = lower.mem(inst.data.rix.payload) },
+            .{ .mem = lower.mem(0, inst.data.rix.payload) },
             .{ .reg = inst.data.rix.r1 },
             .{ .imm = lower.imm(inst.ops, inst.data.rix.i) },
         },
         .rrm => &.{
             .{ .reg = inst.data.rrx.r1 },
             .{ .reg = inst.data.rrx.r2 },
-            .{ .mem = lower.mem(inst.data.rrx.payload) },
+            .{ .mem = lower.mem(2, inst.data.rrx.payload) },
         },
         .rrmr => &.{
             .{ .reg = inst.data.rrrx.r1 },
             .{ .reg = inst.data.rrrx.r2 },
-            .{ .mem = lower.mem(inst.data.rrrx.payload) },
+            .{ .mem = lower.mem(2, inst.data.rrrx.payload) },
             .{ .reg = inst.data.rrrx.r3 },
         },
         .rrmi => &.{
             .{ .reg = inst.data.rrix.r1 },
             .{ .reg = inst.data.rrix.r2 },
-            .{ .mem = lower.mem(inst.data.rrix.payload) },
+            .{ .mem = lower.mem(2, inst.data.rrix.payload) },
             .{ .imm = lower.imm(inst.ops, inst.data.rrix.i) },
         },
         .extern_fn_reloc, .rel => &.{
-            .{ .imm = lower.reloc(.{ .linker_extern_fn = inst.data.reloc.sym_index }, inst.data.reloc.off) },
+            .{ .imm = lower.reloc(0, .{ .linker_extern_fn = inst.data.reloc.sym_index }, inst.data.reloc.off) },
         },
         .got_reloc, .direct_reloc, .import_reloc => ops: {
             const reg = inst.data.rx.r1;
             const extra = lower.mir.extraData(bits.SymbolOffset, inst.data.rx.payload).data;
-            _ = lower.reloc(switch (inst.ops) {
+            _ = lower.reloc(1, switch (inst.ops) {
                 .got_reloc => .{ .linker_got = extra.sym_index },
                 .direct_reloc => .{ .linker_direct = extra.sym_index },
                 .import_reloc => .{ .linker_import = extra.sym_index },
src/arch/x86_64/Mir.zig
@@ -100,6 +100,8 @@ pub const Inst = struct {
         /// ___ Division
         _d,
 
+        /// ___ Without Affecting Flags
+        _x,
         /// ___ Left
         _l,
         /// ___ Left Double
@@ -483,6 +485,7 @@ pub const Inst = struct {
         /// ASCII adjust al after subtraction
         aa,
         /// Add with carry
+        /// Unsigned integer addition of two operands with carry flag
         adc,
         /// Add
         /// Add packed integers
@@ -1162,10 +1165,8 @@ pub const Inst = struct {
         fmadd231,
 
         // ADX
-        /// Unsigned integer addition of two operands with carry flag
-        adcx,
         /// Unsigned integer addition of two operands with overflow flag
-        adox,
+        ado,
 
         // AESKLE
         /// Encode 128-bit key with key locker
test/behavior/x86_64/build.zig
@@ -93,6 +93,11 @@ pub fn build(b: *std.Build) void {
             .cpu_arch = .x86_64,
             .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64_v3 },
         },
+        .{
+            .cpu_arch = .x86_64,
+            .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64_v3 },
+            .cpu_features_add = std.Target.x86.featureSet(&.{.adx}),
+        },
         .{
             .cpu_arch = .x86_64,
             .cpu_model = .{ .explicit = &std.Target.x86.cpu.x86_64_v4 },
test/behavior/x86_64/math.zig
@@ -44,6 +44,17 @@ fn AddOneBit(comptime Type: type) type {
         .vector => |vector| @Vector(vector.len, ResultScalar),
     };
 }
+fn DoubleBits(comptime Type: type) type {
+    const ResultScalar = switch (@typeInfo(Scalar(Type))) {
+        .int => |int| @Type(.{ .int = .{ .signedness = int.signedness, .bits = int.bits * 2 } }),
+        .float => Scalar(Type),
+        else => @compileError(@typeName(Type)),
+    };
+    return switch (@typeInfo(Type)) {
+        else => ResultScalar,
+        .vector => |vector| @Vector(vector.len, ResultScalar),
+    };
+}
 // inline to avoid a runtime `@splat`
 inline fn splat(comptime Type: type, scalar: Scalar(Type)) Type {
     return switch (@typeInfo(Type)) {
@@ -16216,6 +16227,8 @@ fn binary(comptime op: anytype, comptime opts: struct { compare: Compare = .rela
             );
         }
         fn testInts() !void {
+            try testArgs(i4, 0x3, 0x2);
+            try testArgs(u4, 0xe, 0x6);
             try testArgs(i8, 0x48, 0x6c);
             try testArgs(u8, 0xbb, 0x43);
             try testArgs(i16, -0x0fdf, 0x302e);
@@ -18993,6 +19006,15 @@ test subUnsafe {
     try test_sub_unsafe.testFloatVectors();
 }
 
+inline fn mulUnsafe(comptime Type: type, lhs: Type, rhs: Type) DoubleBits(Type) {
+    @setRuntimeSafety(false);
+    return @as(DoubleBits(Type), lhs) * rhs;
+}
+test mulUnsafe {
+    const test_mul_unsafe = binary(mulUnsafe, .{});
+    try test_mul_unsafe.testInts();
+}
+
 inline fn multiply(comptime Type: type, lhs: Type, rhs: Type) @TypeOf(lhs * rhs) {
     if (@inComptime() and @typeInfo(Type) == .vector) {
         // workaround https://github.com/ziglang/zig/issues/22743