Commit 60cdacaff2

Jacob Young <jacobly0@users.noreply.github.com>
2025-09-11 14:40:17
x86_64: rewrite vector element pointer access
1 parent 2ba03e9
Changed files (1)
src
arch
src/arch/x86_64/CodeGen.zig
@@ -2291,7 +2291,7 @@ fn genBodyBlock(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
 }
 
 fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
-    @setEvalBranchQuota(29_400);
+    @setEvalBranchQuota(29_500);
     const pt = cg.pt;
     const zcu = pt.zcu;
     const ip = &zcu.intern_pool;
@@ -86774,52 +86774,313 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                 const is_non_err = try cg.tempInit(.bool, .{ .eflags = .e });
                 try is_non_err.finish(inst, &.{un_op}, &ops, cg);
             },
-            .load => fallback: {
+            .load => {
                 const ty_op = air_datas[@intFromEnum(inst)].ty_op;
                 const val_ty = ty_op.ty.toType();
-                const ptr_ty = cg.typeOf(ty_op.operand);
-                const ptr_info = ptr_ty.ptrInfo(zcu);
-                if (ptr_info.packed_offset.host_size > 0 and
-                    (ptr_info.flags.vector_index == .none or val_ty.toIntern() == .bool_type))
-                    break :fallback try cg.airLoad(inst);
                 var ops = try cg.tempsFromOperands(inst, .{ty_op.operand});
-                const res = try ops[0].load(val_ty, .{
-                    .disp = switch (ptr_info.flags.vector_index) {
-                        .none => 0,
-                        .runtime => unreachable,
-                        else => |vector_index| @intCast(val_ty.abiSize(zcu) * @intFromEnum(vector_index)),
+                var res: [1]Temp = undefined;
+                cg.select(&res, &.{val_ty}, &ops, comptime &.{ .{
+                    .src_constraints = .{ .{ .ptr_bool_vec_elem = .byte }, .any, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .none, .none } },
                     },
-                }, cg);
-                try res.finish(inst, &.{ty_op.operand}, &ops, cg);
+                    .extra_temps = .{
+                        .{ .type = .u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .general_purpose } } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .dst_temps = .{ .{ .cc = .c }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .lea(.src0b), ._, ._ },
+                        .{ ._, ._, .bt, .tmp0d, .ua(.src0, .add_vector_index), ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .ptr_bool_vec_elem = .word }, .any, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .none, .none } },
+                    },
+                    .dst_temps = .{ .{ .cc = .c }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .bt, .lea(.src0w), .ua(.src0, .add_vector_index), ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .ptr_any_bool_vec_elem, .any, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .none, .none } },
+                    },
+                    .dst_temps = .{ .{ .cc = .c }, .unused },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .bt, .leaa(.src0d, .add_vector_index_div_8_down_4), .ua(.src0, .add_vector_index_rem_32), ._, ._ },
+                    } },
+                } }) catch |err| switch (err) {
+                    error.SelectFailed => res[0] = try ops[0].load(val_ty, .{
+                        .disp = switch (cg.typeOf(ty_op.operand).ptrInfo(zcu).flags.vector_index) {
+                            .none => 0,
+                            .runtime => unreachable,
+                            else => |vector_index| @intCast(val_ty.abiSize(zcu) * @intFromEnum(vector_index)),
+                        },
+                    }, cg),
+                    else => |e| return e,
+                };
+                try res[0].finish(inst, &.{ty_op.operand}, &ops, cg);
             },
             .ret => try cg.airRet(inst, false),
             .ret_safe => try cg.airRet(inst, true),
             .ret_load => try cg.airRetLoad(inst),
-            .store, .store_safe => |air_tag| fallback: {
+            .store, .store_safe => |air_tag| {
                 const bin_op = air_datas[@intFromEnum(inst)].bin_op;
-                const ptr_ty = cg.typeOf(bin_op.lhs);
-                const ptr_info = ptr_ty.ptrInfo(zcu);
-                const val_ty = cg.typeOf(bin_op.rhs);
-                if (ptr_info.packed_offset.host_size > 0 and
-                    (ptr_info.flags.vector_index == .none or val_ty.toIntern() == .bool_type))
-                    break :fallback try cg.airStore(inst, switch (air_tag) {
-                        else => unreachable,
-                        .store => false,
-                        .store_safe => true,
-                    });
                 var ops = try cg.tempsFromOperands(inst, .{ bin_op.lhs, bin_op.rhs });
-                try ops[0].store(&ops[1], .{
-                    .disp = switch (ptr_info.flags.vector_index) {
-                        .none => 0,
-                        .runtime => unreachable,
-                        else => |vector_index| @intCast(val_ty.abiSize(zcu) * @intFromEnum(vector_index)),
+                cg.select(&.{}, &.{}, &ops, comptime &.{ .{
+                    .src_constraints = .{ .{ .ptr_bool_vec_elem = .byte }, .bool, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .{ .imm = 0 }, .none } },
                     },
-                    .safe = switch (air_tag) {
-                        else => unreachable,
-                        .store => false,
-                        .store_safe => true,
+                    .extra_temps = .{
+                        .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
                     },
-                }, cg);
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .lea(.src0b), ._, ._ },
+                        .{ ._, ._r, .bt, .tmp0d, .ua(.src0, .add_vector_index), ._, ._ },
+                        .{ ._, ._, .mov, .lea(.src0b), .tmp0b, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .ptr_bool_vec_elem = .byte }, .bool, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .{ .imm = 1 }, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .lea(.src0b), ._, ._ },
+                        .{ ._, ._s, .bt, .tmp0d, .ua(.src0, .add_vector_index), ._, ._ },
+                        .{ ._, ._, .mov, .lea(.src0b), .tmp0b, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .cmov, null, null, null },
+                    .src_constraints = .{ .{ .ptr_bool_vec_elem = .byte }, .bool, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .to_gpr, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .lea(.src0b), ._, ._ },
+                        .{ ._, ._, .mov, .tmp1d, .tmp0d, ._, ._ },
+                        .{ ._, ._r, .bt, .tmp1d, .ua(.src0, .add_vector_index), ._, ._ },
+                        .{ ._, ._s, .bt, .tmp0d, .ua(.src0, .add_vector_index), ._, ._ },
+                        .{ ._, ._, .@"test", .src1b, .si(1), ._, ._ },
+                        .{ ._, ._z, .cmov, .tmp0d, .tmp1d, ._, ._ },
+                        .{ ._, ._, .mov, .lea(.src0b), .tmp0b, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .ptr_bool_vec_elem = .byte }, .bool, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .to_gpr, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u8, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .lea(.src0b), ._, ._ },
+                        .{ ._, ._, .@"test", .src1b, .si(1), ._, ._ },
+                        .{ ._, ._nz, .j, .@"0f", ._, ._, ._ },
+                        .{ ._, ._r, .bt, .tmp0d, .ua(.src0, .add_vector_index), ._, ._ },
+                        .{ ._, ._mp, .j, .@"1f", ._, ._, ._ },
+                        .{ .@"0:", ._s, .bt, .tmp0d, .ua(.src0, .add_vector_index), ._, ._ },
+                        .{ .@"1:", ._, .mov, .lea(.src0b), .tmp0b, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .ptr_bool_vec_elem = .word }, .bool, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .{ .imm = 0 }, .none } },
+                    },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .bt, .lea(.src0w), .ua(.src0, .add_vector_index), ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .ptr_bool_vec_elem = .word }, .bool, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .{ .imm = 1 }, .none } },
+                    },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._s, .bt, .lea(.src0w), .ua(.src0, .add_vector_index), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .cmov, null, null, null },
+                    .src_constraints = .{ .{ .ptr_bool_vec_elem = .word }, .bool, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .to_gpr, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u16, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u16, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .movzx, .tmp0d, .lea(.src0w), ._, ._ },
+                        .{ ._, ._, .mov, .tmp1d, .tmp0d, ._, ._ },
+                        .{ ._, ._r, .bt, .tmp1d, .ua(.src0, .add_vector_index), ._, ._ },
+                        .{ ._, ._s, .bt, .tmp0d, .ua(.src0, .add_vector_index), ._, ._ },
+                        .{ ._, ._, .@"test", .src1b, .si(1), ._, ._ },
+                        .{ ._, ._z, .cmov, .tmp0d, .tmp1d, ._, ._ },
+                        .{ ._, ._, .mov, .lea(.src0w), .tmp0w, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .{ .ptr_bool_vec_elem = .word }, .bool, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .to_gpr, .none } },
+                    },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .@"test", .src1b, .si(1), ._, ._ },
+                        .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
+                        .{ ._, ._r, .bt, .lea(.src0w), .ua(.src0, .add_vector_index), ._, ._ },
+                        .{ ._, ._mp, .j, .@"0f", ._, ._, ._ },
+                        .{ .@"1:", ._s, .bt, .lea(.src0w), .ua(.src0, .add_vector_index), ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .ptr_any_bool_vec_elem, .bool, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .{ .imm = 0 }, .none } },
+                    },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._r, .bt, .leaa(.src0d, .add_vector_index_div_8_down_4), .ua(.src0, .add_vector_index_rem_32), ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .ptr_any_bool_vec_elem, .bool, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .{ .imm = 1 }, .none } },
+                    },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._s, .bt, .leaa(.src0d, .add_vector_index_div_8_down_4), .ua(.src0, .add_vector_index_rem_32), ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .cmov, null, null, null },
+                    .src_constraints = .{ .ptr_any_bool_vec_elem, .bool, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .to_gpr, .none } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .{ .type = .u32, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .mov, .tmp0d, .leaa(.src0d, .add_vector_index_div_8_down_4), ._, ._ },
+                        .{ ._, ._, .mov, .tmp1d, .tmp0d, ._, ._ },
+                        .{ ._, ._r, .bt, .tmp1d, .ua(.src0, .add_vector_index_rem_32), ._, ._ },
+                        .{ ._, ._s, .bt, .tmp0d, .ua(.src0, .add_vector_index_rem_32), ._, ._ },
+                        .{ ._, ._, .@"test", .src1b, .si(1), ._, ._ },
+                        .{ ._, ._z, .cmov, .tmp0d, .tmp1d, ._, ._ },
+                        .{ ._, ._, .mov, .leaa(.src0d, .add_vector_index_div_8_down_4), .tmp0d, ._, ._ },
+                    } },
+                }, .{
+                    .src_constraints = .{ .ptr_any_bool_vec_elem, .bool, .any },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .to_gpr, .none } },
+                    },
+                    .clobbers = .{ .eflags = true },
+                    .each = .{ .once = &.{
+                        .{ ._, ._, .@"test", .src1b, .si(1), ._, ._ },
+                        .{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
+                        .{ ._, ._r, .bt, .leaa(.src0d, .add_vector_index_div_8_down_4), .ua(.src0, .add_vector_index_rem_32), ._, ._ },
+                        .{ ._, ._mp, .j, .@"0f", ._, ._, ._ },
+                        .{ .@"1:", ._s, .bt, .leaa(.src0d, .add_vector_index_div_8_down_4), .ua(.src0, .add_vector_index_rem_32), ._, ._ },
+                    } },
+                } }) catch |err| switch (err) {
+                    error.SelectFailed => try ops[0].store(&ops[1], .{
+                        .disp = switch (cg.typeOf(bin_op.lhs).ptrInfo(zcu).flags.vector_index) {
+                            .none => 0,
+                            .runtime => unreachable,
+                            else => |vector_index| @intCast(cg.typeOf(bin_op.rhs).abiSize(zcu) * @intFromEnum(vector_index)),
+                        },
+                        .safe = switch (air_tag) {
+                            else => unreachable,
+                            .store => false,
+                            .store_safe => true,
+                        },
+                    }, cg),
+                    else => |e| return e,
+                };
                 for (ops) |op| try op.die(cg);
             },
             .unreach => {},
@@ -100863,7 +101124,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .dst_temps = .{ .{ .cc = .c }, .unused },
                     .clobbers = .{ .eflags = true },
                     .each = .{ .once = &.{
-                        .{ ._, ._, .bt, .src0d, .ua(.none, .add_src1_rem_32), ._, ._ },
+                        .{ ._, ._, .bt, .src0d, .ua(.none, .add_src1), ._, ._ },
                     } },
                 }, .{
                     .src_constraints = .{ .{ .bool_vec = .dword }, .any, .any },
@@ -100884,7 +101145,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .dst_temps = .{ .{ .cc = .c }, .unused },
                     .clobbers = .{ .eflags = true },
                     .each = .{ .once = &.{
-                        .{ ._, ._, .bt, .src0q, .ua(.none, .add_src1_rem_64), ._, ._ },
+                        .{ ._, ._, .bt, .src0q, .ua(.none, .add_src1), ._, ._ },
                     } },
                 }, .{
                     .required_features = .{ .@"64bit", null, null, null },
@@ -174481,114 +174742,6 @@ fn reuseOperandAdvanced(
     return true;
 }
 
-fn packedLoad(self: *CodeGen, dst_mcv: MCValue, ptr_ty: Type, ptr_mcv: MCValue) InnerError!void {
-    const pt = self.pt;
-    const zcu = pt.zcu;
-
-    const ptr_info = ptr_ty.ptrInfo(zcu);
-    const val_ty: Type = .fromInterned(ptr_info.child);
-    if (!val_ty.hasRuntimeBitsIgnoreComptime(zcu)) return;
-    const val_abi_size: u32 = @intCast(val_ty.abiSize(zcu));
-
-    const val_bit_size: u32 = @intCast(val_ty.bitSize(zcu));
-    const ptr_bit_off = ptr_info.packed_offset.bit_offset + switch (ptr_info.flags.vector_index) {
-        .none => 0,
-        .runtime => unreachable,
-        else => |vector_index| @intFromEnum(vector_index) * val_bit_size,
-    };
-    if (ptr_bit_off % 8 == 0) {
-        {
-            const mat_ptr_mcv: MCValue = switch (ptr_mcv) {
-                .immediate, .register, .register_offset, .lea_frame => ptr_mcv,
-                else => .{ .register = try self.copyToTmpRegister(ptr_ty, ptr_mcv) },
-            };
-            const mat_ptr_lock = switch (mat_ptr_mcv) {
-                .register => |mat_ptr_reg| self.register_manager.lockReg(mat_ptr_reg),
-                else => null,
-            };
-            defer if (mat_ptr_lock) |lock| self.register_manager.unlockReg(lock);
-
-            try self.load(dst_mcv, ptr_ty, mat_ptr_mcv.offset(@intCast(@divExact(ptr_bit_off, 8))));
-        }
-
-        if (val_abi_size * 8 > val_bit_size) {
-            if (dst_mcv.isRegister()) {
-                try self.truncateRegister(val_ty, dst_mcv.getReg().?);
-            } else {
-                const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
-                const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
-                defer self.register_manager.unlockReg(tmp_lock);
-
-                const hi_mcv = dst_mcv.address().offset(@intCast(val_bit_size / 64 * 8)).deref();
-                try self.genSetReg(tmp_reg, .usize, hi_mcv, .{});
-                try self.truncateRegister(val_ty, tmp_reg);
-                try self.genCopy(.usize, hi_mcv, .{ .register = tmp_reg }, .{});
-            }
-        }
-        return;
-    }
-
-    if (val_abi_size > 8) return self.fail("TODO implement packed load of {f}", .{val_ty.fmt(pt)});
-
-    const limb_abi_size: u31 = @min(val_abi_size, 8);
-    const limb_abi_bits = limb_abi_size * 8;
-    const val_byte_off: i32 = @intCast(ptr_bit_off / limb_abi_bits * limb_abi_size);
-    const val_bit_off = ptr_bit_off % limb_abi_bits;
-    const val_extra_bits = self.regExtraBits(val_ty);
-
-    const ptr_reg = try self.copyToTmpRegister(ptr_ty, ptr_mcv);
-    const ptr_lock = self.register_manager.lockRegAssumeUnused(ptr_reg);
-    defer self.register_manager.unlockReg(ptr_lock);
-
-    const dst_reg = switch (dst_mcv) {
-        .register => |reg| reg,
-        else => try self.register_manager.allocReg(null, abi.RegisterClass.gp),
-    };
-    const dst_lock = self.register_manager.lockReg(dst_reg);
-    defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
-
-    const load_abi_size =
-        if (val_bit_off < val_extra_bits) val_abi_size else val_abi_size * 2;
-    if (load_abi_size <= 8) {
-        const load_reg = registerAlias(dst_reg, load_abi_size);
-        try self.asmRegisterMemory(.{ ._, .mov }, load_reg, .{
-            .base = .{ .reg = ptr_reg },
-            .mod = .{ .rm = .{
-                .size = .fromSize(load_abi_size),
-                .disp = val_byte_off,
-            } },
-        });
-        try self.spillEflagsIfOccupied();
-        try self.asmRegisterImmediate(.{ ._r, .sh }, load_reg, .u(val_bit_off));
-    } else {
-        const tmp_reg =
-            registerAlias(try self.register_manager.allocReg(null, abi.RegisterClass.gp), val_abi_size);
-        const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
-        defer self.register_manager.unlockReg(tmp_lock);
-
-        const dst_alias = registerAlias(dst_reg, val_abi_size);
-        try self.asmRegisterMemory(.{ ._, .mov }, dst_alias, .{
-            .base = .{ .reg = ptr_reg },
-            .mod = .{ .rm = .{
-                .size = .fromSize(val_abi_size),
-                .disp = val_byte_off,
-            } },
-        });
-        try self.asmRegisterMemory(.{ ._, .mov }, tmp_reg, .{
-            .base = .{ .reg = ptr_reg },
-            .mod = .{ .rm = .{
-                .size = .fromSize(val_abi_size),
-                .disp = val_byte_off + limb_abi_size,
-            } },
-        });
-        try self.spillEflagsIfOccupied();
-        try self.asmRegisterRegisterImmediate(.{ ._rd, .sh }, dst_alias, tmp_reg, .u(val_bit_off));
-    }
-
-    if (val_extra_bits > 0) try self.truncateRegister(val_ty, dst_reg);
-    try self.genCopy(val_ty, dst_mcv, .{ .register = dst_reg }, .{});
-}
-
 fn load(self: *CodeGen, dst_mcv: MCValue, ptr_ty: Type, ptr_mcv: MCValue) InnerError!void {
     const pt = self.pt;
     const zcu = pt.zcu;
@@ -174636,174 +174789,6 @@ fn load(self: *CodeGen, dst_mcv: MCValue, ptr_ty: Type, ptr_mcv: MCValue) InnerE
     }
 }
 
-fn airLoad(self: *CodeGen, inst: Air.Inst.Index) !void {
-    const pt = self.pt;
-    const zcu = pt.zcu;
-    const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
-    const elem_ty = self.typeOfIndex(inst);
-    const result: MCValue = result: {
-        if (!elem_ty.hasRuntimeBitsIgnoreComptime(zcu)) break :result .none;
-
-        try self.spillRegisters(&.{ .rdi, .rsi, .rcx });
-        const reg_locks = self.register_manager.lockRegsAssumeUnused(3, .{ .rdi, .rsi, .rcx });
-        defer for (reg_locks) |lock| self.register_manager.unlockReg(lock);
-
-        const ptr_ty = self.typeOf(ty_op.operand);
-        const elem_size = elem_ty.abiSize(zcu);
-
-        const elem_rs = self.regSetForType(elem_ty);
-        const ptr_rs = self.regSetForType(ptr_ty);
-
-        const ptr_mcv = try self.resolveInst(ty_op.operand);
-        const dst_mcv = if (elem_size <= 8 and std.math.isPowerOfTwo(elem_size) and
-            elem_rs.supersetOf(ptr_rs) and self.reuseOperand(inst, ty_op.operand, 0, ptr_mcv))
-            // The MCValue that holds the pointer can be re-used as the value.
-            ptr_mcv
-        else
-            try self.allocRegOrMem(inst, true);
-
-        const ptr_info = ptr_ty.ptrInfo(zcu);
-        if (ptr_info.flags.vector_index != .none or ptr_info.packed_offset.host_size > 0) {
-            try self.packedLoad(dst_mcv, ptr_ty, ptr_mcv);
-        } else {
-            try self.load(dst_mcv, ptr_ty, ptr_mcv);
-        }
-
-        if (elem_ty.isAbiInt(zcu) and elem_size * 8 > elem_ty.bitSize(zcu)) {
-            const high_mcv: MCValue = switch (dst_mcv) {
-                .register => |dst_reg| .{ .register = dst_reg },
-                .register_pair => |dst_regs| .{ .register = dst_regs[1] },
-                else => dst_mcv.address().offset(@intCast((elem_size - 1) / 8 * 8)).deref(),
-            };
-            const high_reg = if (high_mcv.isRegister())
-                high_mcv.getReg().?
-            else
-                try self.copyToTmpRegister(.usize, high_mcv);
-            const high_lock = self.register_manager.lockReg(high_reg);
-            defer if (high_lock) |lock| self.register_manager.unlockReg(lock);
-
-            try self.truncateRegister(elem_ty, high_reg);
-            if (!high_mcv.isRegister()) try self.genCopy(
-                if (elem_size <= 8) elem_ty else .usize,
-                high_mcv,
-                .{ .register = high_reg },
-                .{},
-            );
-        }
-        break :result dst_mcv;
-    };
-    return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
-}
-
-fn packedStore(self: *CodeGen, ptr_ty: Type, ptr_mcv: MCValue, src_mcv: MCValue) InnerError!void {
-    const pt = self.pt;
-    const zcu = pt.zcu;
-    const ptr_info = ptr_ty.ptrInfo(zcu);
-    const src_ty: Type = .fromInterned(ptr_info.child);
-    if (!src_ty.hasRuntimeBitsIgnoreComptime(zcu)) return;
-
-    const limb_abi_size: u16 = @min(ptr_info.packed_offset.host_size, 8);
-    const limb_abi_bits = limb_abi_size * 8;
-    const limb_ty = try pt.intType(.unsigned, limb_abi_bits);
-
-    const src_bit_size = src_ty.bitSize(zcu);
-    const ptr_bit_off = ptr_info.packed_offset.bit_offset + switch (ptr_info.flags.vector_index) {
-        .none => 0,
-        .runtime => unreachable,
-        else => |vector_index| @intFromEnum(vector_index) * src_bit_size,
-    };
-    const src_byte_off: i32 = @intCast(ptr_bit_off / limb_abi_bits * limb_abi_size);
-    const src_bit_off = ptr_bit_off % limb_abi_bits;
-
-    const ptr_reg = try self.copyToTmpRegister(ptr_ty, ptr_mcv);
-    const ptr_lock = self.register_manager.lockRegAssumeUnused(ptr_reg);
-    defer self.register_manager.unlockReg(ptr_lock);
-
-    const mat_src_mcv: MCValue = mat_src_mcv: switch (src_mcv) {
-        .register => if (src_bit_size > 64) {
-            const frame_index = try self.allocFrameIndex(.initSpill(src_ty, self.pt.zcu));
-            try self.genSetMem(.{ .frame = frame_index }, 0, src_ty, src_mcv, .{});
-            break :mat_src_mcv .{ .load_frame = .{ .index = frame_index } };
-        } else src_mcv,
-        else => src_mcv,
-    };
-
-    var limb_i: u16 = 0;
-    while (limb_i * limb_abi_bits < src_bit_off + src_bit_size) : (limb_i += 1) {
-        const part_bit_off = if (limb_i == 0) src_bit_off else 0;
-        const part_bit_size =
-            @min(src_bit_off + src_bit_size - limb_i * limb_abi_bits, limb_abi_bits) - part_bit_off;
-        const limb_mem: Memory = .{
-            .base = .{ .reg = ptr_reg },
-            .mod = .{ .rm = .{
-                .size = .fromSize(limb_abi_size),
-                .disp = src_byte_off + limb_i * limb_abi_size,
-            } },
-        };
-
-        const part_mask = (@as(u64, std.math.maxInt(u64)) >> @intCast(64 - part_bit_size)) <<
-            @intCast(part_bit_off);
-        const part_mask_not = part_mask ^ (@as(u64, std.math.maxInt(u64)) >> @intCast(64 - limb_abi_bits));
-        if (limb_abi_size <= 4) {
-            try self.asmMemoryImmediate(.{ ._, .@"and" }, limb_mem, .u(part_mask_not));
-        } else if (std.math.cast(i32, @as(i64, @bitCast(part_mask_not)))) |small| {
-            try self.asmMemoryImmediate(.{ ._, .@"and" }, limb_mem, .s(small));
-        } else {
-            const part_mask_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
-            try self.asmRegisterImmediate(.{ ._, .mov }, part_mask_reg, .u(part_mask_not));
-            try self.asmMemoryRegister(.{ ._, .@"and" }, limb_mem, part_mask_reg);
-        }
-
-        if (src_bit_size <= 64) {
-            const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
-            const tmp_mcv = MCValue{ .register = tmp_reg };
-            const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
-            defer self.register_manager.unlockReg(tmp_lock);
-
-            try self.genSetReg(tmp_reg, limb_ty, mat_src_mcv, .{});
-            switch (limb_i) {
-                0 => try self.genShiftBinOpMir(
-                    .{ ._l, .sh },
-                    limb_ty,
-                    tmp_mcv,
-                    .u8,
-                    .{ .immediate = src_bit_off },
-                ),
-                1 => try self.genShiftBinOpMir(
-                    .{ ._r, .sh },
-                    limb_ty,
-                    tmp_mcv,
-                    .u8,
-                    .{ .immediate = limb_abi_bits - src_bit_off },
-                ),
-                else => unreachable,
-            }
-            try self.genBinOpMir(.{ ._, .@"and" }, limb_ty, tmp_mcv, .{ .immediate = part_mask });
-            try self.asmMemoryRegister(
-                .{ ._, .@"or" },
-                limb_mem,
-                registerAlias(tmp_reg, limb_abi_size),
-            );
-        } else if (src_bit_size <= 128 and src_bit_off == 0) {
-            const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
-            const tmp_mcv = MCValue{ .register = tmp_reg };
-            const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
-            defer self.register_manager.unlockReg(tmp_lock);
-
-            try self.genSetReg(tmp_reg, limb_ty, switch (limb_i) {
-                0 => mat_src_mcv,
-                else => mat_src_mcv.address().offset(limb_i * limb_abi_size).deref(),
-            }, .{});
-            try self.genBinOpMir(.{ ._, .@"and" }, limb_ty, tmp_mcv, .{ .immediate = part_mask });
-            try self.asmMemoryRegister(
-                .{ ._, .@"or" },
-                limb_mem,
-                registerAlias(tmp_reg, limb_abi_size),
-            );
-        } else return self.fail("TODO: implement packed store of {f}", .{src_ty.fmt(pt)});
-    }
-}
-
 fn store(
     self: *CodeGen,
     ptr_ty: Type,
@@ -174857,35 +174842,6 @@ fn store(
     }
 }
 
-fn airStore(self: *CodeGen, inst: Air.Inst.Index, safety: bool) !void {
-    const pt = self.pt;
-    const zcu = pt.zcu;
-    const bin_op = self.air.instructions.items(.data)[@intFromEnum(inst)].bin_op;
-
-    result: {
-        if (!safety and (try self.resolveInst(bin_op.rhs)) == .undef) break :result;
-
-        try self.spillRegisters(&.{ .rdi, .rsi, .rcx });
-        const reg_locks = self.register_manager.lockRegsAssumeUnused(3, .{ .rdi, .rsi, .rcx });
-        defer for (reg_locks) |lock| self.register_manager.unlockReg(lock);
-
-        const ptr_ty = self.typeOf(bin_op.lhs);
-        const ptr_info = ptr_ty.ptrInfo(zcu);
-        const is_packed = ptr_info.flags.vector_index != .none or ptr_info.packed_offset.host_size > 0;
-        if (is_packed) try self.spillEflagsIfOccupied();
-
-        const src_mcv = try self.resolveInst(bin_op.rhs);
-        const ptr_mcv = try self.resolveInst(bin_op.lhs);
-
-        if (is_packed) {
-            try self.packedStore(ptr_ty, ptr_mcv, src_mcv);
-        } else {
-            try self.store(ptr_ty, ptr_mcv, src_mcv, .{ .safety = safety });
-        }
-    }
-    return self.finishAir(inst, .none, .{ bin_op.lhs, bin_op.rhs, .none });
-}
-
 fn genUnOp(self: *CodeGen, maybe_inst: ?Air.Inst.Index, tag: Air.Inst.Tag, src_air: Air.Inst.Ref) !MCValue {
     const pt = self.pt;
     const zcu = pt.zcu;
@@ -192171,6 +192127,8 @@ const Select = struct {
         exact_bool_vec: u16,
         ptr_any_bool_vec,
         ptr_bool_vec: Memory.Size,
+        ptr_any_bool_vec_elem,
+        ptr_bool_vec_elem: Memory.Size,
         remainder_bool_vec: OfIsSizes,
         exact_remainder_bool_vec: struct { of: Memory.Size, is: u16 },
         signed_int_vec: Memory.Size,
@@ -192273,6 +192231,22 @@ const Select = struct {
                     .vector_type => |vector_type| vector_type.child == .bool_type and size.bitSize(cg.target) >= vector_type.len,
                     else => false,
                 },
+                .ptr_any_bool_vec_elem => {
+                    const ptr_info = ty.ptrInfo(zcu);
+                    return switch (ptr_info.flags.vector_index) {
+                        .none => false,
+                        .runtime => unreachable,
+                        else => ptr_info.child == .bool_type,
+                    };
+                },
+                .ptr_bool_vec_elem => |size| {
+                    const ptr_info = ty.ptrInfo(zcu);
+                    return switch (ptr_info.flags.vector_index) {
+                        .none => false,
+                        .runtime => unreachable,
+                        else => ptr_info.child == .bool_type and size.bitSize(cg.target) >= ptr_info.packed_offset.host_size,
+                    };
+                },
                 .remainder_bool_vec => |of_is| ty.isVector(zcu) and ty.scalarType(zcu).toIntern() == .bool_type and
                     of_is.is.bitSize(cg.target) >= (ty.vectorLen(zcu) - 1) % of_is.of.bitSize(cg.target) + 1,
                 .exact_remainder_bool_vec => |of_is| ty.isVector(zcu) and ty.scalarType(zcu).toIntern() == .bool_type and
@@ -193266,7 +193240,7 @@ const Select = struct {
                 ref: Ref,
                 scale: Memory.Scale = .@"1",
             } = .{ .ref = .none },
-            unused: u3 = 0,
+            unused: u2 = 0,
         },
         imm: i32 = 0,
 
@@ -193279,9 +193253,9 @@ const Select = struct {
             lea,
             mem,
         };
-        const Adjust = packed struct(u10) {
+        const Adjust = packed struct(u11) {
             sign: enum(u1) { neg, pos },
-            lhs: enum(u5) {
+            lhs: enum(u6) {
                 none,
                 ptr_size,
                 ptr_bit_size,
@@ -193303,6 +193277,7 @@ const Select = struct {
                 src0_elem_size,
                 dst0_elem_size,
                 src0_elem_size_mul_src1,
+                vector_index,
                 src1,
                 src1_sub_bit_size,
                 log2_src0_elem_size,
@@ -193373,9 +193348,13 @@ const Select = struct {
             const sub_src0_elem_size: Adjust = .{ .sign = .neg, .lhs = .src0_elem_size, .op = .mul, .rhs = .@"1" };
             const add_src0_elem_size_mul_src1: Adjust = .{ .sign = .pos, .lhs = .src0_elem_size_mul_src1, .op = .mul, .rhs = .@"1" };
             const sub_src0_elem_size_mul_src1: Adjust = .{ .sign = .neg, .lhs = .src0_elem_size_mul_src1, .op = .mul, .rhs = .@"1" };
+            const add_vector_index: Adjust = .{ .sign = .pos, .lhs = .vector_index, .op = .mul, .rhs = .@"1" };
+            const add_vector_index_rem_32: Adjust = .{ .sign = .pos, .lhs = .vector_index, .op = .rem_8_mul, .rhs = .@"4" };
+            const add_vector_index_div_8_down_4: Adjust = .{ .sign = .pos, .lhs = .vector_index, .op = .div_8_down, .rhs = .@"4" };
             const add_dst0_elem_size: Adjust = .{ .sign = .pos, .lhs = .dst0_elem_size, .op = .mul, .rhs = .@"1" };
             const sub_dst0_elem_size: Adjust = .{ .sign = .neg, .lhs = .dst0_elem_size, .op = .mul, .rhs = .@"1" };
             const add_src1_div_8_down_4: Adjust = .{ .sign = .pos, .lhs = .src1, .op = .div_8_down, .rhs = .@"4" };
+            const add_src1: Adjust = .{ .sign = .pos, .lhs = .src1, .op = .mul, .rhs = .@"1" };
             const add_src1_rem_32: Adjust = .{ .sign = .pos, .lhs = .src1, .op = .rem_8_mul, .rhs = .@"4" };
             const add_src1_rem_64: Adjust = .{ .sign = .pos, .lhs = .src1, .op = .rem_8_mul, .rhs = .@"8" };
             const add_src1_sub_bit_size: Adjust = .{ .sign = .pos, .lhs = .src1_sub_bit_size, .op = .mul, .rhs = .@"1" };
@@ -194258,6 +194237,10 @@ const Select = struct {
                 .dst0_elem_size => @intCast(Select.Operand.Ref.dst0.typeOf(s).elemType2(s.cg.pt.zcu).abiSize(s.cg.pt.zcu)),
                 .src0_elem_size_mul_src1 => @intCast(Select.Operand.Ref.src0.typeOf(s).elemType2(s.cg.pt.zcu).abiSize(s.cg.pt.zcu) *
                     Select.Operand.Ref.src1.valueOf(s).immediate),
+                .vector_index => switch (op.flags.base.ref.typeOf(s).ptrInfo(s.cg.pt.zcu).flags.vector_index) {
+                    .none, .runtime => unreachable,
+                    else => |vector_index| @intFromEnum(vector_index),
+                },
                 .src1 => @intCast(Select.Operand.Ref.src1.valueOf(s).immediate),
                 .src1_sub_bit_size => @as(SignedImm, @intCast(Select.Operand.Ref.src1.valueOf(s).immediate)) -
                     @as(SignedImm, @intCast(s.cg.nonBoolScalarBitSize(op.flags.base.ref.typeOf(s)))),