Commit c1e9ef9eaa

Jacob Young <jacobly0@users.noreply.github.com>
2025-05-29 12:11:28
Legalize: implement scalarization of unary operations
1 parent c907866
Changed files (8)
src/Air/Legalize.zig
@@ -1,21 +1,48 @@
-zcu: *const Zcu,
-air: Air,
-features: std.enums.EnumSet(Feature),
+pt: Zcu.PerThread,
+air_instructions: std.MultiArrayList(Air.Inst),
+air_extra: std.ArrayListUnmanaged(u32),
+features: *const Features,
 
 pub const Feature = enum {
+    scalarize_not,
+    scalarize_clz,
+    scalarize_ctz,
+    scalarize_popcount,
+    scalarize_byte_swap,
+    scalarize_bit_reverse,
+    scalarize_sqrt,
+    scalarize_sin,
+    scalarize_cos,
+    scalarize_tan,
+    scalarize_exp,
+    scalarize_exp2,
+    scalarize_log,
+    scalarize_log2,
+    scalarize_log10,
+    scalarize_abs,
+    scalarize_floor,
+    scalarize_ceil,
+    scalarize_round,
+    scalarize_trunc_float,
+    scalarize_neg,
+    scalarize_neg_optimized,
+
     /// Legalize (shift lhs, (splat rhs)) -> (shift lhs, rhs)
     remove_shift_vector_rhs_splat,
     /// Legalize reduce of a one element vector to a bitcast
     reduce_one_elem_to_bitcast,
 };
 
-pub const Features = std.enums.EnumFieldStruct(Feature, bool, false);
+pub const Features = std.enums.EnumSet(Feature);
+
+pub const Error = std.mem.Allocator.Error;
 
-pub fn legalize(air: *Air, backend: std.builtin.CompilerBackend, zcu: *const Zcu) std.mem.Allocator.Error!void {
+pub fn legalize(air: *Air, backend: std.builtin.CompilerBackend, pt: Zcu.PerThread) Error!void {
     var l: Legalize = .{
-        .zcu = zcu,
-        .air = air.*,
-        .features = features: switch (backend) {
+        .pt = pt,
+        .air_instructions = air.instructions.toMultiArrayList(),
+        .air_extra = air.extra,
+        .features = &features: switch (backend) {
             .other, .stage1 => unreachable,
             inline .stage2_llvm,
             .stage2_c,
@@ -30,118 +57,365 @@ pub fn legalize(air: *Air, backend: std.builtin.CompilerBackend, zcu: *const Zcu
             .stage2_powerpc,
             => |ct_backend| {
                 const Backend = codegen.importBackend(ct_backend) orelse break :features .initEmpty();
-                break :features if (@hasDecl(Backend, "legalize_features"))
-                    .init(Backend.legalize_features)
-                else
-                    .initEmpty();
+                break :features if (@hasDecl(Backend, "legalize_features")) Backend.legalize_features else .initEmpty();
             },
             _ => unreachable,
         },
     };
-    defer air.* = l.air;
-    if (!l.features.bits.eql(.initEmpty())) try l.legalizeBody(l.air.getMainBody());
+    if (l.features.bits.eql(.initEmpty())) return;
+    defer air.* = l.getTmpAir();
+    const main_extra = l.extraData(Air.Block, l.air_extra.items[@intFromEnum(Air.ExtraIndex.main_block)]);
+    try l.legalizeBody(main_extra.end, main_extra.data.body_len);
+}
+
+fn getTmpAir(l: *const Legalize) Air {
+    return .{
+        .instructions = l.air_instructions.slice(),
+        .extra = l.air_extra,
+    };
+}
+
+fn typeOf(l: *const Legalize, ref: Air.Inst.Ref) Type {
+    return l.getTmpAir().typeOf(ref, &l.pt.zcu.intern_pool);
+}
+
+fn typeOfIndex(l: *const Legalize, inst: Air.Inst.Index) Type {
+    return l.getTmpAir().typeOfIndex(inst, &l.pt.zcu.intern_pool);
 }
 
-fn legalizeBody(l: *Legalize, body: []const Air.Inst.Index) std.mem.Allocator.Error!void {
-    const zcu = l.zcu;
+fn extraData(l: *const Legalize, comptime T: type, index: usize) @TypeOf(Air.extraData(undefined, T, undefined)) {
+    return l.getTmpAir().extraData(T, index);
+}
+
+fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void {
+    const zcu = l.pt.zcu;
     const ip = &zcu.intern_pool;
-    const tags = l.air.instructions.items(.tag);
-    const data = l.air.instructions.items(.data);
-    for (body) |inst| inst: switch (tags[@intFromEnum(inst)]) {
-        else => {},
-
-        .shl,
-        .shl_exact,
-        .shl_sat,
-        .shr,
-        .shr_exact,
-        => |air_tag| if (l.features.contains(.remove_shift_vector_rhs_splat)) done: {
-            const bin_op = data[@intFromEnum(inst)].bin_op;
-            const ty = l.air.typeOf(bin_op.rhs, ip);
-            if (!ty.isVector(zcu)) break :done;
-            if (bin_op.rhs.toInterned()) |rhs_ip_index| switch (ip.indexToKey(rhs_ip_index)) {
-                else => {},
-                .aggregate => |aggregate| switch (aggregate.storage) {
-                    else => {},
-                    .repeated_elem => |splat| continue :inst l.replaceInst(inst, air_tag, .{ .bin_op = .{
-                        .lhs = bin_op.lhs,
-                        .rhs = Air.internedToRef(splat),
-                    } }),
-                },
-            } else {
-                const rhs_inst = bin_op.rhs.toIndex().?;
-                switch (tags[@intFromEnum(rhs_inst)]) {
+    for (body_start..body_start + body_len) |inst_extra_index| {
+        const inst: Air.Inst.Index = @enumFromInt(l.air_extra.items[inst_extra_index]);
+        inst: switch (l.air_instructions.items(.tag)[@intFromEnum(inst)]) {
+            else => {},
+
+            inline .not,
+            .clz,
+            .ctz,
+            .popcount,
+            .byte_swap,
+            .bit_reverse,
+            .abs,
+            => |air_tag| if (l.features.contains(@field(Feature, "scalarize_" ++ @tagName(air_tag)))) done: {
+                const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op;
+                if (!ty_op.ty.toType().isVector(zcu)) break :done;
+                continue :inst try l.scalarizeUnary(inst, .ty_op, ty_op.operand);
+            },
+            inline .sqrt,
+            .sin,
+            .cos,
+            .tan,
+            .exp,
+            .exp2,
+            .log,
+            .log2,
+            .log10,
+            .floor,
+            .ceil,
+            .round,
+            .trunc_float,
+            .neg,
+            .neg_optimized,
+            => |air_tag| if (l.features.contains(@field(Feature, "scalarize_" ++ @tagName(air_tag)))) done: {
+                const un_op = l.air_instructions.items(.data)[@intFromEnum(inst)].un_op;
+                if (!l.typeOf(un_op).isVector(zcu)) break :done;
+                continue :inst try l.scalarizeUnary(inst, .un_op, un_op);
+            },
+
+            .shl,
+            .shl_exact,
+            .shl_sat,
+            .shr,
+            .shr_exact,
+            => |air_tag| if (l.features.contains(.remove_shift_vector_rhs_splat)) done: {
+                const bin_op = l.air_instructions.items(.data)[@intFromEnum(inst)].bin_op;
+                const ty = l.typeOf(bin_op.rhs);
+                if (!ty.isVector(zcu)) break :done;
+                if (bin_op.rhs.toInterned()) |rhs_ip_index| switch (ip.indexToKey(rhs_ip_index)) {
                     else => {},
-                    .splat => continue :inst l.replaceInst(inst, air_tag, .{ .bin_op = .{
-                        .lhs = bin_op.lhs,
-                        .rhs = data[@intFromEnum(rhs_inst)].ty_op.operand,
+                    .aggregate => |aggregate| switch (aggregate.storage) {
+                        else => {},
+                        .repeated_elem => |splat| continue :inst l.replaceInst(inst, air_tag, .{ .bin_op = .{
+                            .lhs = bin_op.lhs,
+                            .rhs = Air.internedToRef(splat),
+                        } }),
+                    },
+                } else {
+                    const rhs_inst = bin_op.rhs.toIndex().?;
+                    switch (l.air_instructions.items(.tag)[@intFromEnum(rhs_inst)]) {
+                        else => {},
+                        .splat => continue :inst l.replaceInst(inst, air_tag, .{ .bin_op = .{
+                            .lhs = bin_op.lhs,
+                            .rhs = l.air_instructions.items(.data)[@intFromEnum(rhs_inst)].ty_op.operand,
+                        } }),
+                    }
+                }
+            },
+
+            .reduce,
+            .reduce_optimized,
+            => if (l.features.contains(.reduce_one_elem_to_bitcast)) done: {
+                const reduce = l.air_instructions.items(.data)[@intFromEnum(inst)].reduce;
+                const vector_ty = l.typeOf(reduce.operand);
+                switch (vector_ty.vectorLen(zcu)) {
+                    0 => unreachable,
+                    1 => continue :inst l.replaceInst(inst, .bitcast, .{ .ty_op = .{
+                        .ty = Air.internedToRef(vector_ty.scalarType(zcu).toIntern()),
+                        .operand = reduce.operand,
                     } }),
+                    else => break :done,
                 }
-            }
-        },
+            },
 
-        .reduce,
-        .reduce_optimized,
-        => if (l.features.contains(.reduce_one_elem_to_bitcast)) done: {
-            const reduce = data[@intFromEnum(inst)].reduce;
-            const vector_ty = l.air.typeOf(reduce.operand, ip);
-            switch (vector_ty.vectorLen(zcu)) {
-                0 => unreachable,
-                1 => continue :inst l.replaceInst(inst, .bitcast, .{ .ty_op = .{
-                    .ty = Air.internedToRef(vector_ty.scalarType(zcu).toIntern()),
-                    .operand = reduce.operand,
-                } }),
-                else => break :done,
-            }
-        },
+            .@"try", .try_cold => {
+                const pl_op = l.air_instructions.items(.data)[@intFromEnum(inst)].pl_op;
+                const extra = l.extraData(Air.Try, pl_op.payload);
+                try l.legalizeBody(extra.end, extra.data.body_len);
+            },
+            .try_ptr, .try_ptr_cold => {
+                const ty_pl = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_pl;
+                const extra = l.extraData(Air.TryPtr, ty_pl.payload);
+                try l.legalizeBody(extra.end, extra.data.body_len);
+            },
+            .block, .loop => {
+                const ty_pl = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_pl;
+                const extra = l.extraData(Air.Block, ty_pl.payload);
+                try l.legalizeBody(extra.end, extra.data.body_len);
+            },
+            .dbg_inline_block => {
+                const ty_pl = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_pl;
+                const extra = l.extraData(Air.DbgInlineBlock, ty_pl.payload);
+                try l.legalizeBody(extra.end, extra.data.body_len);
+            },
+            .cond_br => {
+                const pl_op = l.air_instructions.items(.data)[@intFromEnum(inst)].pl_op;
+                const extra = l.extraData(Air.CondBr, pl_op.payload);
+                try l.legalizeBody(extra.end, extra.data.then_body_len);
+                try l.legalizeBody(extra.end + extra.data.then_body_len, extra.data.else_body_len);
+            },
+            .switch_br, .loop_switch_br => {
+                const pl_op = l.air_instructions.items(.data)[@intFromEnum(inst)].pl_op;
+                const extra = l.extraData(Air.SwitchBr, pl_op.payload);
+                const hint_bag_count = std.math.divCeil(usize, extra.data.cases_len + 1, 10) catch unreachable;
+                var extra_index = extra.end + hint_bag_count;
+                for (0..extra.data.cases_len) |_| {
+                    const case_extra = l.extraData(Air.SwitchBr.Case, extra_index);
+                    const case_body_start = case_extra.end + case_extra.data.items_len + case_extra.data.ranges_len * 2;
+                    try l.legalizeBody(case_body_start, case_extra.data.body_len);
+                    extra_index = case_body_start + case_extra.data.body_len;
+                }
+                try l.legalizeBody(extra_index, extra.data.else_body_len);
+            },
+        }
+    }
+}
 
-        .@"try", .try_cold => {
-            const pl_op = data[@intFromEnum(inst)].pl_op;
-            const extra = l.air.extraData(Air.Try, pl_op.payload);
-            try l.legalizeBody(@ptrCast(l.air.extra.items[extra.end..][0..extra.data.body_len]));
-        },
-        .try_ptr, .try_ptr_cold => {
-            const ty_pl = data[@intFromEnum(inst)].ty_pl;
-            const extra = l.air.extraData(Air.TryPtr, ty_pl.payload);
-            try l.legalizeBody(@ptrCast(l.air.extra.items[extra.end..][0..extra.data.body_len]));
-        },
-        .block, .loop => {
-            const ty_pl = data[@intFromEnum(inst)].ty_pl;
-            const extra = l.air.extraData(Air.Block, ty_pl.payload);
-            try l.legalizeBody(@ptrCast(l.air.extra.items[extra.end..][0..extra.data.body_len]));
-        },
-        .dbg_inline_block => {
-            const ty_pl = data[@intFromEnum(inst)].ty_pl;
-            const extra = l.air.extraData(Air.DbgInlineBlock, ty_pl.payload);
-            try l.legalizeBody(@ptrCast(l.air.extra.items[extra.end..][0..extra.data.body_len]));
-        },
-        .cond_br => {
-            const pl_op = data[@intFromEnum(inst)].pl_op;
-            const extra = l.air.extraData(Air.CondBr, pl_op.payload);
-            try l.legalizeBody(@ptrCast(l.air.extra.items[extra.end..][0..extra.data.then_body_len]));
-            try l.legalizeBody(@ptrCast(l.air.extra.items[extra.end + extra.data.then_body_len ..][0..extra.data.else_body_len]));
-        },
-        .switch_br, .loop_switch_br => {
-            const switch_br = l.air.unwrapSwitch(inst);
-            var it = switch_br.iterateCases();
-            while (it.next()) |case| try l.legalizeBody(case.body);
-            try l.legalizeBody(it.elseBody());
+const UnaryDataTag = enum { un_op, ty_op };
+inline fn scalarizeUnary(l: *Legalize, inst: Air.Inst.Index, data_tag: UnaryDataTag, un_op: Air.Inst.Ref) Error!Air.Inst.Tag {
+    return l.replaceInst(inst, .block, try l.scalarizeUnaryBlockPayload(inst, data_tag, un_op));
+}
+fn scalarizeUnaryBlockPayload(
+    l: *Legalize,
+    inst: Air.Inst.Index,
+    data_tag: UnaryDataTag,
+    un_op: Air.Inst.Ref,
+) Error!Air.Inst.Data {
+    const pt = l.pt;
+    const zcu = pt.zcu;
+    const gpa = zcu.gpa;
+
+    const res_ty = l.typeOfIndex(inst);
+    try l.air_instructions.ensureUnusedCapacity(gpa, 15);
+    const res_alloc_inst = l.addInstAssumeCapacity(.{
+        .tag = .alloc,
+        .data = .{ .ty = try pt.singleMutPtrType(res_ty) },
+    });
+    const index_alloc_inst = l.addInstAssumeCapacity(.{
+        .tag = .alloc,
+        .data = .{ .ty = try pt.singleMutPtrType(.usize) },
+    });
+    const index_init_inst = l.addInstAssumeCapacity(.{
+        .tag = .store,
+        .data = .{ .bin_op = .{
+            .lhs = index_alloc_inst.toRef(),
+            .rhs = try pt.intRef(.usize, 0),
+        } },
+    });
+    const cur_index_inst = l.addInstAssumeCapacity(.{
+        .tag = .load,
+        .data = .{ .ty_op = .{
+            .ty = .usize_type,
+            .operand = index_alloc_inst.toRef(),
+        } },
+    });
+    const get_elem_inst = l.addInstAssumeCapacity(.{
+        .tag = .array_elem_val,
+        .data = .{ .bin_op = .{
+            .lhs = un_op,
+            .rhs = cur_index_inst.toRef(),
+        } },
+    });
+    const op_elem_inst = l.addInstAssumeCapacity(.{
+        .tag = l.air_instructions.items(.tag)[@intFromEnum(inst)],
+        .data = switch (data_tag) {
+            .un_op => .{ .un_op = get_elem_inst.toRef() },
+            .ty_op => .{ .ty_op = .{
+                .ty = Air.internedToRef(res_ty.scalarType(zcu).toIntern()),
+                .operand = get_elem_inst.toRef(),
+            } },
         },
-    };
+    });
+    const set_elem_inst = l.addInstAssumeCapacity(.{
+        .tag = .vector_store_elem,
+        .data = .{ .vector_store_elem = .{
+            .vector_ptr = res_alloc_inst.toRef(),
+            .payload = try l.addExtra(Air.Bin, .{
+                .lhs = cur_index_inst.toRef(),
+                .rhs = op_elem_inst.toRef(),
+            }),
+        } },
+    });
+    const not_done_inst = l.addInstAssumeCapacity(.{
+        .tag = .cmp_lt,
+        .data = .{ .bin_op = .{
+            .lhs = cur_index_inst.toRef(),
+            .rhs = try pt.intRef(.usize, res_ty.vectorLen(zcu)),
+        } },
+    });
+    const next_index_inst = l.addInstAssumeCapacity(.{
+        .tag = .add,
+        .data = .{ .bin_op = .{
+            .lhs = cur_index_inst.toRef(),
+            .rhs = try pt.intRef(.usize, 1),
+        } },
+    });
+    const set_index_inst = l.addInstAssumeCapacity(.{
+        .tag = .store,
+        .data = .{ .bin_op = .{
+            .lhs = index_alloc_inst.toRef(),
+            .rhs = next_index_inst.toRef(),
+        } },
+    });
+    const loop_inst: Air.Inst.Index = @enumFromInt(l.air_instructions.len + 4);
+    const repeat_inst = l.addInstAssumeCapacity(.{
+        .tag = .repeat,
+        .data = .{ .repeat = .{ .loop_inst = loop_inst } },
+    });
+    const final_res_inst = l.addInstAssumeCapacity(.{
+        .tag = .load,
+        .data = .{ .ty_op = .{
+            .ty = Air.internedToRef(res_ty.toIntern()),
+            .operand = res_alloc_inst.toRef(),
+        } },
+    });
+    const br_res_inst = l.addInstAssumeCapacity(.{
+        .tag = .br,
+        .data = .{ .br = .{
+            .block_inst = inst,
+            .operand = final_res_inst.toRef(),
+        } },
+    });
+    const done_br_inst = l.addInstAssumeCapacity(.{
+        .tag = .cond_br,
+        .data = .{ .pl_op = .{
+            .operand = not_done_inst.toRef(),
+            .payload = try l.addCondBrBodies(&.{
+                next_index_inst,
+                set_index_inst,
+                repeat_inst,
+            }, &.{
+                final_res_inst,
+                br_res_inst,
+            }),
+        } },
+    });
+    assert(loop_inst == l.addInstAssumeCapacity(.{
+        .tag = .loop,
+        .data = .{ .ty_pl = .{
+            .ty = .noreturn_type,
+            .payload = try l.addBlockBody(&.{
+                cur_index_inst,
+                get_elem_inst,
+                op_elem_inst,
+                set_elem_inst,
+                not_done_inst,
+                done_br_inst,
+            }),
+        } },
+    }));
+    return .{ .ty_pl = .{
+        .ty = Air.internedToRef(res_ty.toIntern()),
+        .payload = try l.addBlockBody(&.{
+            res_alloc_inst,
+            index_alloc_inst,
+            index_init_inst,
+            loop_inst,
+        }),
+    } };
+}
+
+fn addInstAssumeCapacity(l: *Legalize, inst: Air.Inst) Air.Inst.Index {
+    defer l.air_instructions.appendAssumeCapacity(inst);
+    return @enumFromInt(l.air_instructions.len);
+}
+
+fn addExtra(l: *Legalize, comptime Extra: type, extra: Extra) Error!u32 {
+    const extra_fields = @typeInfo(Extra).@"struct".fields;
+    try l.air_extra.ensureUnusedCapacity(l.pt.zcu.gpa, extra_fields.len);
+    defer inline for (extra_fields) |field| l.air_extra.appendAssumeCapacity(switch (field.type) {
+        u32 => @field(extra, field.name),
+        Air.Inst.Ref => @intFromEnum(@field(extra, field.name)),
+        else => @compileError(@typeName(field.type)),
+    });
+    return @intCast(l.air_extra.items.len);
+}
+
+fn addBlockBody(l: *Legalize, body: []const Air.Inst.Index) Error!u32 {
+    try l.air_extra.ensureUnusedCapacity(l.pt.zcu.gpa, 1 + body.len);
+    defer {
+        l.air_extra.appendAssumeCapacity(@intCast(body.len));
+        l.air_extra.appendSliceAssumeCapacity(@ptrCast(body));
+    }
+    return @intCast(l.air_extra.items.len);
+}
+
+fn addCondBrBodies(l: *Legalize, then_body: []const Air.Inst.Index, else_body: []const Air.Inst.Index) Error!u32 {
+    try l.air_extra.ensureUnusedCapacity(l.pt.zcu.gpa, 3 + then_body.len + else_body.len);
+    defer {
+        l.air_extra.appendSliceAssumeCapacity(&.{
+            @intCast(then_body.len),
+            @intCast(else_body.len),
+            @bitCast(Air.CondBr.BranchHints{
+                .true = .none,
+                .false = .none,
+                .then_cov = .none,
+                .else_cov = .none,
+            }),
+        });
+        l.air_extra.appendSliceAssumeCapacity(@ptrCast(then_body));
+        l.air_extra.appendSliceAssumeCapacity(@ptrCast(else_body));
+    }
+    return @intCast(l.air_extra.items.len);
 }
 
 // inline to propagate comptime `tag`s
 inline fn replaceInst(l: *Legalize, inst: Air.Inst.Index, tag: Air.Inst.Tag, data: Air.Inst.Data) Air.Inst.Tag {
-    const ip = &l.zcu.intern_pool;
-    const orig_ty = if (std.debug.runtime_safety) l.air.typeOfIndex(inst, ip) else {};
-    l.air.instructions.items(.tag)[@intFromEnum(inst)] = tag;
-    l.air.instructions.items(.data)[@intFromEnum(inst)] = data;
-    if (std.debug.runtime_safety) std.debug.assert(l.air.typeOfIndex(inst, ip).toIntern() == orig_ty.toIntern());
+    const orig_ty = if (std.debug.runtime_safety) l.typeOfIndex(inst) else {};
+    l.air_instructions.set(@intFromEnum(inst), .{ .tag = tag, .data = data });
+    if (std.debug.runtime_safety) assert(l.typeOfIndex(inst).toIntern() == orig_ty.toIntern());
     return tag;
 }
 
 const Air = @import("../Air.zig");
+const assert = std.debug.assert;
 const codegen = @import("../codegen.zig");
 const Legalize = @This();
 const std = @import("std");
+const Type = @import("../Type.zig");
 const Zcu = @import("../Zcu.zig");
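
Note (editor's illustration, not part of the commit): the new scalarizeUnaryBlockPayload helper above rewrites a vector unary instruction into a block containing an element-wise loop. It allocates a result vector and a loop index, then repeatedly loads the index, extracts one lane with array_elem_val, applies the scalar form of the operation, writes the lane back with vector_store_elem, and either repeats or breaks out of the block with the loaded result. The emitted AIR behaves roughly like the following Zig sketch for @byteSwap over a 4-lane vector (the function name is hypothetical and only illustrates the shape of the generated loop):

fn byteSwapScalarized(vec: @Vector(4, u16)) @Vector(4, u16) {
    var result: @Vector(4, u16) = undefined; // res_alloc_inst
    var i: usize = 0; // index_alloc_inst + index_init_inst
    while (i < 4) : (i += 1) { // cmp_lt / add / store / repeat in the AIR loop
        result[i] = @byteSwap(vec[i]); // array_elem_val + scalar op + vector_store_elem
    }
    return result; // final load of the result alloc + br out of the block
}
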
src/arch/x86_64/CodeGen.zig
@@ -32,10 +32,15 @@ const FrameIndex = bits.FrameIndex;
 
 const InnerError = codegen.CodeGenError || error{OutOfRegisters};
 
-pub const legalize_features: Air.Legalize.Features = .{
+pub const legalize_features: Air.Legalize.Features = .init(.{
+    .scalarize_ctz = true,
+    .scalarize_popcount = true,
+    .scalarize_byte_swap = true,
+    .scalarize_bit_reverse = true,
+
     .remove_shift_vector_rhs_splat = false,
     .reduce_one_elem_to_bitcast = true,
-};
+});
 
 /// Set this to `false` to uncover Sema OPV bugs.
 /// https://github.com/ziglang/zig/issues/22419
@@ -63352,14 +63357,14 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                 defer assert(cg.loops.remove(inst));
                 try cg.genBodyBlock(@ptrCast(cg.air.extra.items[block.end..][0..block.data.body_len]));
             },
-            .repeat => if (use_old) try cg.airRepeat(inst) else {
+            .repeat => {
                 const repeat = air_datas[@intFromEnum(inst)].repeat;
                 const loop = cg.loops.get(repeat.loop_inst).?;
                 try cg.restoreState(loop.state, &.{}, .{
                     .emit_instructions = true,
                     .update_tracking = false,
                     .resurrect = false,
-                    .close_scope = true,
+                    .close_scope = false,
                 });
                 _ = try cg.asmJmpReloc(loop.target);
             },
@@ -162356,6 +162361,136 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .each = .{ .once = &.{
                         .{ ._, ._, .mov, .leasi(.src0w, .@"2", .src1), .src2w, ._, ._ },
                     } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .any, .any, .{ .float = .word } },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+                    },
+                    .each = .{ .once = &.{
+                        .{ ._, .vp_w, .extr, .leaa(.src0w, .add_src0_elem_size_mul_src1), .src2x, .ui(0), ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse4_1, null, null, null },
+                    .src_constraints = .{ .any, .any, .{ .float = .word } },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+                    },
+                    .each = .{ .once = &.{
+                        .{ ._, .p_w, .extr, .leaa(.src0w, .add_src0_elem_size_mul_src1), .src2x, .ui(0), ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse2, null, null, null },
+                    .src_constraints = .{ .any, .any, .{ .float = .word } },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .f16, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .each = .{ .once = &.{
+                        .{ ._, .p_w, .extr, .tmp0d, .src2x, .ui(0), ._ },
+                        .{ ._, ._, .mov, .leaa(.src0w, .add_src0_elem_size_mul_src1), .tmp0w, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{ .any, .any, .{ .float = .word } },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .f32, .kind = .mem },
+                        .{ .type = .f16, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .each = .{ .once = &.{
+                        .{ ._, ._ss, .mov, .mem(.tmp1d), .src2x, ._, ._ },
+                        .{ ._, ._, .mov, .tmp1d, .mem(.tmp1d), ._, ._ },
+                        .{ ._, ._, .mov, .leaa(.src0w, .add_src0_elem_size_mul_src1), .tmp1w, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .any, .any, .{ .float = .word } },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
+                    },
+                    .each = .{ .once = &.{
+                        .{ ._, .vp_w, .extr, .leasi(.src0w, .@"2", .src1), .src2x, .ui(0), ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse4_1, null, null, null },
+                    .src_constraints = .{ .any, .any, .{ .float = .word } },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
+                    },
+                    .each = .{ .once = &.{
+                        .{ ._, .p_w, .extr, .leasi(.src0w, .@"2", .src1), .src2x, .ui(0), ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse2, null, null, null },
+                    .src_constraints = .{ .any, .any, .{ .float = .word } },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .f16, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .each = .{ .once = &.{
+                        .{ ._, .p_w, .extr, .tmp0d, .src2x, .ui(0), ._ },
+                        .{ ._, ._, .mov, .leasi(.src0w, .@"2", .src1), .tmp0w, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{ .any, .any, .{ .float = .word } },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+                    },
+                    .extra_temps = .{
+                        .{ .type = .f32, .kind = .mem },
+                        .{ .type = .f16, .kind = .{ .rc = .general_purpose } },
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                        .unused,
+                    },
+                    .each = .{ .once = &.{
+                        .{ ._, ._ss, .mov, .mem(.tmp1d), .src2x, ._, ._ },
+                        .{ ._, ._, .mov, .tmp1d, .mem(.tmp1d), ._, ._ },
+                        .{ ._, ._, .mov, .leasi(.src0w, .@"2", .src1), .tmp1w, ._, ._ },
+                    } },
                 }, .{
                     .src_constraints = .{ .any, .any, .{ .int = .dword } },
                     .patterns = &.{
@@ -162374,30 +162509,120 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                     .each = .{ .once = &.{
                         .{ ._, ._, .mov, .leasi(.src0d, .@"4", .src1), .src2d, ._, ._ },
                     } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .any, .any, .{ .float = .dword } },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+                    },
+                    .each = .{ .once = &.{
+                        .{ ._, .v_ss, .mov, .leaa(.src0d, .add_src0_elem_size_mul_src1), .src2x, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{ .any, .any, .{ .float = .dword } },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+                    },
+                    .each = .{ .once = &.{
+                        .{ ._, ._ss, .mov, .leaa(.src0d, .add_src0_elem_size_mul_src1), .src2x, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .any, .any, .{ .float = .dword } },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
+                    },
+                    .each = .{ .once = &.{
+                        .{ ._, .v_ss, .mov, .leasi(.src0d, .@"4", .src1), .src2x, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{ .any, .any, .{ .float = .dword } },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
+                    },
+                    .each = .{ .once = &.{
+                        .{ ._, ._ss, .mov, .leasi(.src0d, .@"4", .src1), .src2x, ._, ._ },
+                    } },
                 }, .{
                     .required_features = .{ .@"64bit", null, null, null },
-                    .dst_constraints = .{ .{ .int = .qword }, .any },
+                    .src_constraints = .{ .any, .any, .{ .int = .qword } },
                     .patterns = &.{
-                        .{ .src = .{ .to_mem, .simm32, .simm32 } },
-                        .{ .src = .{ .to_mem, .simm32, .to_gpr } },
+                        .{ .src = .{ .to_gpr, .simm32, .simm32 } },
+                        .{ .src = .{ .to_gpr, .simm32, .to_gpr } },
                     },
                     .each = .{ .once = &.{
                         .{ ._, ._, .mov, .leaa(.src0q, .add_src0_elem_size_mul_src1), .src2q, ._, ._ },
                     } },
                 }, .{
                     .required_features = .{ .@"64bit", null, null, null },
-                    .dst_constraints = .{ .{ .int = .qword }, .any },
+                    .src_constraints = .{ .any, .any, .{ .int = .qword } },
                     .patterns = &.{
-                        .{ .src = .{ .to_mem, .to_gpr, .simm32 } },
-                        .{ .src = .{ .to_mem, .to_gpr, .to_gpr } },
+                        .{ .src = .{ .to_gpr, .to_gpr, .simm32 } },
+                        .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
                     },
                     .each = .{ .once = &.{
                         .{ ._, ._, .mov, .leasi(.src0q, .@"8", .src1), .src2q, ._, ._ },
                     } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .any, .any, .{ .float = .qword } },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+                    },
+                    .each = .{ .once = &.{
+                        .{ ._, .v_sd, .mov, .leaa(.src0q, .add_src0_elem_size_mul_src1), .src2x, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse2, null, null, null },
+                    .src_constraints = .{ .any, .any, .{ .float = .qword } },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+                    },
+                    .each = .{ .once = &.{
+                        .{ ._, ._sd, .mov, .leaa(.src0q, .add_src0_elem_size_mul_src1), .src2x, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{ .any, .any, .{ .float = .qword } },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .simm32, .to_sse } },
+                    },
+                    .each = .{ .once = &.{
+                        .{ ._, ._ps, .movl, .leaa(.src0q, .add_src0_elem_size_mul_src1), .src2x, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .avx, null, null, null },
+                    .src_constraints = .{ .any, .any, .{ .float = .qword } },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
+                    },
+                    .each = .{ .once = &.{
+                        .{ ._, .v_sd, .mov, .leasi(.src0q, .@"8", .src1), .src2x, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse2, null, null, null },
+                    .src_constraints = .{ .any, .any, .{ .float = .qword } },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
+                    },
+                    .each = .{ .once = &.{
+                        .{ ._, ._sd, .mov, .leasi(.src0q, .@"8", .src1), .src2x, ._, ._ },
+                    } },
+                }, .{
+                    .required_features = .{ .sse, null, null, null },
+                    .src_constraints = .{ .any, .any, .{ .float = .qword } },
+                    .patterns = &.{
+                        .{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
+                    },
+                    .each = .{ .once = &.{
+                        .{ ._, ._ps, .movl, .leasi(.src0q, .@"8", .src1), .src2x, ._, ._ },
+                    } },
                 } }) catch |err| switch (err) {
                     error.SelectFailed => {
                         const elem_size = cg.typeOf(bin_op.rhs).abiSize(zcu);
-                        while (try ops[0].toBase(false, cg) or
+                        while (try ops[0].toRegClass(true, .general_purpose, cg) or
                             try ops[1].toRegClass(true, .general_purpose, cg))
                         {}
                         const base_reg = ops[0].tracking(cg).short.register.to64();
@@ -162410,11 +162635,10 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 rhs_reg,
                                 .u(elem_size),
                             );
-                            try cg.asmRegisterMemory(
-                                .{ ._, .lea },
-                                base_reg,
-                                try ops[0].tracking(cg).short.mem(cg, .{ .index = rhs_reg }),
-                            );
+                            try cg.asmRegisterMemory(.{ ._, .lea }, base_reg, .{
+                                .base = .{ .reg = base_reg },
+                                .mod = .{ .rm = .{ .index = rhs_reg } },
+                            });
                         } else if (elem_size > 8) {
                             try cg.spillEflagsIfOccupied();
                             try cg.asmRegisterImmediate(
@@ -162422,20 +162646,18 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                                 rhs_reg,
                                 .u(std.math.log2_int(u64, elem_size)),
                             );
-                            try cg.asmRegisterMemory(
-                                .{ ._, .lea },
-                                base_reg,
-                                try ops[0].tracking(cg).short.mem(cg, .{ .index = rhs_reg }),
-                            );
-                        } else try cg.asmRegisterMemory(
-                            .{ ._, .lea },
-                            base_reg,
-                            try ops[0].tracking(cg).short.mem(cg, .{
+                            try cg.asmRegisterMemory(.{ ._, .lea }, base_reg, .{
+                                .base = .{ .reg = base_reg },
+                                .mod = .{ .rm = .{ .index = rhs_reg } },
+                            });
+                        } else try cg.asmRegisterMemory(.{ ._, .lea }, base_reg, .{
+                            .base = .{ .reg = base_reg },
+                            .mod = .{ .rm = .{
                                 .index = rhs_reg,
                                 .scale = .fromFactor(@intCast(elem_size)),
-                            }),
-                        );
-                        try ops[0].store(&ops[1], .{}, cg);
+                            } },
+                        });
+                        try ops[0].store(&ops[2], .{}, cg);
                     },
                     else => |e| return e,
                 };
@@ -174453,18 +174675,6 @@ fn airBr(self: *CodeGen, inst: Air.Inst.Index) !void {
     try self.freeValue(block_tracking.short);
 }
 
-fn airRepeat(self: *CodeGen, inst: Air.Inst.Index) !void {
-    const loop_inst = self.air.instructions.items(.data)[@intFromEnum(inst)].repeat.loop_inst;
-    const repeat_info = self.loops.get(loop_inst).?;
-    try self.restoreState(repeat_info.state, &.{}, .{
-        .emit_instructions = true,
-        .update_tracking = false,
-        .resurrect = false,
-        .close_scope = true,
-    });
-    _ = try self.asmJmpReloc(repeat_info.target);
-}
-
 fn airAsm(self: *CodeGen, inst: Air.Inst.Index) !void {
     @setEvalBranchQuota(1_100);
     const pt = self.pt;
src/Zcu/PerThread.zig
@@ -1742,7 +1742,7 @@ pub fn linkerUpdateFunc(pt: Zcu.PerThread, func_index: InternPool.Index, air: *A
     }
 
     const backend = target_util.zigBackend(zcu.root_mod.resolved_target.result, zcu.comp.config.use_llvm);
-    try air.legalize(backend, zcu);
+    try air.legalize(backend, pt);
 
     var liveness = try Air.Liveness.analyze(gpa, air.*, ip);
     defer liveness.deinit(gpa);
test/behavior/x86_64/unary.zig
@@ -4828,6 +4828,7 @@ inline fn ctz(comptime Type: type, rhs: Type) @TypeOf(@ctz(rhs)) {
 test ctz {
     const test_ctz = unary(ctz, .{});
     try test_ctz.testInts();
+    try test_ctz.testIntVectors();
 }
 
 inline fn popCount(comptime Type: type, rhs: Type) @TypeOf(@popCount(rhs)) {
@@ -4836,6 +4837,7 @@ inline fn popCount(comptime Type: type, rhs: Type) @TypeOf(@popCount(rhs)) {
 test popCount {
     const test_pop_count = unary(popCount, .{});
     try test_pop_count.testInts();
+    try test_pop_count.testIntVectors();
 }
 
 inline fn byteSwap(comptime Type: type, rhs: Type) RoundBitsUp(Type, 8) {
@@ -4844,6 +4846,7 @@ inline fn byteSwap(comptime Type: type, rhs: Type) RoundBitsUp(Type, 8) {
 test byteSwap {
     const test_byte_swap = unary(byteSwap, .{});
     try test_byte_swap.testInts();
+    try test_byte_swap.testIntVectors();
 }
 
 inline fn bitReverse(comptime Type: type, rhs: Type) @TypeOf(@bitReverse(rhs)) {
@@ -4852,6 +4855,7 @@ inline fn bitReverse(comptime Type: type, rhs: Type) @TypeOf(@bitReverse(rhs)) {
 test bitReverse {
     const test_bit_reverse = unary(bitReverse, .{});
     try test_bit_reverse.testInts();
+    try test_bit_reverse.testIntVectors();
 }
 
 inline fn sqrt(comptime Type: type, rhs: Type) @TypeOf(@sqrt(rhs)) {
test/behavior/bitreverse.zig
@@ -123,12 +123,12 @@ fn vector8() !void {
 
 test "bitReverse vectors u8" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_riscv64) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest;
 
     try comptime vector8();
     try vector8();
@@ -144,12 +144,12 @@ fn vector16() !void {
 
 test "bitReverse vectors u16" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_riscv64) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest;
 
     try comptime vector16();
     try vector16();
@@ -165,12 +165,12 @@ fn vector24() !void {
 
 test "bitReverse vectors u24" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_riscv64) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest;
 
     try comptime vector24();
     try vector24();
test/behavior/byteswap.zig
@@ -95,12 +95,12 @@ fn vector8() !void {
 
 test "@byteSwap vectors u8" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_riscv64) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest;
 
     try comptime vector8();
     try vector8();
@@ -116,12 +116,12 @@ fn vector16() !void {
 
 test "@byteSwap vectors u16" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_riscv64) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest;
 
     try comptime vector16();
     try vector16();
@@ -137,12 +137,12 @@ fn vector24() !void {
 
 test "@byteSwap vectors u24" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_riscv64) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest;
 
     try comptime vector24();
     try vector24();
test/behavior/math.zig
@@ -193,12 +193,12 @@ fn testCtz128() !void {
 
 test "@ctz vectors" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_riscv64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest;
 
     try testCtzVectors();
     try comptime testCtzVectors();
test/behavior/popcount.zig
@@ -77,12 +77,12 @@ fn testPopCountIntegers() !void {
 
 test "@popCount vectors" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_riscv64) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest;
 
     try comptime testPopCountVectors();
     try testPopCountVectors();