Commit 8f69e977f1

Jacob Young <jacobly0@users.noreply.github.com>
2023-10-23 11:38:11
x86_64: implement 128-bit builtins
* `@clz` * `@ctz` * `@popCount` * `@byteSwap` * `@bitReverse` * various encodings used by std
1 parent fbe8c89
lib/compiler_rt/addf3_test.zig
@@ -76,9 +76,6 @@ fn test__subtf3(a: f128, b: f128, expected_hi: u64, expected_lo: u64) !void {
 }
 
 test "subtf3" {
-    if (builtin.zig_backend == .stage2_x86_64 and
-        !comptime std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .bmi, .lzcnt })) return error.SkipZigTest;
-
     // qNaN - any = qNaN
     try test__subtf3(qnan128, 0x1.23456789abcdefp+5, 0x7fff800000000000, 0x0);
 
lib/compiler_rt/divtf3_test.zig
@@ -1,5 +1,4 @@
 const std = @import("std");
-const builtin = @import("builtin");
 const math = std.math;
 const testing = std.testing;
 
@@ -31,9 +30,6 @@ fn test__divtf3(a: f128, b: f128, expectedHi: u64, expectedLo: u64) !void {
 }
 
 test "divtf3" {
-    if (builtin.zig_backend == .stage2_x86_64 and
-        !comptime std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .bmi, .lzcnt })) return error.SkipZigTest;
-
     // NaN / any = NaN
     try test__divtf3(math.nan(f128), 0x1.23456789abcdefp+5, 0x7fff800000000000, 0);
     // inf / any(except inf and nan) = inf
lib/compiler_rt/float_from_int_test.zig
@@ -1,5 +1,4 @@
 const std = @import("std");
-const builtin = @import("builtin");
 const testing = std.testing;
 const math = std.math;
 
@@ -126,9 +125,6 @@ fn test__floatuntisf(a: u128, expected: f32) !void {
 }
 
 test "floattisf" {
-    if (builtin.zig_backend == .stage2_x86_64 and
-        !comptime std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .bmi, .lzcnt })) return error.SkipZigTest;
-
     try test__floattisf(0, 0.0);
 
     try test__floattisf(1, 1.0);
@@ -175,9 +171,6 @@ test "floattisf" {
 }
 
 test "floatuntisf" {
-    if (builtin.zig_backend == .stage2_x86_64 and
-        !comptime std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .bmi, .lzcnt })) return error.SkipZigTest;
-
     try test__floatuntisf(0, 0.0);
 
     try test__floatuntisf(1, 1.0);
@@ -374,9 +367,6 @@ fn test__floatuntidf(a: u128, expected: f64) !void {
 }
 
 test "floattidf" {
-    if (builtin.zig_backend == .stage2_x86_64 and
-        !comptime std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .bmi, .lzcnt })) return error.SkipZigTest;
-
     try test__floattidf(0, 0.0);
 
     try test__floattidf(1, 1.0);
@@ -447,9 +437,6 @@ test "floattidf" {
 }
 
 test "floatuntidf" {
-    if (builtin.zig_backend == .stage2_x86_64 and
-        !comptime std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .bmi, .lzcnt })) return error.SkipZigTest;
-
     try test__floatuntidf(0, 0.0);
 
     try test__floatuntidf(1, 1.0);
@@ -595,9 +582,6 @@ test "floatditf" {
 }
 
 test "floatunditf" {
-    if (builtin.zig_backend == .stage2_x86_64 and
-        !comptime std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .bmi, .lzcnt })) return error.SkipZigTest;
-
     try test__floatunditf(0xffffffffffffffff, 0x403effffffffffff, 0xfffe000000000000);
     try test__floatunditf(0xfffffffffffffffe, 0x403effffffffffff, 0xfffc000000000000);
     try test__floatunditf(0x8000000000000000, 0x403e000000000000, 0x0);
@@ -619,9 +603,6 @@ fn test__floatuntitf(a: u128, expected: f128) !void {
 }
 
 test "floattitf" {
-    if (builtin.zig_backend == .stage2_x86_64 and
-        !comptime std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .bmi, .lzcnt })) return error.SkipZigTest;
-
     try test__floattitf(0, 0.0);
 
     try test__floattitf(1, 1.0);
@@ -704,9 +685,6 @@ test "floattitf" {
 }
 
 test "floatuntitf" {
-    if (builtin.zig_backend == .stage2_x86_64 and
-        !comptime std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .bmi, .lzcnt })) return error.SkipZigTest;
-
     try test__floatuntitf(0, 0.0);
 
     try test__floatuntitf(1, 1.0);
lib/compiler_rt/fma.zig
@@ -6,10 +6,8 @@
 //! https://git.musl-libc.org/cgit/musl/tree/src/math/fma.c
 
 const std = @import("std");
-const builtin = @import("builtin");
 const math = std.math;
 const expect = std.testing.expect;
-const arch = builtin.cpu.arch;
 const common = @import("common.zig");
 
 pub const panic = common.panic;
@@ -343,9 +341,6 @@ test "64" {
 }
 
 test "128" {
-    if (builtin.zig_backend == .stage2_x86_64 and
-        !comptime std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .bmi, .lzcnt })) return error.SkipZigTest;
-
     const epsilon = 0.000001;
 
     try expect(math.approxEqAbs(f128, fmaq(0.0, 5.0, 9.124), 9.124, epsilon));
lib/compiler_rt/fmodx_test.zig
@@ -24,7 +24,7 @@ fn test_fmodx_infs() !void {
 
 test "fmodx" {
     if (builtin.zig_backend == .stage2_x86_64 and
-        !comptime std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .bmi, .lzcnt })) return error.SkipZigTest;
+        !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .lzcnt)) return error.SkipZigTest;
 
     try test_fmodx(6.4, 4.0, 2.4);
     try test_fmodx(6.4, -4.0, 2.4);
lib/compiler_rt/mulf3_test.zig
@@ -3,7 +3,6 @@
 // https://github.com/llvm/llvm-project/blob/2ffb1b0413efa9a24eb3c49e710e36f92e2cb50b/compiler-rt/test/builtins/Unit/multf3_test.c
 
 const std = @import("std");
-const builtin = @import("builtin");
 const math = std.math;
 const qnan128: f128 = @bitCast(@as(u128, 0x7fff800000000000) << 64);
 const inf128: f128 = @bitCast(@as(u128, 0x7fff000000000000) << 64);
@@ -49,9 +48,6 @@ fn makeNaN128(rand: u64) f128 {
     return @bitCast(int_result);
 }
 test "multf3" {
-    if (builtin.zig_backend == .stage2_x86_64 and
-        !comptime std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .bmi, .lzcnt })) return error.SkipZigTest;
-
     // qNaN * any = qNaN
     try test__multf3(qnan128, 0x1.23456789abcdefp+5, 0x7fff800000000000, 0x0);
 
lib/compiler_rt/udivmodei4.zig
@@ -129,10 +129,8 @@ pub fn __umodei4(r_p: [*]u32, u_p: [*]const u32, v_p: [*]const u32, bits: usize)
 }
 
 test "__udivei4/__umodei4" {
-    switch (builtin.zig_backend) {
-        .stage2_c, .stage2_x86_64 => return error.SkipZigTest,
-        else => {},
-    }
+    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
 
     const RndGen = std.rand.DefaultPrng;
     var rnd = RndGen.init(42);
lib/std/Build/Cache.zig
@@ -1007,8 +1007,6 @@ test "cache file and then recall it" {
         return error.SkipZigTest;
     }
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = testing.tmpDir(.{});
     defer tmp.cleanup();
 
@@ -1075,8 +1073,6 @@ test "check that changing a file makes cache fail" {
         return error.SkipZigTest;
     }
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = testing.tmpDir(.{});
     defer tmp.cleanup();
 
@@ -1151,8 +1147,6 @@ test "no file inputs" {
         return error.SkipZigTest;
     }
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = testing.tmpDir(.{});
     defer tmp.cleanup();
 
@@ -1201,8 +1195,6 @@ test "Manifest with files added after initial hash work" {
         return error.SkipZigTest;
     }
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = testing.tmpDir(.{});
     defer tmp.cleanup();
 
lib/std/compress/deflate/huffman_code.zig
@@ -360,8 +360,6 @@ fn byFreq(context: void, a: LiteralNode, b: LiteralNode) bool {
 }
 
 test "generate a Huffman code from an array of frequencies" {
-    if (@import("builtin").zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var freqs: [19]u16 = [_]u16{
         8, // 0
         1, // 1
lib/std/compress/gzip.zig
@@ -174,8 +174,6 @@ fn testReader(data: []const u8, comptime expected: []const u8) !void {
 // https://tools.ietf.org/rfc/rfc1952.txt length=25037 bytes
 // SHA256=164ef0897b4cbec63abf1b57f069f3599bd0fb7c72c2a4dee21bd7e03ec9af67
 test "compressed data" {
-    if (@import("builtin").zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testReader(
         @embedFile("testdata/rfc1952.txt.gz"),
         @embedFile("testdata/rfc1952.txt"),
lib/std/compress/zlib.zig
@@ -199,8 +199,6 @@ fn testDecompress(data: []const u8, expected: []const u8) !void {
 // https://tools.ietf.org/rfc/rfc1951.txt length=36944 bytes
 // SHA256=5ebf4b5b7fe1c3a0c0ab9aa3ac8c0f3853a7dc484905e76e03b0b0f301350009
 test "compressed data" {
-    if (@import("builtin").zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const rfc1951_txt = @embedFile("testdata/rfc1951.txt");
 
     // Compressed with compression level = 0
lib/std/crypto/25519/x25519.zig
@@ -137,8 +137,6 @@ test "x25519 rfc7748 one iteration" {
 }
 
 test "x25519 rfc7748 1,000 iterations" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     // These iteration tests are slow so we always skip them. Results have been verified.
     if (true) {
         return error.SkipZigTest;
@@ -161,8 +159,6 @@ test "x25519 rfc7748 1,000 iterations" {
 }
 
 test "x25519 rfc7748 1,000,000 iterations" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     if (true) {
         return error.SkipZigTest;
     }
lib/std/crypto/aegis.zig
@@ -17,7 +17,6 @@
 //! https://datatracker.ietf.org/doc/draft-irtf-cfrg-aegis-aead/
 
 const std = @import("std");
-const builtin = @import("builtin");
 const crypto = std.crypto;
 const mem = std.mem;
 const assert = std.debug.assert;
@@ -514,8 +513,6 @@ const htest = @import("test.zig");
 const testing = std.testing;
 
 test "Aegis128L test vector 1" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const key: [Aegis128L.key_length]u8 = [_]u8{ 0x10, 0x01 } ++ [_]u8{0x00} ** 14;
     const nonce: [Aegis128L.nonce_length]u8 = [_]u8{ 0x10, 0x00, 0x02 } ++ [_]u8{0x00} ** 13;
     const ad = [8]u8{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
@@ -539,8 +536,6 @@ test "Aegis128L test vector 1" {
 }
 
 test "Aegis128L test vector 2" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const key: [Aegis128L.key_length]u8 = [_]u8{0x00} ** 16;
     const nonce: [Aegis128L.nonce_length]u8 = [_]u8{0x00} ** 16;
     const ad = [_]u8{};
@@ -558,8 +553,6 @@ test "Aegis128L test vector 2" {
 }
 
 test "Aegis128L test vector 3" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const key: [Aegis128L.key_length]u8 = [_]u8{0x00} ** 16;
     const nonce: [Aegis128L.nonce_length]u8 = [_]u8{0x00} ** 16;
     const ad = [_]u8{};
@@ -576,8 +569,6 @@ test "Aegis128L test vector 3" {
 }
 
 test "Aegis256 test vector 1" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const key: [Aegis256.key_length]u8 = [_]u8{ 0x10, 0x01 } ++ [_]u8{0x00} ** 30;
     const nonce: [Aegis256.nonce_length]u8 = [_]u8{ 0x10, 0x00, 0x02 } ++ [_]u8{0x00} ** 29;
     const ad = [8]u8{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
@@ -601,8 +592,6 @@ test "Aegis256 test vector 1" {
 }
 
 test "Aegis256 test vector 2" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const key: [Aegis256.key_length]u8 = [_]u8{0x00} ** 32;
     const nonce: [Aegis256.nonce_length]u8 = [_]u8{0x00} ** 32;
     const ad = [_]u8{};
@@ -620,8 +609,6 @@ test "Aegis256 test vector 2" {
 }
 
 test "Aegis256 test vector 3" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const key: [Aegis256.key_length]u8 = [_]u8{0x00} ** 32;
     const nonce: [Aegis256.nonce_length]u8 = [_]u8{0x00} ** 32;
     const ad = [_]u8{};
@@ -638,8 +625,6 @@ test "Aegis256 test vector 3" {
 }
 
 test "Aegis MAC" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const key = [_]u8{0x00} ** Aegis128LMac.key_length;
     var msg: [64]u8 = undefined;
     for (&msg, 0..) |*m, i| {
lib/std/crypto/aes.zig
@@ -6,7 +6,7 @@ const has_aesni = std.Target.x86.featureSetHas(builtin.cpu.features, .aes);
 const has_avx = std.Target.x86.featureSetHas(builtin.cpu.features, .avx);
 const has_armaes = std.Target.aarch64.featureSetHas(builtin.cpu.features, .aes);
 // C backend doesn't currently support passing vectors to inline asm.
-const impl = if (builtin.cpu.arch == .x86_64 and builtin.zig_backend != .stage2_c and has_aesni and has_avx) impl: {
+const impl = if (builtin.cpu.arch == .x86_64 and builtin.zig_backend != .stage2_c and builtin.zig_backend != .stage2_x86_64 and has_aesni and has_avx) impl: {
     break :impl @import("aes/aesni.zig");
 } else if (builtin.cpu.arch == .aarch64 and builtin.zig_backend != .stage2_c and has_armaes)
 impl: {
@@ -55,8 +55,6 @@ test "ctr" {
 }
 
 test "encrypt" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     // Appendix B
     {
         const key = [_]u8{ 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c };
@@ -86,8 +84,6 @@ test "encrypt" {
 }
 
 test "decrypt" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     // Appendix B
     {
         const key = [_]u8{ 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c };
@@ -117,8 +113,6 @@ test "decrypt" {
 }
 
 test "expand 128-bit key" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const key = [_]u8{ 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c };
     const exp_enc = [_]*const [32:0]u8{
         "2b7e151628aed2a6abf7158809cf4f3c", "a0fafe1788542cb123a339392a6c7605", "f2c295f27a96b9435935807a7359f67f", "3d80477d4716fe3e1e237e446d7a883b", "ef44a541a8525b7fb671253bdb0bad00", "d4d1c6f87c839d87caf2b8bc11f915bc", "6d88a37a110b3efddbf98641ca0093fd", "4e54f70e5f5fc9f384a64fb24ea6dc4f", "ead27321b58dbad2312bf5607f8d292f", "ac7766f319fadc2128d12941575c006e", "d014f9a8c9ee2589e13f0cc8b6630ca6",
@@ -141,8 +135,6 @@ test "expand 128-bit key" {
 }
 
 test "expand 256-bit key" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const key = [_]u8{
         0x60, 0x3d, 0xeb, 0x10,
         0x15, 0xca, 0x71, 0xbe,
lib/std/crypto/aes_ocb.zig
@@ -261,10 +261,8 @@ inline fn xorWith(x: *Block, y: Block) void {
 const hexToBytes = std.fmt.hexToBytes;
 
 test "AesOcb test vector 1" {
-    switch (builtin.zig_backend) {
-        .stage2_c, .stage2_x86_64 => return error.SkipZigTest,
-        else => {},
-    }
+    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
 
     var k: [Aes128Ocb.key_length]u8 = undefined;
     var nonce: [Aes128Ocb.nonce_length]u8 = undefined;
@@ -283,10 +281,8 @@ test "AesOcb test vector 1" {
 }
 
 test "AesOcb test vector 2" {
-    switch (builtin.zig_backend) {
-        .stage2_c, .stage2_x86_64 => return error.SkipZigTest,
-        else => {},
-    }
+    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
 
     var k: [Aes128Ocb.key_length]u8 = undefined;
     var nonce: [Aes128Ocb.nonce_length]u8 = undefined;
@@ -307,10 +303,8 @@ test "AesOcb test vector 2" {
 }
 
 test "AesOcb test vector 3" {
-    switch (builtin.zig_backend) {
-        .stage2_c, .stage2_x86_64 => return error.SkipZigTest,
-        else => {},
-    }
+    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
 
     var k: [Aes128Ocb.key_length]u8 = undefined;
     var nonce: [Aes128Ocb.nonce_length]u8 = undefined;
@@ -334,10 +328,8 @@ test "AesOcb test vector 3" {
 }
 
 test "AesOcb test vector 4" {
-    switch (builtin.zig_backend) {
-        .stage2_c, .stage2_x86_64 => return error.SkipZigTest,
-        else => {},
-    }
+    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
 
     var k: [Aes128Ocb.key_length]u8 = undefined;
     var nonce: [Aes128Ocb.nonce_length]u8 = undefined;
lib/std/crypto/ghash_polyval.zig
@@ -476,8 +476,6 @@ test "ghash2" {
 }
 
 test "polyval" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const key = [_]u8{0x42} ** 16;
     const m = [_]u8{0x69} ** 256;
 
lib/std/crypto/kyber_d00.zig
@@ -1667,8 +1667,6 @@ test "Test happy flow" {
 const sha2 = crypto.hash.sha2;
 
 test "NIST KAT test" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     inline for (.{
         .{ Kyber512, "e9c2bd37133fcb40772f81559f14b1f58dccd1c816701be9ba6214d43baf4547" },
         .{ Kyber1024, "89248f2f33f7f4f7051729111f3049c409a933ec904aedadf035f30fa5646cd5" },
lib/std/fs/test.zig
@@ -124,8 +124,6 @@ fn testWithAllSupportedPathTypes(test_func: anytype) !void {
 }
 
 test "Dir.readLink" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testWithAllSupportedPathTypes(struct {
         fn impl(ctx: *TestContext) !void {
             // Create some targets
@@ -163,8 +161,6 @@ fn testReadLink(dir: Dir, target_path: []const u8, symlink_path: []const u8) !vo
 }
 
 test "relative symlink to parent directory" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -182,8 +178,6 @@ test "relative symlink to parent directory" {
 }
 
 test "openDir" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testWithAllSupportedPathTypes(struct {
         fn impl(ctx: *TestContext) !void {
             const allocator = ctx.arena.allocator();
@@ -202,8 +196,6 @@ test "openDir" {
 test "accessAbsolute" {
     if (builtin.os.tag == .wasi) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -222,8 +214,6 @@ test "accessAbsolute" {
 test "openDirAbsolute" {
     if (builtin.os.tag == .wasi) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -262,8 +252,6 @@ test "openDir non-cwd parent .." {
         else => {},
     }
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -285,8 +273,6 @@ test "openDir non-cwd parent .." {
 test "readLinkAbsolute" {
     if (builtin.os.tag == .wasi) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -337,8 +323,6 @@ fn testReadLinkAbsolute(target_path: []const u8, symlink_path: []const u8) !void
 }
 
 test "Dir.Iterator" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp_dir = tmpIterableDir(.{});
     defer tmp_dir.cleanup();
 
@@ -369,8 +353,6 @@ test "Dir.Iterator" {
 }
 
 test "Dir.Iterator many entries" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp_dir = tmpIterableDir(.{});
     defer tmp_dir.cleanup();
 
@@ -406,8 +388,6 @@ test "Dir.Iterator many entries" {
 }
 
 test "Dir.Iterator twice" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp_dir = tmpIterableDir(.{});
     defer tmp_dir.cleanup();
 
@@ -441,8 +421,6 @@ test "Dir.Iterator twice" {
 }
 
 test "Dir.Iterator reset" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp_dir = tmpIterableDir(.{});
     defer tmp_dir.cleanup();
 
@@ -479,8 +457,6 @@ test "Dir.Iterator reset" {
 }
 
 test "Dir.Iterator but dir is deleted during iteration" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = std.testing.tmpDir(.{});
     defer tmp.cleanup();
 
@@ -523,8 +499,6 @@ fn contains(entries: *const std.ArrayList(IterableDir.Entry), el: IterableDir.En
 test "Dir.realpath smoke test" {
     if (!comptime std.os.isGetFdPathSupportedOnTarget(builtin.os)) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testWithAllSupportedPathTypes(struct {
         fn impl(ctx: *TestContext) !void {
             const allocator = ctx.arena.allocator();
@@ -575,8 +549,6 @@ test "Dir.realpath smoke test" {
 }
 
 test "readAllAlloc" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp_dir = tmpDir(.{});
     defer tmp_dir.cleanup();
 
@@ -613,8 +585,6 @@ test "Dir.statFile" {
     // TODO: Re-enable once https://github.com/ziglang/zig/issues/17034 is solved
     if (builtin.os.tag == .linux and builtin.link_libc and builtin.abi == .gnu) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testWithAllSupportedPathTypes(struct {
         fn impl(ctx: *TestContext) !void {
             const test_file_name = try ctx.transformPath("test_file");
@@ -630,8 +600,6 @@ test "Dir.statFile" {
 }
 
 test "directory operations on files" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testWithAllSupportedPathTypes(struct {
         fn impl(ctx: *TestContext) !void {
             const test_file_name = try ctx.transformPath("test_file");
@@ -661,8 +629,6 @@ test "file operations on directories" {
     // TODO: fix this test on FreeBSD. https://github.com/ziglang/zig/issues/1759
     if (builtin.os.tag == .freebsd) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testWithAllSupportedPathTypes(struct {
         fn impl(ctx: *TestContext) !void {
             const test_dir_name = try ctx.transformPath("test_dir");
@@ -698,8 +664,6 @@ test "file operations on directories" {
 }
 
 test "makeOpenPath parent dirs do not exist" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp_dir = tmpDir(.{});
     defer tmp_dir.cleanup();
 
@@ -712,8 +676,6 @@ test "makeOpenPath parent dirs do not exist" {
 }
 
 test "deleteDir" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testWithAllSupportedPathTypes(struct {
         fn impl(ctx: *TestContext) !void {
             const test_dir_path = try ctx.transformPath("test_dir");
@@ -735,8 +697,6 @@ test "deleteDir" {
 }
 
 test "Dir.rename files" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testWithAllSupportedPathTypes(struct {
         fn impl(ctx: *TestContext) !void {
             // Rename on Windows can hit intermittent AccessDenied errors
@@ -779,8 +739,6 @@ test "Dir.rename files" {
 }
 
 test "Dir.rename directories" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testWithAllSupportedPathTypes(struct {
         fn impl(ctx: *TestContext) !void {
             // Rename on Windows can hit intermittent AccessDenied errors
@@ -822,8 +780,6 @@ test "Dir.rename directory onto empty dir" {
     // TODO: Fix on Windows, see https://github.com/ziglang/zig/issues/6364
     if (builtin.os.tag == .windows) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testWithAllSupportedPathTypes(struct {
         fn impl(ctx: *TestContext) !void {
             const test_dir_path = try ctx.transformPath("test_dir");
@@ -845,8 +801,6 @@ test "Dir.rename directory onto non-empty dir" {
     // TODO: Fix on Windows, see https://github.com/ziglang/zig/issues/6364
     if (builtin.os.tag == .windows) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testWithAllSupportedPathTypes(struct {
         fn impl(ctx: *TestContext) !void {
             const test_dir_path = try ctx.transformPath("test_dir");
@@ -873,8 +827,6 @@ test "Dir.rename file <-> dir" {
     // TODO: Fix on Windows, see https://github.com/ziglang/zig/issues/6364
     if (builtin.os.tag == .windows) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testWithAllSupportedPathTypes(struct {
         fn impl(ctx: *TestContext) !void {
             const test_file_path = try ctx.transformPath("test_file");
@@ -890,8 +842,6 @@ test "Dir.rename file <-> dir" {
 }
 
 test "rename" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp_dir1 = tmpDir(.{});
     defer tmp_dir1.cleanup();
 
@@ -914,8 +864,6 @@ test "rename" {
 test "renameAbsolute" {
     if (builtin.os.tag == .wasi) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp_dir = tmpDir(.{});
     defer tmp_dir.cleanup();
 
@@ -974,8 +922,6 @@ test "openSelfExe" {
 }
 
 test "makePath, put some files in it, deleteTree" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testWithAllSupportedPathTypes(struct {
         fn impl(ctx: *TestContext) !void {
             const allocator = ctx.arena.allocator();
@@ -992,8 +938,6 @@ test "makePath, put some files in it, deleteTree" {
 }
 
 test "makePath, put some files in it, deleteTreeMinStackSize" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testWithAllSupportedPathTypes(struct {
         fn impl(ctx: *TestContext) !void {
             const allocator = ctx.arena.allocator();
@@ -1012,8 +956,6 @@ test "makePath, put some files in it, deleteTreeMinStackSize" {
 test "makePath in a directory that no longer exists" {
     if (builtin.os.tag == .windows) return error.SkipZigTest; // Windows returns FileBusy if attempting to remove an open dir
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
     try tmp.parent_dir.deleteTree(&tmp.sub_path);
@@ -1045,8 +987,6 @@ fn testFilenameLimits(iterable_dir: IterableDir, maxed_filename: []const u8) !vo
 }
 
 test "max file name component lengths" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpIterableDir(.{});
     defer tmp.cleanup();
 
@@ -1068,8 +1008,6 @@ test "max file name component lengths" {
 }
 
 test "writev, readv" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -1112,8 +1050,6 @@ test "writev, readv" {
 }
 
 test "pwritev, preadv" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -1155,8 +1091,6 @@ test "pwritev, preadv" {
 }
 
 test "access file" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testWithAllSupportedPathTypes(struct {
         fn impl(ctx: *TestContext) !void {
             const dir_path = try ctx.transformPath("os_test_tmp");
@@ -1173,8 +1107,6 @@ test "access file" {
 }
 
 test "sendfile" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -1240,8 +1172,6 @@ test "sendfile" {
 }
 
 test "copyRangeAll" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -1268,8 +1198,6 @@ test "copyRangeAll" {
 }
 
 test "copyFile" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testWithAllSupportedPathTypes(struct {
         fn impl(ctx: *TestContext) !void {
             const data = "u6wj+JmdF3qHsFPE BUlH2g4gJCmEz0PP";
@@ -1300,8 +1228,6 @@ fn expectFileContents(dir: Dir, file_path: []const u8, data: []const u8) !void {
 }
 
 test "AtomicFile" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testWithAllSupportedPathTypes(struct {
         fn impl(ctx: *TestContext) !void {
             const allocator = ctx.arena.allocator();
@@ -1328,8 +1254,6 @@ test "AtomicFile" {
 test "open file with exclusive nonblocking lock twice" {
     if (builtin.os.tag == .wasi) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testWithAllSupportedPathTypes(struct {
         fn impl(ctx: *TestContext) !void {
             const filename = try ctx.transformPath("file_nonblocking_lock_test.txt");
@@ -1346,8 +1270,6 @@ test "open file with exclusive nonblocking lock twice" {
 test "open file with shared and exclusive nonblocking lock" {
     if (builtin.os.tag == .wasi) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testWithAllSupportedPathTypes(struct {
         fn impl(ctx: *TestContext) !void {
             const filename = try ctx.transformPath("file_nonblocking_lock_test.txt");
@@ -1364,8 +1286,6 @@ test "open file with shared and exclusive nonblocking lock" {
 test "open file with exclusive and shared nonblocking lock" {
     if (builtin.os.tag == .wasi) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testWithAllSupportedPathTypes(struct {
         fn impl(ctx: *TestContext) !void {
             const filename = try ctx.transformPath("file_nonblocking_lock_test.txt");
@@ -1380,8 +1300,6 @@ test "open file with exclusive and shared nonblocking lock" {
 }
 
 test "open file with exclusive lock twice, make sure second lock waits" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     if (builtin.single_threaded) return error.SkipZigTest;
 
     if (std.io.is_async) {
@@ -1432,8 +1350,6 @@ test "open file with exclusive lock twice, make sure second lock waits" {
 test "open file with exclusive nonblocking lock twice (absolute paths)" {
     if (builtin.os.tag == .wasi) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var random_bytes: [12]u8 = undefined;
     std.crypto.random.bytes(&random_bytes);
 
@@ -1468,8 +1384,6 @@ test "open file with exclusive nonblocking lock twice (absolute paths)" {
 test "walker" {
     if (builtin.os.tag == .wasi and builtin.link_libc) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpIterableDir(.{});
     defer tmp.cleanup();
 
@@ -1523,8 +1437,6 @@ test "walker" {
 test "walker without fully iterating" {
     if (builtin.os.tag == .wasi and builtin.link_libc) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpIterableDir(.{});
     defer tmp.cleanup();
 
@@ -1637,8 +1549,6 @@ test "chmod" {
     if (builtin.os.tag == .windows or builtin.os.tag == .wasi)
         return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -1661,8 +1571,6 @@ test "chown" {
     if (builtin.os.tag == .windows or builtin.os.tag == .wasi)
         return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -1678,8 +1586,6 @@ test "chown" {
 }
 
 test "File.Metadata" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -1763,8 +1669,6 @@ test "delete a read-only file on windows" {
     if (builtin.os.tag != .windows)
         return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = testing.tmpDir(.{});
     defer tmp.cleanup();
 
@@ -1795,8 +1699,6 @@ test "delete a read-only file on windows" {
 test "delete a setAsCwd directory on Windows" {
     if (builtin.os.tag != .windows) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     // Set tmp dir as current working directory.
     try tmp.dir.setAsCwd();
lib/std/hash/xxhash.zig
@@ -862,8 +862,6 @@ test "xxhash3 iterative api" {
 }
 
 test "xxhash64" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const H = XxHash64;
     try testExpect(H, 0, "", 0xef46db3751d8e999);
     try testExpect(H, 0, "a", 0xd24ec4f1a98c6e5b);
@@ -875,8 +873,6 @@ test "xxhash64" {
 }
 
 test "xxhash64 smhasher" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const Test = struct {
         fn do() !void {
             try expectEqual(verify.smhasher(XxHash64.hash), 0x024B7CF4);
@@ -888,8 +884,6 @@ test "xxhash64 smhasher" {
 }
 
 test "xxhash64 iterative api" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const Test = struct {
         fn do() !void {
             try verify.iterativeApi(XxHash64);
@@ -901,8 +895,6 @@ test "xxhash64 iterative api" {
 }
 
 test "xxhash32" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const H = XxHash32;
 
     try testExpect(H, 0, "", 0x02cc5d05);
@@ -915,8 +907,6 @@ test "xxhash32" {
 }
 
 test "xxhash32 smhasher" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const Test = struct {
         fn do() !void {
             try expectEqual(verify.smhasher(XxHash32.hash), 0xBA88B743);
@@ -928,8 +918,6 @@ test "xxhash32 smhasher" {
 }
 
 test "xxhash32 iterative api" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const Test = struct {
         fn do() !void {
             try verify.iterativeApi(XxHash32);
lib/std/heap/general_purpose_allocator.zig
@@ -1042,8 +1042,6 @@ const TraceKind = enum {
 const test_config = Config{};
 
 test "small allocations - free in same order" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var gpa = GeneralPurposeAllocator(test_config){};
     defer std.testing.expect(gpa.deinit() == .ok) catch @panic("leak");
     const allocator = gpa.allocator();
@@ -1063,8 +1061,6 @@ test "small allocations - free in same order" {
 }
 
 test "small allocations - free in reverse order" {
-    if (@import("builtin").zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var gpa = GeneralPurposeAllocator(test_config){};
     defer std.testing.expect(gpa.deinit() == .ok) catch @panic("leak");
     const allocator = gpa.allocator();
lib/std/http/Client.zig
@@ -491,8 +491,6 @@ pub const Response = struct {
     }
 
     test parseInt3 {
-        if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
         const expectEqual = testing.expectEqual;
         try expectEqual(@as(u10, 0), parseInt3("000".*));
         try expectEqual(@as(u10, 418), parseInt3("418".*));
lib/std/io/multi_writer.zig
@@ -36,7 +36,6 @@ pub fn multiWriter(streams: anytype) MultiWriter(@TypeOf(streams)) {
 const testing = std.testing;
 
 test "MultiWriter" {
-    if (@import("builtin").zig_backend == .stage2_x86_64) return error.SkipZigTest;
     var tmp = testing.tmpDir(.{});
     defer tmp.cleanup();
     var f = try tmp.dir.createFile("t.txt", .{});
lib/std/io/test.zig
@@ -15,8 +15,6 @@ const native_endian = builtin.target.cpu.arch.endian();
 const tmpDir = std.testing.tmpDir;
 
 test "write a file, read it, then delete it" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -110,8 +108,6 @@ test "BitStreams with File Stream" {
 }
 
 test "File seek ops" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -139,8 +135,6 @@ test "File seek ops" {
 }
 
 test "setEndPos" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
lib/std/math/big/int_test.zig
@@ -71,8 +71,6 @@ test "big.int set negative minimum" {
 }
 
 test "big.int set double-width maximum then zero" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var a = try Managed.initSet(testing.allocator, maxInt(DoubleLimb));
     defer a.deinit();
     try a.set(@as(DoubleLimb, 0));
lib/std/math/ilogb.zig
@@ -125,8 +125,6 @@ test "80" {
 }
 
 test "128" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try expect(ilogbX(f128, 0.0) == fp_ilogb0);
     try expect(ilogbX(f128, 0.5) == -1);
     try expect(ilogbX(f128, 0.8923) == -1);
@@ -162,8 +160,6 @@ test "64 special" {
 }
 
 test "80 special" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try expect(ilogbX(f80, math.inf(f80)) == maxInt(i32));
     try expect(ilogbX(f80, -math.inf(f80)) == maxInt(i32));
     try expect(ilogbX(f80, 0.0) == minInt(i32));
@@ -171,8 +167,6 @@ test "80 special" {
 }
 
 test "128 special" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try expect(ilogbX(f128, math.inf(f128)) == maxInt(i32));
     try expect(ilogbX(f128, -math.inf(f128)) == maxInt(i32));
     try expect(ilogbX(f128, 0.0) == minInt(i32));
lib/std/math/scalbn.zig
@@ -7,8 +7,6 @@ const expect = std.testing.expect;
 pub const scalbn = @import("ldexp.zig").ldexp;
 
 test "math.scalbn" {
-    if (@import("builtin").zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     // Verify we are using base 2.
     try expect(scalbn(@as(f16, 1.5), 4) == 24.0);
     try expect(scalbn(@as(f32, 1.5), 4) == 24.0);
lib/std/net/test.zig
@@ -294,8 +294,6 @@ test "listen on a unix socket, send bytes, receive bytes" {
     if (builtin.single_threaded) return error.SkipZigTest;
     if (!net.has_unix_sockets) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     if (builtin.os.tag == .windows) {
         _ = try std.os.windows.WSAStartup(2, 2);
     }
lib/std/os/linux/io_uring.zig
@@ -1741,8 +1741,6 @@ test "readv" {
 test "writev/fsync/readv" {
     if (builtin.os.tag != .linux) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var ring = IO_Uring.init(4, 0) catch |err| switch (err) {
         error.SystemOutdated => return error.SkipZigTest,
         error.PermissionDenied => return error.SkipZigTest,
@@ -1813,8 +1811,6 @@ test "writev/fsync/readv" {
 test "write/read" {
     if (builtin.os.tag != .linux) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var ring = IO_Uring.init(2, 0) catch |err| switch (err) {
         error.SystemOutdated => return error.SkipZigTest,
         error.PermissionDenied => return error.SkipZigTest,
@@ -1862,8 +1858,6 @@ test "write/read" {
 test "splice/read" {
     if (builtin.os.tag != .linux) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var ring = IO_Uring.init(4, 0) catch |err| switch (err) {
         error.SystemOutdated => return error.SkipZigTest,
         error.PermissionDenied => return error.SkipZigTest,
@@ -1935,8 +1929,6 @@ test "splice/read" {
 test "write_fixed/read_fixed" {
     if (builtin.os.tag != .linux) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var ring = IO_Uring.init(2, 0) catch |err| switch (err) {
         error.SystemOutdated => return error.SkipZigTest,
         error.PermissionDenied => return error.SkipZigTest,
@@ -2002,8 +1994,6 @@ test "write_fixed/read_fixed" {
 test "openat" {
     if (builtin.os.tag != .linux) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var ring = IO_Uring.init(1, 0) catch |err| switch (err) {
         error.SystemOutdated => return error.SkipZigTest,
         error.PermissionDenied => return error.SkipZigTest,
@@ -2055,8 +2045,6 @@ test "openat" {
 test "close" {
     if (builtin.os.tag != .linux) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var ring = IO_Uring.init(1, 0) catch |err| switch (err) {
         error.SystemOutdated => return error.SkipZigTest,
         error.PermissionDenied => return error.SkipZigTest,
@@ -2392,8 +2380,6 @@ test "accept/connect/recv/link_timeout" {
 test "fallocate" {
     if (builtin.os.tag != .linux) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var ring = IO_Uring.init(1, 0) catch |err| switch (err) {
         error.SystemOutdated => return error.SkipZigTest,
         error.PermissionDenied => return error.SkipZigTest,
@@ -2440,8 +2426,6 @@ test "fallocate" {
 test "statx" {
     if (builtin.os.tag != .linux) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var ring = IO_Uring.init(1, 0) catch |err| switch (err) {
         error.SystemOutdated => return error.SkipZigTest,
         error.PermissionDenied => return error.SkipZigTest,
@@ -2698,8 +2682,6 @@ test "shutdown" {
 test "renameat" {
     if (builtin.os.tag != .linux) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var ring = IO_Uring.init(1, 0) catch |err| switch (err) {
         error.SystemOutdated => return error.SkipZigTest,
         error.PermissionDenied => return error.SkipZigTest,
@@ -2769,8 +2751,6 @@ test "renameat" {
 test "unlinkat" {
     if (builtin.os.tag != .linux) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var ring = IO_Uring.init(1, 0) catch |err| switch (err) {
         error.SystemOutdated => return error.SkipZigTest,
         error.PermissionDenied => return error.SkipZigTest,
@@ -2823,8 +2803,6 @@ test "unlinkat" {
 test "mkdirat" {
     if (builtin.os.tag != .linux) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var ring = IO_Uring.init(1, 0) catch |err| switch (err) {
         error.SystemOutdated => return error.SkipZigTest,
         error.PermissionDenied => return error.SkipZigTest,
@@ -2869,8 +2847,6 @@ test "mkdirat" {
 test "symlinkat" {
     if (builtin.os.tag != .linux) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var ring = IO_Uring.init(1, 0) catch |err| switch (err) {
         error.SystemOutdated => return error.SkipZigTest,
         error.PermissionDenied => return error.SkipZigTest,
@@ -2919,8 +2895,6 @@ test "symlinkat" {
 test "linkat" {
     if (builtin.os.tag != .linux) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var ring = IO_Uring.init(1, 0) catch |err| switch (err) {
         error.SystemOutdated => return error.SkipZigTest,
         error.PermissionDenied => return error.SkipZigTest,
lib/std/os/linux/test.zig
@@ -8,8 +8,6 @@ const expectEqual = std.testing.expectEqual;
 const fs = std.fs;
 
 test "fallocate" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = std.testing.tmpDir(.{});
     defer tmp.cleanup();
 
@@ -71,8 +69,6 @@ test "timer" {
 }
 
 test "statx" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = std.testing.tmpDir(.{});
     defer tmp.cleanup();
 
@@ -111,8 +107,6 @@ test "user and group ids" {
 }
 
 test "fadvise" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = std.testing.tmpDir(.{});
     defer tmp.cleanup();
 
lib/std/os/test.zig
@@ -88,8 +88,6 @@ test "chdir smoke test" {
 test "open smoke test" {
     if (native_os == .wasi) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     // TODO verify file attributes using `fstat`
 
     var tmp = tmpDir(.{});
@@ -144,8 +142,6 @@ test "open smoke test" {
 test "openat smoke test" {
     if (native_os == .wasi and builtin.link_libc) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     // TODO verify file attributes using `fstatat`
 
     var tmp = tmpDir(.{});
@@ -280,8 +276,6 @@ test "link with relative paths" {
 test "linkat with different directories" {
     if (native_os == .wasi and builtin.link_libc) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     switch (native_os) {
         .wasi, .linux, .solaris, .illumos => {},
         else => return error.SkipZigTest,
@@ -327,8 +321,6 @@ test "fstatat" {
     // enable when `fstat` and `fstatat` are implemented on Windows
     if (native_os == .windows) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -348,8 +340,6 @@ test "fstatat" {
 }
 
 test "readlinkat" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -583,8 +573,6 @@ test "mmap" {
     if (native_os == .windows or native_os == .wasi)
         return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -695,8 +683,6 @@ test "fcntl" {
     if (native_os == .windows or native_os == .wasi)
         return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -737,8 +723,6 @@ test "sync" {
     if (native_os != .linux)
         return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -759,8 +743,6 @@ test "fsync" {
         else => return error.SkipZigTest,
     }
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -780,8 +762,6 @@ test "getrlimit and setrlimit" {
         return error.SkipZigTest;
     }
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     inline for (std.meta.fields(os.rlimit_resource)) |field| {
         const resource = @as(os.rlimit_resource, @enumFromInt(field.value));
         const limit = try os.getrlimit(resource);
@@ -903,8 +883,6 @@ test "dup & dup2" {
         else => return error.SkipZigTest,
     }
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -934,8 +912,6 @@ test "dup & dup2" {
 test "writev longer than IOV_MAX" {
     if (native_os == .windows or native_os == .wasi) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -953,8 +929,6 @@ test "POSIX file locking with fcntl" {
         return error.SkipZigTest;
     }
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     if (true) {
         // https://github.com/ziglang/zig/issues/11074
         return error.SkipZigTest;
@@ -1017,8 +991,6 @@ test "POSIX file locking with fcntl" {
 test "rename smoke test" {
     if (native_os == .wasi) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -1075,8 +1047,6 @@ test "rename smoke test" {
 test "access smoke test" {
     if (native_os == .wasi) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -1141,8 +1111,6 @@ test "timerfd" {
 }
 
 test "isatty" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -1155,8 +1123,6 @@ test "isatty" {
 test "read with empty buffer" {
     if (native_os == .wasi) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -1182,8 +1148,6 @@ test "read with empty buffer" {
 test "pread with empty buffer" {
     if (native_os == .wasi) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -1209,8 +1173,6 @@ test "pread with empty buffer" {
 test "write with empty buffer" {
     if (native_os == .wasi) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -1236,8 +1198,6 @@ test "write with empty buffer" {
 test "pwrite with empty buffer" {
     if (native_os == .wasi) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
@@ -1263,8 +1223,6 @@ test "pwrite with empty buffer" {
 test "fchmodat smoke test" {
     if (!std.fs.has_executable_bit) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var tmp = tmpDir(.{});
     defer tmp.cleanup();
 
lib/std/os/windows.zig
@@ -2491,8 +2491,6 @@ pub fn ntToWin32Namespace(path: []const u16) !PathSpace {
 }
 
 test "ntToWin32Namespace" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const L = std.unicode.utf8ToUtf16LeStringLiteral;
 
     try testNtToWin32Namespace(L("UNC"), L("\\??\\UNC"));
lib/std/Thread/Condition.zig
@@ -324,6 +324,8 @@ test "Condition - wait and signal" {
         return error.SkipZigTest;
     }
 
+    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
+
     const num_threads = 4;
 
     const MultiWait = struct {
@@ -369,8 +371,6 @@ test "Condition - signal" {
         return error.SkipZigTest;
     }
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const num_threads = 4;
 
     const SignalTest = struct {
lib/std/Thread/Semaphore.zig
@@ -39,6 +39,8 @@ test "Thread.Semaphore" {
         return error.SkipZigTest;
     }
 
+    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
+
     const TestContext = struct {
         sem: *Semaphore,
         n: *i32,
lib/std/base64.zig
@@ -363,8 +363,6 @@ test "base64" {
 }
 
 test "base64 padding dest overflow" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const input = "foo";
 
     var expect: [128]u8 = undefined;
lib/std/crypto.zig
@@ -314,8 +314,6 @@ test "CSPRNG" {
 }
 
 test "issue #4532: no index out of bounds" {
-    if (@import("builtin").zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const types = [_]type{
         hash.Md5,
         hash.Sha1,
lib/std/fs.zig
@@ -3244,15 +3244,15 @@ fn copy_file(fd_in: os.fd_t, fd_out: os.fd_t, maybe_size: ?u64) CopyFileRawError
 }
 
 test {
-    if (builtin.zig_backend != .stage2_x86_64) {
-        if (builtin.os.tag != .wasi) {
-            _ = &makeDirAbsolute;
-            _ = &makeDirAbsoluteZ;
-            _ = &copyFileAbsolute;
+    if (builtin.os.tag != .wasi) {
+        _ = &makeDirAbsolute;
+        _ = &makeDirAbsoluteZ;
+        _ = &copyFileAbsolute;
+        if (builtin.zig_backend != .stage2_x86_64) {
             _ = &updateFileAbsolute;
         }
-        _ = &Dir.copyFile;
     }
+    _ = &Dir.copyFile;
     _ = @import("fs/test.zig");
     _ = @import("fs/path.zig");
     _ = @import("fs/file.zig");
lib/std/hash_map.zig
@@ -1873,8 +1873,6 @@ test "std.hash_map multiple removes on same metadata" {
 }
 
 test "std.hash_map put and remove loop in random order" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var map = AutoHashMap(u32, u32).init(std.testing.allocator);
     defer map.deinit();
 
lib/std/mem.zig
@@ -2060,10 +2060,7 @@ pub fn writeVarPackedInt(bytes: []u8, bit_offset: usize, bit_count: usize, value
 }
 
 test "writeIntBig and writeIntLittle" {
-    switch (builtin.zig_backend) {
-        .stage2_c, .stage2_x86_64 => return error.SkipZigTest,
-        else => {},
-    }
+    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
 
     var buf0: [0]u8 = undefined;
     var buf1: [1]u8 = undefined;
@@ -4664,6 +4661,7 @@ test "read/write(Var)PackedInt" {
         .stage2_c, .stage2_x86_64 => return error.SkipZigTest,
         else => {},
     }
+
     switch (builtin.cpu.arch) {
         // This test generates too much code to execute on WASI.
         // LLVM backend fails with "too many locals: locals exceed maximum"
lib/std/priority_dequeue.zig
@@ -705,8 +705,6 @@ test "std.PriorityDequeue: fromOwnedSlice trivial case 1" {
 }
 
 test "std.PriorityDequeue: fromOwnedSlice" {
-    if (@import("builtin").zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const items = [_]u32{ 15, 7, 21, 14, 13, 22, 12, 6, 7, 25, 5, 24, 11, 16, 15, 24, 2, 1 };
     const queue_items = try testing.allocator.dupe(u32, items[0..]);
     var queue = PDQ.fromOwnedSlice(testing.allocator, queue_items[0..], {});
lib/std/priority_queue.zig
@@ -385,8 +385,6 @@ test "std.PriorityQueue: fromOwnedSlice trivial case 1" {
 }
 
 test "std.PriorityQueue: fromOwnedSlice" {
-    if (@import("builtin").zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const items = [_]u32{ 15, 7, 21, 14, 13, 22, 12, 6, 7, 25, 5, 24, 11, 16, 15, 24, 2, 1 };
     const heap_items = try testing.allocator.dupe(u32, items[0..]);
     var queue = PQlt.fromOwnedSlice(testing.allocator, heap_items[0..], {});
lib/std/time.zig
@@ -321,8 +321,6 @@ pub const Timer = struct {
 };
 
 test "Timer + Instant" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const margin = ns_per_ms * 150;
 
     var timer = try Timer.start();
src/arch/x86_64/CodeGen.zig
@@ -1460,7 +1460,10 @@ fn asmRegisterRegisterRegisterImmediate(
             .r1 = reg1,
             .r2 = reg2,
             .r3 = reg3,
-            .i = @as(u8, @intCast(imm.unsigned)),
+            .i = switch (imm) {
+                .signed => |s| @bitCast(@as(i8, @intCast(s))),
+                .unsigned => |u| @intCast(u),
+            },
         } },
     });
 }
@@ -2084,7 +2087,7 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
             .get_union_tag   => try self.airGetUnionTag(inst),
             .clz             => try self.airClz(inst),
             .ctz             => try self.airCtz(inst),
-            .popcount        => try self.airPopcount(inst),
+            .popcount        => try self.airPopCount(inst),
             .byte_swap       => try self.airByteSwap(inst),
             .bit_reverse     => try self.airBitReverse(inst),
             .tag_name        => try self.airTagName(inst),
@@ -3019,11 +3022,8 @@ fn airTrunc(self: *Self, inst: Air.Inst.Index) !void {
         else if (dst_abi_size <= 8)
             try self.copyToRegisterWithInstTracking(inst, dst_ty, src_mcv)
         else if (dst_abi_size <= 16) dst: {
-            const dst_regs = try self.register_manager.allocRegs(
-                2,
-                .{ inst, inst },
-                abi.RegisterClass.gp,
-            );
+            const dst_regs =
+                try self.register_manager.allocRegs(2, .{ inst, inst }, abi.RegisterClass.gp);
             const dst_mcv: MCValue = .{ .register_pair = dst_regs };
             const dst_locks = self.register_manager.lockRegsAssumeUnused(2, dst_regs);
             defer for (dst_locks) |lock| self.register_manager.unlockReg(lock);
@@ -4671,6 +4671,19 @@ fn airClz(self: *Self, inst: Air.Inst.Index) !void {
         if (src_ty.zigTypeTag(mod) == .Vector) return self.fail("TODO implement airClz for {}", .{
             src_ty.fmt(mod),
         });
+        const src_bits: u32 = @intCast(src_ty.bitSize(mod));
+
+        const has_lzcnt = self.hasFeature(.lzcnt);
+        if (src_bits > 64 and !has_lzcnt) {
+            var callee_buf: ["__clz?i2".len]u8 = undefined;
+            break :result try self.genCall(.{ .lib = .{
+                .return_type = .i32_type,
+                .param_types = &.{src_ty.toIntern()},
+                .callee = std.fmt.bufPrint(&callee_buf, "__clz{c}i2", .{
+                    intCompilerRtAbiName(src_bits),
+                }) catch unreachable,
+            } }, &.{src_ty}, &.{.{ .air_ref = ty_op.operand }});
+        }
 
         const src_mcv = try self.resolveInst(ty_op.operand);
         const mat_src_mcv = switch (src_mcv) {
@@ -4688,8 +4701,7 @@ fn airClz(self: *Self, inst: Air.Inst.Index) !void {
         const dst_lock = self.register_manager.lockRegAssumeUnused(dst_reg);
         defer self.register_manager.unlockReg(dst_lock);
 
-        const src_bits = src_ty.bitSize(mod);
-        if (self.hasFeature(.lzcnt)) {
+        if (has_lzcnt) {
             if (src_bits <= 8) {
                 const wide_reg = try self.copyToTmpRegister(src_ty, mat_src_mcv);
                 try self.truncateRegister(src_ty, wide_reg);
@@ -4712,24 +4724,33 @@ fn airClz(self: *Self, inst: Air.Inst.Index) !void {
                 const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
                 defer self.register_manager.unlockReg(tmp_lock);
 
-                try self.genBinOpMir(.{ ._, .lzcnt }, Type.u64, dst_mcv, mat_src_mcv);
+                try self.genBinOpMir(
+                    .{ ._, .lzcnt },
+                    Type.u64,
+                    dst_mcv,
+                    if (mat_src_mcv.isMemory())
+                        mat_src_mcv
+                    else
+                        .{ .register = mat_src_mcv.register_pair[0] },
+                );
                 try self.genBinOpMir(.{ ._, .add }, dst_ty, dst_mcv, .{ .immediate = 64 });
                 try self.genBinOpMir(
                     .{ ._, .lzcnt },
                     Type.u64,
                     tmp_mcv,
-                    mat_src_mcv.address().offset(8).deref(),
+                    if (mat_src_mcv.isMemory())
+                        mat_src_mcv.address().offset(8).deref()
+                    else
+                        .{ .register = mat_src_mcv.register_pair[1] },
                 );
                 try self.asmCmovccRegisterRegister(.nc, dst_reg.to32(), tmp_reg.to32());
 
-                if (src_bits < 128) {
-                    try self.genBinOpMir(
-                        .{ ._, .sub },
-                        dst_ty,
-                        dst_mcv,
-                        .{ .immediate = 128 - src_bits },
-                    );
-                }
+                if (src_bits < 128) try self.genBinOpMir(
+                    .{ ._, .sub },
+                    dst_ty,
+                    dst_mcv,
+                    .{ .immediate = 128 - src_bits },
+                );
             } else return self.fail("TODO airClz of {}", .{src_ty.fmt(mod)});
             break :result dst_mcv;
         }
@@ -4800,7 +4821,22 @@ fn airCtz(self: *Self, inst: Air.Inst.Index) !void {
     const result = result: {
         const dst_ty = self.typeOfIndex(inst);
         const src_ty = self.typeOf(ty_op.operand);
-        const src_bits = src_ty.bitSize(mod);
+        if (src_ty.zigTypeTag(mod) == .Vector) return self.fail("TODO implement airClz for {}", .{
+            src_ty.fmt(mod),
+        });
+        const src_bits: u32 = @intCast(src_ty.bitSize(mod));
+
+        const has_bmi = self.hasFeature(.bmi);
+        if (src_bits > 64 and !has_bmi) {
+            var callee_buf: ["__ctz?i2".len]u8 = undefined;
+            break :result try self.genCall(.{ .lib = .{
+                .return_type = .i32_type,
+                .param_types = &.{src_ty.toIntern()},
+                .callee = std.fmt.bufPrint(&callee_buf, "__ctz{c}i2", .{
+                    intCompilerRtAbiName(src_bits),
+                }) catch unreachable,
+            } }, &.{src_ty}, &.{.{ .air_ref = ty_op.operand }});
+        }
 
         const src_mcv = try self.resolveInst(ty_op.operand);
         const mat_src_mcv = switch (src_mcv) {
@@ -4845,8 +4881,16 @@ fn airCtz(self: *Self, inst: Air.Inst.Index) !void {
                 const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
                 defer self.register_manager.unlockReg(tmp_lock);
 
+                const lo_mat_src_mcv: MCValue = if (mat_src_mcv.isMemory())
+                    mat_src_mcv
+                else
+                    .{ .register = mat_src_mcv.register_pair[0] };
+                const hi_mat_src_mcv: MCValue = if (mat_src_mcv.isMemory())
+                    mat_src_mcv.address().offset(8).deref()
+                else
+                    .{ .register = mat_src_mcv.register_pair[1] };
                 const masked_mcv = if (src_bits < 128) masked: {
-                    try self.genCopy(Type.u64, dst_mcv, mat_src_mcv.address().offset(8).deref());
+                    try self.genCopy(Type.u64, dst_mcv, hi_mat_src_mcv);
                     try self.genBinOpMir(
                         .{ ._, .@"or" },
                         Type.u64,
@@ -4854,10 +4898,10 @@ fn airCtz(self: *Self, inst: Air.Inst.Index) !void {
                         .{ .immediate = @as(u64, math.maxInt(u64)) << @intCast(src_bits - 64) },
                     );
                     break :masked dst_mcv;
-                } else mat_src_mcv.address().offset(8).deref();
+                } else hi_mat_src_mcv;
                 try self.genBinOpMir(.{ ._, .tzcnt }, Type.u64, dst_mcv, masked_mcv);
                 try self.genBinOpMir(.{ ._, .add }, dst_ty, dst_mcv, .{ .immediate = 64 });
-                try self.genBinOpMir(.{ ._, .tzcnt }, Type.u64, tmp_mcv, mat_src_mcv);
+                try self.genBinOpMir(.{ ._, .tzcnt }, Type.u64, tmp_mcv, lo_mat_src_mcv);
                 try self.asmCmovccRegisterRegister(.nc, dst_reg.to32(), tmp_reg.to32());
             } else return self.fail("TODO airCtz of {}", .{src_ty.fmt(mod)});
             break :result dst_mcv;
@@ -4889,155 +4933,216 @@ fn airCtz(self: *Self, inst: Air.Inst.Index) !void {
     return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
 }
 
-fn airPopcount(self: *Self, inst: Air.Inst.Index) !void {
+fn airPopCount(self: *Self, inst: Air.Inst.Index) !void {
     const mod = self.bin_file.options.module.?;
     const ty_op = self.air.instructions.items(.data)[inst].ty_op;
     const result: MCValue = result: {
+        try self.spillEflagsIfOccupied();
+
         const src_ty = self.typeOf(ty_op.operand);
         const src_abi_size: u32 = @intCast(src_ty.abiSize(mod));
-        if (src_ty.zigTypeTag(mod) == .Vector or src_abi_size > 8)
-            return self.fail("TODO implement airPopcount for {}", .{src_ty.fmt(mod)});
+        if (src_ty.zigTypeTag(mod) == .Vector or src_abi_size > 16)
+            return self.fail("TODO implement airPopCount for {}", .{src_ty.fmt(mod)});
         const src_mcv = try self.resolveInst(ty_op.operand);
 
-        if (self.hasFeature(.popcnt)) {
-            const mat_src_mcv = switch (src_mcv) {
-                .immediate => MCValue{ .register = try self.copyToTmpRegister(src_ty, src_mcv) },
-                else => src_mcv,
-            };
-            const mat_src_lock = switch (mat_src_mcv) {
-                .register => |reg| self.register_manager.lockReg(reg),
-                else => null,
-            };
-            defer if (mat_src_lock) |lock| self.register_manager.unlockReg(lock);
+        const mat_src_mcv = switch (src_mcv) {
+            .immediate => MCValue{ .register = try self.copyToTmpRegister(src_ty, src_mcv) },
+            else => src_mcv,
+        };
+        const mat_src_lock = switch (mat_src_mcv) {
+            .register => |reg| self.register_manager.lockReg(reg),
+            else => null,
+        };
+        defer if (mat_src_lock) |lock| self.register_manager.unlockReg(lock);
 
-            const dst_mcv: MCValue =
-                if (src_mcv.isRegister() and self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
-                src_mcv
+        if (src_abi_size <= 8) {
+            const dst_contains_src =
+                src_mcv.isRegister() and self.reuseOperand(inst, ty_op.operand, 0, src_mcv);
+            const dst_reg = if (dst_contains_src)
+                src_mcv.getReg().?
             else
-                .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.gp) };
+                try self.register_manager.allocReg(inst, abi.RegisterClass.gp);
+            const dst_lock = self.register_manager.lockReg(dst_reg);
+            defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
 
-            const popcnt_ty = if (src_abi_size > 1) src_ty else Type.u16;
-            try self.genBinOpMir(.{ ._, .popcnt }, popcnt_ty, dst_mcv, mat_src_mcv);
-            break :result dst_mcv;
+            try self.genPopCount(dst_reg, src_ty, mat_src_mcv, dst_contains_src);
+            break :result .{ .register = dst_reg };
         }
 
-        const mask = @as(u64, math.maxInt(u64)) >> @intCast(64 - src_abi_size * 8);
-        const imm_0_1 = Immediate.u(mask / 0b1_1);
-        const imm_00_11 = Immediate.u(mask / 0b01_01);
-        const imm_0000_1111 = Immediate.u(mask / 0b0001_0001);
-        const imm_0000_0001 = Immediate.u(mask / 0b1111_1111);
+        assert(src_abi_size > 8 and src_abi_size <= 16);
+        const tmp_regs = try self.register_manager.allocRegs(2, .{ inst, null }, abi.RegisterClass.gp);
+        const tmp_locks = self.register_manager.lockRegsAssumeUnused(2, tmp_regs);
+        defer for (tmp_locks) |lock| self.register_manager.unlockReg(lock);
 
-        const dst_mcv = if (src_mcv.isRegister() and self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
-            src_mcv
+        try self.genPopCount(tmp_regs[0], Type.usize, if (mat_src_mcv.isMemory())
+            mat_src_mcv
         else
-            try self.copyToRegisterWithInstTracking(inst, src_ty, src_mcv);
-        const dst_reg = dst_mcv.register;
-        const dst_lock = self.register_manager.lockRegAssumeUnused(dst_reg);
-        defer self.register_manager.unlockReg(dst_lock);
+            .{ .register = mat_src_mcv.register_pair[0] }, false);
+        try self.genPopCount(tmp_regs[1], Type.usize, if (mat_src_mcv.isMemory())
+            mat_src_mcv.address().offset(8).deref()
+        else
+            .{ .register = mat_src_mcv.register_pair[1] }, false);
+        try self.asmRegisterRegister(.{ ._, .add }, tmp_regs[0].to8(), tmp_regs[1].to8());
+        break :result .{ .register = tmp_regs[0] };
+    };
+    return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
+}
 
-        const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
-        const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
-        defer self.register_manager.unlockReg(tmp_lock);
+fn genPopCount(
+    self: *Self,
+    dst_reg: Register,
+    src_ty: Type,
+    src_mcv: MCValue,
+    dst_contains_src: bool,
+) !void {
+    const mod = self.bin_file.options.module.?;
 
-        {
-            const dst = registerAlias(dst_reg, src_abi_size);
-            const tmp = registerAlias(tmp_reg, src_abi_size);
-            const imm = if (src_abi_size > 4)
-                try self.register_manager.allocReg(null, abi.RegisterClass.gp)
-            else
-                undefined;
-
-            // dst = operand
-            try self.asmRegisterRegister(.{ ._, .mov }, tmp, dst);
-            // tmp = operand
-            try self.asmRegisterImmediate(.{ ._r, .sh }, tmp, Immediate.u(1));
-            // tmp = operand >> 1
-            if (src_abi_size > 4) {
-                try self.asmRegisterImmediate(.{ ._, .mov }, imm, imm_0_1);
-                try self.asmRegisterRegister(.{ ._, .@"and" }, tmp, imm);
-            } else try self.asmRegisterImmediate(.{ ._, .@"and" }, tmp, imm_0_1);
-            // tmp = (operand >> 1) & 0x55...55
-            try self.asmRegisterRegister(.{ ._, .sub }, dst, tmp);
-            // dst = temp1 = operand - ((operand >> 1) & 0x55...55)
-            try self.asmRegisterRegister(.{ ._, .mov }, tmp, dst);
-            // tmp = temp1
-            try self.asmRegisterImmediate(.{ ._r, .sh }, dst, Immediate.u(2));
-            // dst = temp1 >> 2
-            if (src_abi_size > 4) {
-                try self.asmRegisterImmediate(.{ ._, .mov }, imm, imm_00_11);
-                try self.asmRegisterRegister(.{ ._, .@"and" }, tmp, imm);
-                try self.asmRegisterRegister(.{ ._, .@"and" }, dst, imm);
-            } else {
-                try self.asmRegisterImmediate(.{ ._, .@"and" }, tmp, imm_00_11);
-                try self.asmRegisterImmediate(.{ ._, .@"and" }, dst, imm_00_11);
-            }
-            // tmp = temp1 & 0x33...33
-            // dst = (temp1 >> 2) & 0x33...33
-            try self.asmRegisterRegister(.{ ._, .add }, tmp, dst);
-            // tmp = temp2 = (temp1 & 0x33...33) + ((temp1 >> 2) & 0x33...33)
-            try self.asmRegisterRegister(.{ ._, .mov }, dst, tmp);
-            // dst = temp2
-            try self.asmRegisterImmediate(.{ ._r, .sh }, tmp, Immediate.u(4));
-            // tmp = temp2 >> 4
-            try self.asmRegisterRegister(.{ ._, .add }, dst, tmp);
-            // dst = temp2 + (temp2 >> 4)
-            if (src_abi_size > 4) {
-                try self.asmRegisterImmediate(.{ ._, .mov }, imm, imm_0000_1111);
-                try self.asmRegisterImmediate(.{ ._, .mov }, tmp, imm_0000_0001);
-                try self.asmRegisterRegister(.{ ._, .@"and" }, dst, imm);
-                try self.asmRegisterRegister(.{ .i_, .mul }, dst, tmp);
-            } else {
-                try self.asmRegisterImmediate(.{ ._, .@"and" }, dst, imm_0000_1111);
-                if (src_abi_size > 1) {
-                    try self.asmRegisterRegisterImmediate(.{ .i_, .mul }, dst, dst, imm_0000_0001);
-                }
-            }
-            // dst = temp3 = (temp2 + (temp2 >> 4)) & 0x0f...0f
-            // dst = temp3 * 0x01...01
-            if (src_abi_size > 1) {
-                try self.asmRegisterImmediate(.{ ._r, .sh }, dst, Immediate.u((src_abi_size - 1) * 8));
-            }
-            // dst = (temp3 * 0x01...01) >> (bits - 8)
+    const src_abi_size: u32 = @intCast(src_ty.abiSize(mod));
+    if (self.hasFeature(.popcnt)) return self.genBinOpMir(
+        .{ ._, .popcnt },
+        if (src_abi_size > 1) src_ty else Type.u16,
+        .{ .register = dst_reg },
+        src_mcv,
+    );
+
+    const mask = @as(u64, math.maxInt(u64)) >> @intCast(64 - src_abi_size * 8);
+    const imm_0_1 = Immediate.u(mask / 0b1_1);
+    const imm_00_11 = Immediate.u(mask / 0b01_01);
+    const imm_0000_1111 = Immediate.u(mask / 0b0001_0001);
+    const imm_0000_0001 = Immediate.u(mask / 0b1111_1111);
+
+    const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
+    const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+    defer self.register_manager.unlockReg(tmp_lock);
+
+    const dst = registerAlias(dst_reg, src_abi_size);
+    const tmp = registerAlias(tmp_reg, src_abi_size);
+    const imm = if (src_abi_size > 4)
+        try self.register_manager.allocReg(null, abi.RegisterClass.gp)
+    else
+        undefined;
+
+    if (!dst_contains_src) try self.genSetReg(dst, src_ty, src_mcv);
+    // dst = operand
+    try self.asmRegisterRegister(.{ ._, .mov }, tmp, dst);
+    // tmp = operand
+    try self.asmRegisterImmediate(.{ ._r, .sh }, tmp, Immediate.u(1));
+    // tmp = operand >> 1
+    if (src_abi_size > 4) {
+        try self.asmRegisterImmediate(.{ ._, .mov }, imm, imm_0_1);
+        try self.asmRegisterRegister(.{ ._, .@"and" }, tmp, imm);
+    } else try self.asmRegisterImmediate(.{ ._, .@"and" }, tmp, imm_0_1);
+    // tmp = (operand >> 1) & 0x55...55
+    try self.asmRegisterRegister(.{ ._, .sub }, dst, tmp);
+    // dst = temp1 = operand - ((operand >> 1) & 0x55...55)
+    try self.asmRegisterRegister(.{ ._, .mov }, tmp, dst);
+    // tmp = temp1
+    try self.asmRegisterImmediate(.{ ._r, .sh }, dst, Immediate.u(2));
+    // dst = temp1 >> 2
+    if (src_abi_size > 4) {
+        try self.asmRegisterImmediate(.{ ._, .mov }, imm, imm_00_11);
+        try self.asmRegisterRegister(.{ ._, .@"and" }, tmp, imm);
+        try self.asmRegisterRegister(.{ ._, .@"and" }, dst, imm);
+    } else {
+        try self.asmRegisterImmediate(.{ ._, .@"and" }, tmp, imm_00_11);
+        try self.asmRegisterImmediate(.{ ._, .@"and" }, dst, imm_00_11);
+    }
+    // tmp = temp1 & 0x33...33
+    // dst = (temp1 >> 2) & 0x33...33
+    try self.asmRegisterRegister(.{ ._, .add }, tmp, dst);
+    // tmp = temp2 = (temp1 & 0x33...33) + ((temp1 >> 2) & 0x33...33)
+    try self.asmRegisterRegister(.{ ._, .mov }, dst, tmp);
+    // dst = temp2
+    try self.asmRegisterImmediate(.{ ._r, .sh }, tmp, Immediate.u(4));
+    // tmp = temp2 >> 4
+    try self.asmRegisterRegister(.{ ._, .add }, dst, tmp);
+    // dst = temp2 + (temp2 >> 4)
+    if (src_abi_size > 4) {
+        try self.asmRegisterImmediate(.{ ._, .mov }, imm, imm_0000_1111);
+        try self.asmRegisterImmediate(.{ ._, .mov }, tmp, imm_0000_0001);
+        try self.asmRegisterRegister(.{ ._, .@"and" }, dst, imm);
+        try self.asmRegisterRegister(.{ .i_, .mul }, dst, tmp);
+    } else {
+        try self.asmRegisterImmediate(.{ ._, .@"and" }, dst, imm_0000_1111);
+        if (src_abi_size > 1) {
+            try self.asmRegisterRegisterImmediate(.{ .i_, .mul }, dst, dst, imm_0000_0001);
         }
-        break :result dst_mcv;
-    };
-    return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
+    }
+    // dst = temp3 = (temp2 + (temp2 >> 4)) & 0x0f...0f
+    // dst = temp3 * 0x01...01
+    if (src_abi_size > 1) {
+        try self.asmRegisterImmediate(.{ ._r, .sh }, dst, Immediate.u((src_abi_size - 1) * 8));
+    }
+    // dst = (temp3 * 0x01...01) >> (bits - 8)
 }
 
-fn byteSwap(self: *Self, inst: Air.Inst.Index, src_ty: Type, src_mcv: MCValue, mem_ok: bool) !MCValue {
+fn genByteSwap(
+    self: *Self,
+    inst: Air.Inst.Index,
+    src_ty: Type,
+    src_mcv: MCValue,
+    mem_ok: bool,
+) !MCValue {
     const mod = self.bin_file.options.module.?;
     const ty_op = self.air.instructions.items(.data)[inst].ty_op;
 
     if (src_ty.zigTypeTag(mod) == .Vector) return self.fail(
-        "TODO implement byteSwap for {}",
+        "TODO implement genByteSwap for {}",
         .{src_ty.fmt(mod)},
     );
-    const src_bits = self.regBitSize(src_ty);
+    const abi_size: u32 = @intCast(src_ty.abiSize(mod));
     const src_lock = switch (src_mcv) {
         .register => |reg| self.register_manager.lockRegAssumeUnused(reg),
         else => null,
     };
     defer if (src_lock) |lock| self.register_manager.unlockReg(lock);
 
-    switch (src_bits) {
-        else => return self.fail("TODO implement byteSwap for {}", .{
+    switch (abi_size) {
+        else => return self.fail("TODO implement genByteSwap for {}", .{
             src_ty.fmt(mod),
         }),
-        8 => return if ((mem_ok or src_mcv.isRegister()) and
+        1 => return if ((mem_ok or src_mcv.isRegister()) and
             self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
             src_mcv
         else
             try self.copyToRegisterWithInstTracking(inst, src_ty, src_mcv),
-        16 => if ((mem_ok or src_mcv.isRegister()) and
+        2 => if ((mem_ok or src_mcv.isRegister()) and
             self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
         {
             try self.genBinOpMir(.{ ._l, .ro }, src_ty, src_mcv, .{ .immediate = 8 });
             return src_mcv;
         },
-        32, 64 => if (src_mcv.isRegister() and self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) {
+        3...8 => if (src_mcv.isRegister() and self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) {
             try self.genUnOpMir(.{ ._, .bswap }, src_ty, src_mcv);
             return src_mcv;
         },
+        9...16 => {
+            switch (src_mcv) {
+                .register_pair => |src_regs| if (self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) {
+                    for (src_regs) |src_reg| try self.asmRegister(.{ ._, .bswap }, src_reg.to64());
+                    return .{ .register_pair = .{ src_regs[1], src_regs[0] } };
+                },
+                else => {},
+            }
+
+            const dst_regs =
+                try self.register_manager.allocRegs(2, .{ inst, inst }, abi.RegisterClass.gp);
+            const dst_locks = self.register_manager.lockRegsAssumeUnused(2, dst_regs);
+            defer for (dst_locks) |lock| self.register_manager.unlockReg(lock);
+
+            if (src_mcv.isMemory()) {
+                try self.asmRegisterMemory(
+                    .{ ._, .movbe },
+                    dst_regs[0],
+                    src_mcv.address().offset(8).deref().mem(.qword),
+                );
+                try self.asmRegisterMemory(.{ ._, .movbe }, dst_regs[1], src_mcv.mem(.qword));
+            } else for (dst_regs, src_mcv.register_pair) |dst_reg, src_reg| {
+                try self.asmRegisterRegister(.{ ._, .mov }, dst_reg.to64(), src_reg.to64());
+                try self.asmRegister(.{ ._, .bswap }, dst_reg.to64());
+            }
+            return .{ .register_pair = dst_regs };
+        },
     }
 
     if (src_mcv.isRegister()) {
@@ -5050,10 +5155,10 @@ fn byteSwap(self: *Self, inst: Air.Inst.Index, src_ty: Type, src_mcv: MCValue, m
             defer self.register_manager.unlockReg(dst_lock);
 
             try self.genSetReg(dst_mcv.register, src_ty, src_mcv);
-            switch (src_bits) {
+            switch (abi_size) {
                 else => unreachable,
-                16 => try self.genBinOpMir(.{ ._l, .ro }, src_ty, dst_mcv, .{ .immediate = 8 }),
-                32, 64 => try self.genUnOpMir(.{ ._, .bswap }, src_ty, dst_mcv),
+                2 => try self.genBinOpMir(.{ ._l, .ro }, src_ty, dst_mcv, .{ .immediate = 8 }),
+                3...8 => try self.genUnOpMir(.{ ._, .bswap }, src_ty, dst_mcv),
             }
         } else try self.genBinOpMir(.{ ._, .movbe }, src_ty, dst_mcv, src_mcv);
         return dst_mcv;
@@ -5073,18 +5178,19 @@ fn airByteSwap(self: *Self, inst: Air.Inst.Index) !void {
     const ty_op = self.air.instructions.items(.data)[inst].ty_op;
 
     const src_ty = self.typeOf(ty_op.operand);
+    const abi_size: u32 = @intCast(src_ty.abiSize(mod));
+    const bit_size: u32 = @intCast(src_ty.bitSize(mod));
     const src_mcv = try self.resolveInst(ty_op.operand);
 
-    const dst_mcv = try self.byteSwap(inst, src_ty, src_mcv, true);
-    switch (self.regExtraBits(src_ty)) {
-        0 => {},
-        else => |extra| try self.genBinOpMir(
-            if (src_ty.isSignedInt(mod)) .{ ._r, .sa } else .{ ._r, .sh },
-            src_ty,
-            dst_mcv,
-            .{ .immediate = extra },
-        ),
-    }
+    const dst_mcv = try self.genByteSwap(inst, src_ty, src_mcv, true);
+
+    const extra_bits = abi_size * 8 - bit_size;
+    const signedness: std.builtin.Signedness =
+        if (src_ty.isAbiInt(mod)) src_ty.intInfo(mod).signedness else .unsigned;
+    if (extra_bits > 0) try self.genShiftBinOpMir(switch (signedness) {
+        .signed => .{ ._r, .sa },
+        .unsigned => .{ ._r, .sh },
+    }, src_ty, dst_mcv, .{ .immediate = extra_bits });
 
     return self.finishAir(inst, dst_mcv, .{ ty_op.operand, .none, .none });
 }
@@ -5094,37 +5200,43 @@ fn airBitReverse(self: *Self, inst: Air.Inst.Index) !void {
     const ty_op = self.air.instructions.items(.data)[inst].ty_op;
 
     const src_ty = self.typeOf(ty_op.operand);
-    const src_abi_size: u32 = @intCast(src_ty.abiSize(mod));
+    const abi_size: u32 = @intCast(src_ty.abiSize(mod));
+    const bit_size: u32 = @intCast(src_ty.bitSize(mod));
     const src_mcv = try self.resolveInst(ty_op.operand);
 
-    const dst_mcv = try self.byteSwap(inst, src_ty, src_mcv, false);
-    const dst_reg = dst_mcv.register;
-    const dst_lock = self.register_manager.lockRegAssumeUnused(dst_reg);
-    defer self.register_manager.unlockReg(dst_lock);
+    const dst_mcv = try self.genByteSwap(inst, src_ty, src_mcv, false);
+    const dst_locks: [2]?RegisterLock = switch (dst_mcv) {
+        .register => |dst_reg| .{ self.register_manager.lockReg(dst_reg), null },
+        .register_pair => |dst_regs| self.register_manager.lockRegs(2, dst_regs),
+        else => unreachable,
+    };
+    defer for (dst_locks) |dst_lock| if (dst_lock) |lock| self.register_manager.unlockReg(lock);
 
     const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
     const tmp_lock = self.register_manager.lockReg(tmp_reg);
     defer if (tmp_lock) |lock| self.register_manager.unlockReg(lock);
 
-    {
-        const dst = registerAlias(dst_reg, src_abi_size);
-        const tmp = registerAlias(tmp_reg, src_abi_size);
-        const imm = if (src_abi_size > 4)
-            try self.register_manager.allocReg(null, abi.RegisterClass.gp)
-        else
-            undefined;
+    const limb_abi_size: u32 = @min(abi_size, 8);
+    const tmp = registerAlias(tmp_reg, limb_abi_size);
+    const imm = if (limb_abi_size > 4)
+        try self.register_manager.allocReg(null, abi.RegisterClass.gp)
+    else
+        undefined;
 
-        const mask = @as(u64, math.maxInt(u64)) >> @intCast(64 - src_abi_size * 8);
-        const imm_0000_1111 = Immediate.u(mask / 0b0001_0001);
-        const imm_00_11 = Immediate.u(mask / 0b01_01);
-        const imm_0_1 = Immediate.u(mask / 0b1_1);
+    const mask = @as(u64, math.maxInt(u64)) >> @intCast(64 - limb_abi_size * 8);
+    const imm_0000_1111 = Immediate.u(mask / 0b0001_0001);
+    const imm_00_11 = Immediate.u(mask / 0b01_01);
+    const imm_0_1 = Immediate.u(mask / 0b1_1);
+
+    for (dst_mcv.getRegs()) |dst_reg| {
+        const dst = registerAlias(dst_reg, limb_abi_size);
 
         // dst = temp1 = bswap(operand)
         try self.asmRegisterRegister(.{ ._, .mov }, tmp, dst);
         // tmp = temp1
         try self.asmRegisterImmediate(.{ ._r, .sh }, dst, Immediate.u(4));
         // dst = temp1 >> 4
-        if (src_abi_size > 4) {
+        if (limb_abi_size > 4) {
             try self.asmRegisterImmediate(.{ ._, .mov }, imm, imm_0000_1111);
             try self.asmRegisterRegister(.{ ._, .@"and" }, tmp, imm);
             try self.asmRegisterRegister(.{ ._, .@"and" }, dst, imm);
@@ -5142,7 +5254,7 @@ fn airBitReverse(self: *Self, inst: Air.Inst.Index) !void {
         // tmp = temp2
         try self.asmRegisterImmediate(.{ ._r, .sh }, dst, Immediate.u(2));
         // dst = temp2 >> 2
-        if (src_abi_size > 4) {
+        if (limb_abi_size > 4) {
             try self.asmRegisterImmediate(.{ ._, .mov }, imm, imm_00_11);
             try self.asmRegisterRegister(.{ ._, .@"and" }, tmp, imm);
             try self.asmRegisterRegister(.{ ._, .@"and" }, dst, imm);
@@ -5154,7 +5266,7 @@ fn airBitReverse(self: *Self, inst: Air.Inst.Index) !void {
         // dst = (temp2 >> 2) & 0x33...33
         try self.asmRegisterMemory(
             .{ ._, .lea },
-            if (src_abi_size > 4) tmp.to64() else tmp.to32(),
+            if (limb_abi_size > 4) tmp.to64() else tmp.to32(),
             Memory.sib(.qword, .{
                 .base = .{ .reg = dst.to64() },
                 .scale_index = .{ .index = tmp.to64(), .scale = 1 << 2 },
@@ -5165,7 +5277,7 @@ fn airBitReverse(self: *Self, inst: Air.Inst.Index) !void {
         // dst = temp3
         try self.asmRegisterImmediate(.{ ._r, .sh }, tmp, Immediate.u(1));
         // tmp = temp3 >> 1
-        if (src_abi_size > 4) {
+        if (limb_abi_size > 4) {
             try self.asmRegisterImmediate(.{ ._, .mov }, imm, imm_0_1);
             try self.asmRegisterRegister(.{ ._, .@"and" }, dst, imm);
             try self.asmRegisterRegister(.{ ._, .@"and" }, tmp, imm);
@@ -5177,7 +5289,7 @@ fn airBitReverse(self: *Self, inst: Air.Inst.Index) !void {
         // tmp = (temp3 >> 1) & 0x55...55
         try self.asmRegisterMemory(
             .{ ._, .lea },
-            if (src_abi_size > 4) dst.to64() else dst.to32(),
+            if (limb_abi_size > 4) dst.to64() else dst.to32(),
             Memory.sib(.qword, .{
                 .base = .{ .reg = tmp.to64() },
                 .scale_index = .{ .index = dst.to64(), .scale = 1 << 1 },
@@ -5186,15 +5298,13 @@ fn airBitReverse(self: *Self, inst: Air.Inst.Index) !void {
         // dst = ((temp3 >> 1) & 0x55...55) + ((temp3 & 0x55...55) << 1)
     }
 
-    switch (self.regExtraBits(src_ty)) {
-        0 => {},
-        else => |extra| try self.genBinOpMir(
-            if (src_ty.isSignedInt(mod)) .{ ._r, .sa } else .{ ._r, .sh },
-            src_ty,
-            dst_mcv,
-            .{ .immediate = extra },
-        ),
-    }
+    const extra_bits = abi_size * 8 - bit_size;
+    const signedness: std.builtin.Signedness =
+        if (src_ty.isAbiInt(mod)) src_ty.intInfo(mod).signedness else .unsigned;
+    if (extra_bits > 0) try self.genShiftBinOpMir(switch (signedness) {
+        .signed => .{ ._r, .sa },
+        .unsigned => .{ ._r, .sh },
+    }, src_ty, dst_mcv, .{ .immediate = extra_bits });
 
     return self.finishAir(inst, dst_mcv, .{ ty_op.operand, .none, .none });
 }
@@ -10861,29 +10971,41 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
             },
             else => return self.fail("invalid constraint: '{s}'", .{constraint}),
         };
+        const is_early_clobber = constraint[1] == '&';
+        const rest = constraint[@as(usize, 1) + @intFromBool(is_early_clobber) ..];
         const arg_mcv: MCValue = arg_mcv: {
-            const arg_maybe_reg: ?Register = if (mem.eql(u8, constraint[1..], "r"))
-                self.register_manager.tryAllocReg(maybe_inst, self.regClassForType(ty)) orelse
-                    return self.fail("ran out of registers lowering inline asm", .{})
-            else if (mem.eql(u8, constraint[1..], "m"))
+            const arg_maybe_reg: ?Register = if (mem.eql(u8, rest, "r") or
+                mem.eql(u8, rest, "f") or mem.eql(u8, rest, "x"))
+                registerAlias(
+                    self.register_manager.tryAllocReg(maybe_inst, switch (rest[0]) {
+                        'r' => abi.RegisterClass.gp,
+                        'f' => abi.RegisterClass.x87,
+                        'x' => abi.RegisterClass.sse,
+                        else => unreachable,
+                    }) orelse return self.fail("ran out of registers lowering inline asm", .{}),
+                    @intCast(ty.abiSize(mod)),
+                )
+            else if (mem.eql(u8, rest, "m"))
                 if (output != .none) null else return self.fail(
                     "memory constraint unsupported for asm result: '{s}'",
                     .{constraint},
                 )
-            else if (mem.eql(u8, constraint[1..], "g") or
-                mem.eql(u8, constraint[1..], "rm") or mem.eql(u8, constraint[1..], "mr") or
-                mem.eql(u8, constraint[1..], "r,m") or mem.eql(u8, constraint[1..], "m,r"))
-                self.register_manager.tryAllocReg(maybe_inst, self.regClassForType(ty)) orelse
+            else if (mem.eql(u8, rest, "g") or
+                mem.eql(u8, rest, "rm") or mem.eql(u8, rest, "mr") or
+                mem.eql(u8, rest, "r,m") or mem.eql(u8, rest, "m,r"))
+                self.register_manager.tryAllocReg(maybe_inst, abi.RegisterClass.gp) orelse
                     if (output != .none)
                     null
                 else
                     return self.fail("ran out of registers lowering inline asm", .{})
-            else if (mem.startsWith(u8, constraint[1..], "{") and mem.endsWith(u8, constraint[1..], "}"))
-                parseRegName(constraint[1 + "{".len .. constraint.len - "}".len]) orelse
+            else if (mem.startsWith(u8, rest, "{") and mem.endsWith(u8, rest, "}"))
+                parseRegName(rest["{".len .. rest.len - "}".len]) orelse
                     return self.fail("invalid register constraint: '{s}'", .{constraint})
-            else if (constraint.len == 2 and std.ascii.isDigit(constraint[1])) {
-                const index = std.fmt.charToDigit(constraint[0], 10) catch unreachable;
-                if (index >= args.items.len) return self.fail("constraint out of bounds: '{s}'", .{constraint});
+            else if (rest.len == 1 and std.ascii.isDigit(rest[0])) {
+                const index = std.fmt.charToDigit(rest[0], 10) catch unreachable;
+                if (index >= args.items.len) return self.fail("constraint out of bounds: '{s}'", .{
+                    constraint,
+                });
                 break :arg_mcv args.items[index];
             } else return self.fail("invalid constraint: '{s}'", .{constraint});
             break :arg_mcv if (arg_maybe_reg) |reg| .{ .register = reg } else arg: {
@@ -10917,10 +11039,29 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
 
         const ty = self.typeOf(input);
         const input_mcv = try self.resolveInst(input);
-        const arg_mcv: MCValue = if (mem.eql(u8, constraint, "r")) switch (input_mcv) {
-            .register => input_mcv,
-            else => .{ .register = try self.copyToTmpRegister(ty, input_mcv) },
-        } else if (mem.eql(u8, constraint, "m")) arg: {
+        const arg_mcv: MCValue = if (mem.eql(u8, constraint, "r") or
+            mem.eql(u8, constraint, "f") or mem.eql(u8, constraint, "x"))
+        arg: {
+            const rc = switch (constraint[0]) {
+                'r' => abi.RegisterClass.gp,
+                'f' => abi.RegisterClass.x87,
+                'x' => abi.RegisterClass.sse,
+                else => unreachable,
+            };
+            if (input_mcv.isRegister() and
+                rc.isSet(RegisterManager.indexOfRegIntoTracked(input_mcv.getReg().?).?))
+                break :arg input_mcv;
+            const reg = try self.register_manager.allocReg(null, rc);
+            try self.genSetReg(reg, ty, input_mcv);
+            break :arg .{ .register = registerAlias(reg, @intCast(ty.abiSize(mod))) };
+        } else if (mem.eql(u8, constraint, "i") or mem.eql(u8, constraint, "n"))
+            switch (input_mcv) {
+                .immediate => |imm| .{ .immediate = imm },
+                else => return self.fail("immediate operand requires comptime value: '{s}'", .{
+                    constraint,
+                }),
+            }
+        else if (mem.eql(u8, constraint, "m")) arg: {
             switch (input_mcv) {
                 .memory => |addr| if (math.cast(i32, @as(i64, @bitCast(addr)))) |_|
                     break :arg input_mcv,
@@ -11144,6 +11285,10 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
                     arg_map.get(op_str["%[".len .. colon orelse op_str.len - "]".len]) orelse
                         return self.fail("no matching constraint: '{s}'", .{op_str})
                 ]) {
+                    .immediate => |imm| if (mem.eql(u8, modifier, "") or mem.eql(u8, modifier, "c"))
+                        .{ .imm = Immediate.u(imm) }
+                    else
+                        return self.fail("invalid modifier: '{s}'", .{modifier}),
                     .register => |reg| if (mem.eql(u8, modifier, ""))
                         .{ .reg = reg }
                     else
@@ -11272,6 +11417,13 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
                     .none => self.asmRegisterImmediate(mnem_fixed_tag, reg1, imm0),
                     .reg => |reg2| switch (ops[3]) {
                         .none => self.asmRegisterRegisterImmediate(mnem_fixed_tag, reg2, reg1, imm0),
+                        .reg => |reg3| self.asmRegisterRegisterRegisterImmediate(
+                            mnem_fixed_tag,
+                            reg3,
+                            reg2,
+                            reg1,
+                            imm0,
+                        ),
                         else => error.InvalidInstruction,
                     },
                     .mem => |mem2| switch (ops[3]) {
src/arch/x86_64/Encoding.zig
@@ -269,6 +269,9 @@ pub const Mnemonic = enum {
     pcmpeqb, pcmpeqd, pcmpeqw,
     pcmpgtb, pcmpgtd, pcmpgtw,
     pmulhw, pmullw,
+    pslld, psllq, psllw,
+    psrad, psraw,
+    psrld, psrlq, psrlw,
     psubb, psubd, psubq, psubsb, psubsw, psubusb, psubusw, psubw,
     // SSE
     addps, addss,
@@ -309,8 +312,8 @@ pub const Mnemonic = enum {
     movupd,
     mulpd, mulsd,
     orpd,
-    pshufhw, pshuflw,
-    psrld, psrlq, psrlw,
+    pshufd, pshufhw, pshuflw,
+    pslldq, psrldq,
     punpckhbw, punpckhdq, punpckhqdq, punpckhwd,
     punpcklbw, punpckldq, punpcklqdq, punpcklwd,
     shufpd,
@@ -321,7 +324,7 @@ pub const Mnemonic = enum {
     // SSE3
     movddup, movshdup, movsldup,
     // SSSE3
-    pabsb, pabsd, pabsw,
+    pabsb, pabsd, pabsw, palignr,
     // SSE4.1
     blendpd, blendps, blendvpd, blendvps,
     extractps,
@@ -335,8 +338,15 @@ pub const Mnemonic = enum {
     roundpd, roundps, roundsd, roundss,
     // SSE4.2
     pcmpgtq,
+    // PCLMUL
+    pclmulqdq,
+    // AES
+    aesdec, aesdeclast, aesenc, aesenclast, aesimc, aeskeygenassist,
+    // SHA
+    sha256msg1, sha256msg2, sha256rnds2,
     // AVX
     vaddpd, vaddps, vaddsd, vaddss,
+    vaesdec, vaesdeclast, vaesenc, vaesenclast, vaesimc, vaeskeygenassist,
     vandnpd, vandnps, vandpd, vandps,
     vblendpd, vblendps, vblendvpd, vblendvps,
     vbroadcastf128, vbroadcastsd, vbroadcastss,
@@ -366,7 +376,7 @@ pub const Mnemonic = enum {
     vpabsb, vpabsd, vpabsw,
     vpackssdw, vpacksswb, vpackusdw, vpackuswb,
     vpaddb, vpaddd, vpaddq, vpaddsb, vpaddsw, vpaddusb, vpaddusw, vpaddw,
-    vpand, vpandn,
+    vpalignr, vpand, vpandn, vpclmulqdq,
     vpcmpeqb, vpcmpeqd, vpcmpeqq, vpcmpeqw,
     vpcmpgtb, vpcmpgtd, vpcmpgtq, vpcmpgtw,
     vpextrb, vpextrd, vpextrq, vpextrw,
@@ -376,8 +386,10 @@ pub const Mnemonic = enum {
     vpmovmskb,
     vpmulhw, vpmulld, vpmullw,
     vpor,
-    vpshufhw, vpshuflw,
-    vpsrld, vpsrlq, vpsrlw,
+    vpshufd, vpshufhw, vpshuflw,
+    vpslld, vpslldq, vpsllq, vpsllw,
+    vpsrad, vpsraq, vpsraw,
+    vpsrld, vpsrldq, vpsrlq, vpsrlw,
     vpsubb, vpsubd, vpsubq, vpsubsb, vpsubsw, vpsubusb, vpsubusw, vpsubw,
     vpunpckhbw, vpunpckhdq, vpunpckhqdq, vpunpckhwd,
     vpunpcklbw, vpunpckldq, vpunpcklqdq, vpunpcklwd,
@@ -753,6 +765,8 @@ pub const Mode = enum {
 
 pub const Feature = enum {
     none,
+    aes,
+    @"aes avx",
     avx,
     avx2,
     bmi,
@@ -760,6 +774,8 @@ pub const Feature = enum {
     fma,
     lzcnt,
     movbe,
+    pclmul,
+    @"pclmul avx",
     popcnt,
     sse,
     sse2,
@@ -767,6 +783,9 @@ pub const Feature = enum {
     sse4_1,
     sse4_2,
     ssse3,
+    sha,
+    vaes,
+    vpclmulqdq,
     x87,
 };
 
@@ -784,7 +803,7 @@ fn estimateInstructionLength(prefix: Prefix, encoding: Encoding, ops: []const Op
 }
 
 const mnemonic_to_encodings_map = init: {
-    @setEvalBranchQuota(50_000);
+    @setEvalBranchQuota(60_000);
     const encodings = @import("encodings.zig");
     var entries = encodings.table;
     std.mem.sort(encodings.Entry, &entries, {}, struct {
src/arch/x86_64/encodings.zig
@@ -1075,10 +1075,26 @@ pub const table = [_]Entry{
 
     .{ .por, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xeb }, 0, .none, .sse2 },
 
+    .{ .pshufd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x70 }, 0, .none, .sse2 },
+
     .{ .pshufhw, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0xf3, 0x0f, 0x70 }, 0, .none, .sse2 },
 
     .{ .pshuflw, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0xf2, 0x0f, 0x70 }, 0, .none, .sse2 },
 
+    .{ .psllw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf1 }, 0, .none, .sse2 },
+    .{ .psllw, .mi, &.{ .xmm, .imm8     }, &.{ 0x66, 0x0f, 0x71 }, 6, .none, .sse2 },
+    .{ .pslld, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf2 }, 0, .none, .sse2 },
+    .{ .pslld, .mi, &.{ .xmm, .imm8     }, &.{ 0x66, 0x0f, 0x72 }, 6, .none, .sse2 },
+    .{ .psllq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf3 }, 0, .none, .sse2 },
+    .{ .psllq, .mi, &.{ .xmm, .imm8     }, &.{ 0x66, 0x0f, 0x73 }, 6, .none, .sse2 },
+
+    .{ .pslldq, .mi, &.{ .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 7, .none, .sse2 },
+
+    .{ .psraw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe1 }, 0, .none, .sse2 },
+    .{ .psraw, .mi, &.{ .xmm, .imm8     }, &.{ 0x66, 0x0f, 0x71 }, 4, .none, .sse2 },
+    .{ .psrad, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe2 }, 0, .none, .sse2 },
+    .{ .psrad, .mi, &.{ .xmm, .imm8     }, &.{ 0x66, 0x0f, 0x72 }, 4, .none, .sse2 },
+
     .{ .psrlw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .none, .sse2 },
     .{ .psrlw, .mi, &.{ .xmm, .imm8     }, &.{ 0x66, 0x0f, 0x71 }, 2, .none, .sse2 },
     .{ .psrld, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd2 }, 0, .none, .sse2 },
@@ -1086,6 +1102,8 @@ pub const table = [_]Entry{
     .{ .psrlq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd3 }, 0, .none, .sse2 },
     .{ .psrlq, .mi, &.{ .xmm, .imm8     }, &.{ 0x66, 0x0f, 0x73 }, 2, .none, .sse2 },
 
+    .{ .psrldq, .mi, &.{ .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 3, .none, .sse2 },
+
     .{ .psubb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf8 }, 0, .none, .sse2 },
     .{ .psubw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf9 }, 0, .none, .sse2 },
     .{ .psubd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfa }, 0, .none, .sse2 },
@@ -1139,13 +1157,17 @@ pub const table = [_]Entry{
     .{ .pabsw, .rm, &.{  .mm,  .mm_m64  }, &.{       0x0f, 0x38, 0x1d }, 0, .none, .ssse3 },
     .{ .pabsw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x1d }, 0, .none, .ssse3 },
 
+    .{ .palignr, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0f }, 0, .none, .ssse3 },
+
     // SSE4.1
     .{ .blendpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0d }, 0, .none, .sse4_1 },
 
     .{ .blendps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0c }, 0, .none, .sse4_1 },
 
+    .{ .blendvpd, .rm,  &.{ .xmm, .xmm_m128        }, &.{ 0x66, 0x0f, 0x38, 0x15 }, 0, .none, .sse4_1 },
     .{ .blendvpd, .rm0, &.{ .xmm, .xmm_m128, .xmm0 }, &.{ 0x66, 0x0f, 0x38, 0x15 }, 0, .none, .sse4_1 },
 
+    .{ .blendvps, .rm,  &.{ .xmm, .xmm_m128        }, &.{ 0x66, 0x0f, 0x38, 0x14 }, 0, .none, .sse4_1 },
     .{ .blendvps, .rm0, &.{ .xmm, .xmm_m128, .xmm0 }, &.{ 0x66, 0x0f, 0x38, 0x14 }, 0, .none, .sse4_1 },
 
     .{ .extractps, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x17 }, 0, .none, .sse4_1 },
@@ -1193,6 +1215,30 @@ pub const table = [_]Entry{
     // SSE4.2
     .{ .pcmpgtq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x37 }, 0, .none, .sse4_2 },
 
+    // PCLMUL
+    .{ .pclmulqdq, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x44 }, 0, .none, .pclmul },
+
+    // AES
+    .{ .aesdec, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xde }, 0, .none, .aes },
+
+    .{ .aesdeclast, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xdf }, 0, .none, .aes },
+
+    .{ .aesenc, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xdc }, 0, .none, .aes },
+
+    .{ .aesenclast, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xdd }, 0, .none, .aes },
+
+    .{ .aesimc, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xdb }, 0, .none, .aes },
+
+    .{ .aeskeygenassist, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0xdf }, 0, .none, .aes },
+
+    // SHA
+    .{ .sha256msg1, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x38, 0xcc }, 0, .none, .sha },
+
+    .{ .sha256msg2, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x38, 0xcd }, 0, .none, .sha },
+
+    .{ .sha256rnds2, .rm,  &.{ .xmm, .xmm_m128        }, &.{ 0x0f, 0x38, 0xcb }, 0, .none, .sha },
+    .{ .sha256rnds2, .rm0, &.{ .xmm, .xmm_m128, .xmm0 }, &.{ 0x0f, 0x38, 0xcb }, 0, .none, .sha },
+
     // AVX
     .{ .vaddpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x58 }, 0, .vex_128_wig, .avx },
     .{ .vaddpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x58 }, 0, .vex_256_wig, .avx },
@@ -1204,6 +1250,18 @@ pub const table = [_]Entry{
 
     .{ .vaddss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x58 }, 0, .vex_lig_wig, .avx },
 
+    .{ .vaesdec, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xde }, 0, .vex_128_wig, .@"aes avx" },
+
+    .{ .vaesdeclast, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xdf }, 0, .vex_128_wig, .@"aes avx" },
+
+    .{ .vaesenc, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xdc }, 0, .vex_128_wig, .@"aes avx" },
+
+    .{ .vaesenclast, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xdd }, 0, .vex_128_wig, .@"aes avx" },
+
+    .{ .vaesimc, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xdb }, 0, .vex_128_wig, .@"aes avx" },
+
+    .{ .vaeskeygenassist, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0xdf }, 0, .vex_128_wig, .@"aes avx" },
+
     .{ .vandnpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x55 }, 0, .vex_128_wig, .avx },
     .{ .vandnpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x55 }, 0, .vex_256_wig, .avx },
 
@@ -1436,10 +1494,14 @@ pub const table = [_]Entry{
     .{ .vpaddusb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdc }, 0, .vex_128_wig, .avx },
     .{ .vpaddusw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdd }, 0, .vex_128_wig, .avx },
 
+    .{ .vpalignr, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0f }, 0, .vex_128_wig, .avx },
+
     .{ .vpand, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdb }, 0, .vex_128_wig, .avx },
 
     .{ .vpandn, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_128_wig, .avx },
 
+    .{ .vpclmulqdq, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x44 }, 0, .vex_128_wig, .@"pclmul avx" },
+
     .{ .vpcmpeqb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x74 }, 0, .vex_128_wig, .avx },
     .{ .vpcmpeqw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x75 }, 0, .vex_128_wig, .avx },
     .{ .vpcmpeqd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x76 }, 0, .vex_128_wig, .avx },
@@ -1494,6 +1556,26 @@ pub const table = [_]Entry{
 
     .{ .vpor, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xeb }, 0, .vex_128_wig, .avx },
 
+    .{ .vpshufd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x70 }, 0, .vex_128_wig, .avx },
+
+    .{ .vpshufhw, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0xf3, 0x0f, 0x70 }, 0, .vex_128_wig, .avx },
+
+    .{ .vpshuflw, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0xf2, 0x0f, 0x70 }, 0, .vex_128_wig, .avx },
+
+    .{ .vpsllw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf1 }, 0, .vex_128_wig, .avx },
+    .{ .vpsllw, .vmi, &.{ .xmm, .xmm, .imm8     }, &.{ 0x66, 0x0f, 0x71 }, 6, .vex_128_wig, .avx },
+    .{ .vpslld, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf2 }, 0, .vex_128_wig, .avx },
+    .{ .vpslld, .vmi, &.{ .xmm, .xmm, .imm8     }, &.{ 0x66, 0x0f, 0x72 }, 6, .vex_128_wig, .avx },
+    .{ .vpsllq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf3 }, 0, .vex_128_wig, .avx },
+    .{ .vpsllq, .vmi, &.{ .xmm, .xmm, .imm8     }, &.{ 0x66, 0x0f, 0x73 }, 6, .vex_128_wig, .avx },
+
+    .{ .vpslldq, .vmi, &.{ .xmm, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 7, .vex_128_wig, .avx },
+
+    .{ .vpsraw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe1 }, 0, .vex_128_wig, .avx },
+    .{ .vpsraw, .vmi, &.{ .xmm, .xmm, .imm8     }, &.{ 0x66, 0x0f, 0x71 }, 4, .vex_128_wig, .avx },
+    .{ .vpsrad, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe2 }, 0, .vex_128_wig, .avx },
+    .{ .vpsrad, .vmi, &.{ .xmm, .xmm, .imm8     }, &.{ 0x66, 0x0f, 0x72 }, 4, .vex_128_wig, .avx },
+
     .{ .vpsrlw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_128_wig, .avx },
     .{ .vpsrlw, .vmi, &.{ .xmm, .xmm, .imm8     }, &.{ 0x66, 0x0f, 0x71 }, 2, .vex_128_wig, .avx },
     .{ .vpsrld, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd2 }, 0, .vex_128_wig, .avx },
@@ -1501,6 +1583,8 @@ pub const table = [_]Entry{
     .{ .vpsrlq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd3 }, 0, .vex_128_wig, .avx },
     .{ .vpsrlq, .vmi, &.{ .xmm, .xmm, .imm8     }, &.{ 0x66, 0x0f, 0x73 }, 2, .vex_128_wig, .avx },
 
+    .{ .vpsrldq, .vmi, &.{ .xmm, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 3, .vex_128_wig, .avx },
+
     .{ .vpsubb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf8 }, 0, .vex_128_wig, .avx },
     .{ .vpsubw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf9 }, 0, .vex_128_wig, .avx },
     .{ .vpsubd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfa }, 0, .vex_128_wig, .avx },
@@ -1597,6 +1681,18 @@ pub const table = [_]Entry{
     .{ .vfmadd213ss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0xa9 }, 0, .vex_lig_w0, .fma },
     .{ .vfmadd231ss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0xb9 }, 0, .vex_lig_w0, .fma },
 
+    // VPCLMULQDQ
+    .{ .vpclmulqdq, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x44 }, 0, .vex_256_wig, .vpclmulqdq },
+
+    // VAES
+    .{ .vaesdec, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0xde }, 0, .vex_256_wig, .vaes },
+
+    .{ .vaesdeclast, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0xdf }, 0, .vex_256_wig, .vaes },
+
+    .{ .vaesenc, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0xdc }, 0, .vex_256_wig, .vaes },
+
+    .{ .vaesenclast, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0xdd }, 0, .vex_256_wig, .vaes },
+
     // AVX2
     .{ .vbroadcastss, .rm, &.{ .xmm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_128_w0, .avx2 },
     .{ .vbroadcastss, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx2 },
@@ -1624,6 +1720,8 @@ pub const table = [_]Entry{
     .{ .vpaddusb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdc }, 0, .vex_256_wig, .avx2 },
     .{ .vpaddusw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdd }, 0, .vex_256_wig, .avx2 },
 
+    .{ .vpalignr, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0f }, 0, .vex_256_wig, .avx2 },
+
     .{ .vpand, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdb }, 0, .vex_256_wig, .avx2 },
 
     .{ .vpandn, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_256_wig, .avx2 },
@@ -1669,6 +1767,26 @@ pub const table = [_]Entry{
 
     .{ .vpor, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xeb }, 0, .vex_256_wig, .avx2 },
 
+    .{ .vpshufd, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x70 }, 0, .vex_256_wig, .avx2 },
+
+    .{ .vpshufhw, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0xf3, 0x0f, 0x70 }, 0, .vex_256_wig, .avx2 },
+
+    .{ .vpshuflw, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0xf2, 0x0f, 0x70 }, 0, .vex_256_wig, .avx2 },
+
+    .{ .vpsllw, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf1 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpsllw, .vmi, &.{ .ymm, .ymm, .imm8     }, &.{ 0x66, 0x0f, 0x71 }, 6, .vex_256_wig, .avx2 },
+    .{ .vpslld, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf2 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpslld, .vmi, &.{ .ymm, .ymm, .imm8     }, &.{ 0x66, 0x0f, 0x72 }, 6, .vex_256_wig, .avx2 },
+    .{ .vpsllq, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf3 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpsllq, .vmi, &.{ .ymm, .ymm, .imm8     }, &.{ 0x66, 0x0f, 0x73 }, 6, .vex_256_wig, .avx2 },
+
+    .{ .vpslldq, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 7, .vex_256_wig, .avx2 },
+
+    .{ .vpsraw, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe1 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpsraw, .vmi, &.{ .ymm, .ymm, .imm8     }, &.{ 0x66, 0x0f, 0x71 }, 4, .vex_256_wig, .avx2 },
+    .{ .vpsrad, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe2 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpsrad, .vmi, &.{ .ymm, .ymm, .imm8     }, &.{ 0x66, 0x0f, 0x72 }, 4, .vex_256_wig, .avx2 },
+
     .{ .vpsrlw, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_256_wig, .avx2 },
     .{ .vpsrlw, .vmi, &.{ .ymm, .ymm, .imm8     }, &.{ 0x66, 0x0f, 0x71 }, 2, .vex_256_wig, .avx2 },
     .{ .vpsrld, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd2 }, 0, .vex_256_wig, .avx2 },
@@ -1676,6 +1794,8 @@ pub const table = [_]Entry{
     .{ .vpsrlq, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd3 }, 0, .vex_256_wig, .avx2 },
     .{ .vpsrlq, .vmi, &.{ .ymm, .ymm, .imm8     }, &.{ 0x66, 0x0f, 0x73 }, 2, .vex_256_wig, .avx2 },
 
+    .{ .vpsrldq, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 3, .vex_128_wig, .avx2 },
+
     .{ .vpsubb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xf8 }, 0, .vex_256_wig, .avx2 },
     .{ .vpsubw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xf9 }, 0, .vex_256_wig, .avx2 },
     .{ .vpsubd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xfa }, 0, .vex_256_wig, .avx2 },
src/arch/x86_64/Mir.zig
@@ -494,6 +494,12 @@ pub const Inst = struct {
         mull,
         /// Multiply packed signed integers and store high result
         mulh,
+        /// Shift packed data left logical
+        sll,
+        /// Shift packed data right arithmetic
+        sra,
+        /// Shift packed data right logical
+        srl,
         /// Subtract packed signed integers with signed saturation
         subs,
         /// Subtract packed unsigned integers with unsigned saturation
@@ -592,15 +598,13 @@ pub const Inst = struct {
         movdqu,
         /// Packed interleave shuffle of quadruplets of single-precision floating-point values
         /// Packed interleave shuffle of pairs of double-precision floating-point values
+        /// Shuffle packed doublewords
+        /// Shuffle packed words
         shuf,
         /// Shuffle packed high words
         shufh,
         /// Shuffle packed low words
         shufl,
-        /// Shift packed data right logical
-        /// Shift packed data right logical
-        /// Shift packed data right logical
-        srl,
         /// Unpack high data
         unpckhbw,
         /// Unpack high data
@@ -625,6 +629,9 @@ pub const Inst = struct {
         /// Replicate single floating-point values
         movsldup,
 
+        /// Packed align right
+        alignr,
+
         /// Pack with unsigned saturation
         ackusd,
         /// Blend packed single-precision floating-point values
@@ -648,6 +655,29 @@ pub const Inst = struct {
         /// Round scalar double-precision floating-point value
         round,
 
+        /// Carry-less multiplication quadword
+        clmulq,
+
+        /// Perform one round of an AES decryption flow
+        aesdec,
+        /// Perform last round of an AES decryption flow
+        aesdeclast,
+        /// Perform one round of an AES encryption flow
+        aesenc,
+        /// Perform last round of an AES encryption flow
+        aesenclast,
+        /// Perform the AES InvMixColumn transformation
+        aesimc,
+        /// AES round key generation assist
+        aeskeygenassist,
+
+        /// Perform an intermediate calculation for the next four SHA256 message dwords
+        sha256msg1,
+        /// Perform a final calculation for the next four SHA256 message dwords
+        sha256msg2,
+        /// Perform two rounds of SHA256 operation
+        sha256rnds2,
+
         /// Load with broadcast floating-point data
         broadcast,
 
test/behavior/bitreverse.zig
@@ -11,11 +11,11 @@ test "@bitReverse large exotic integer" {
 
 test "@bitReverse" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf) return error.SkipZigTest;
 
     try comptime testBitReverse();
     try testBitReverse();
test/behavior/byteswap.zig
@@ -4,11 +4,11 @@ const expect = std.testing.expect;
 
 test "@byteSwap integers" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf) return error.SkipZigTest;
 
     const ByteSwapIntTest = struct {
         fn run() !void {
test/behavior/popcount.zig
@@ -14,7 +14,6 @@ test "@popCount integers" {
 }
 
 test "@popCount 128bit integer" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO