Commit 509be7cf1f

Jacob Young <jacobly0@users.noreply.github.com>
2023-11-04 04:18:21
x86_64: fix std test failures

Instead of skipping std tests under the self-hosted x86_64 backend, fix the backend and add scalar fallbacks to the library: spill slots get explicit padding and alignment (FrameAlloc.initSpill), symbol-backed values become usable as memory operands (MCValue.mem resolves relocations through the function), bool vectors allocate to general-purpose registers, and integer bin-op results are re-truncated to their declared bit width. Vector-only library paths are gated behind a use_vectors constant, and the remaining stage2_x86_64 skips are removed or narrowed to missing CPU features (avx, fma).
Parent: f6de3ec
lib/std/atomic/Atomic.zig
@@ -467,8 +467,6 @@ test "Atomic.fetchSub" {
 }
 
 test "Atomic.fetchMin" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     inline for (atomicIntTypes()) |Int| {
         inline for (atomic_rmw_orderings) |ordering| {
             var x = Atomic(Int).init(5);
lib/std/atomic/queue.zig
@@ -175,8 +175,6 @@ const puts_per_thread = 500;
 const put_thread_count = 3;
 
 test "std.atomic.Queue" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var plenty_of_memory = try std.heap.page_allocator.alloc(u8, 300 * 1024);
     defer std.heap.page_allocator.free(plenty_of_memory);
 
lib/std/compress/zstandard.zig
@@ -264,8 +264,6 @@ fn testReader(data: []const u8, comptime expected: []const u8) !void {
 }
 
 test "zstandard decompression" {
-    if (@import("builtin").zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const uncompressed = @embedFile("testdata/rfc8478.txt");
     const compressed3 = @embedFile("testdata/rfc8478.txt.zst.3");
     const compressed19 = @embedFile("testdata/rfc8478.txt.zst.19");
lib/std/crypto/25519/ed25519.zig
@@ -1,5 +1,4 @@
 const std = @import("std");
-const builtin = @import("builtin");
 const crypto = std.crypto;
 const debug = std.debug;
 const fmt = std.fmt;
@@ -276,8 +275,8 @@ pub const Ed25519 = struct {
         pub fn fromSecretKey(secret_key: SecretKey) (NonCanonicalError || EncodingError || IdentityElementError)!KeyPair {
             // It is critical for EdDSA to use the correct public key.
             // In order to enforce this, a SecretKey implicitly includes a copy of the public key.
-            // In Debug mode, we can still afford checking that the public key is correct for extra safety.
-            if (builtin.mode == .Debug) {
+            // With runtime safety, we can still afford checking that the public key is correct.
+            if (std.debug.runtime_safety) {
                 const pk_p = try Curve.fromBytes(secret_key.publicKeyBytes());
                 const recomputed_kp = try create(secret_key.seed());
                 debug.assert(mem.eql(u8, &recomputed_kp.public_key.toBytes(), &pk_p.toBytes()));
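Note: switching from builtin.mode == .Debug to std.debug.runtime_safety widens this check to ReleaseSafe builds as well. For reference, the standard library defines that constant roughly as follows (a sketch of the definition, not a verbatim copy):

    const builtin = @import("builtin");
    // true in Debug and ReleaseSafe, false in ReleaseFast and ReleaseSmall
    pub const runtime_safety = switch (builtin.mode) {
        .Debug, .ReleaseSafe => true,
        .ReleaseFast, .ReleaseSmall => false,
    };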
@@ -493,8 +492,6 @@ test "ed25519 key pair creation" {
 }
 
 test "ed25519 signature" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var seed: [32]u8 = undefined;
     _ = try fmt.hexToBytes(seed[0..], "8052030376d47112be7f73ed7a019293dd12ad910b654455798b4667d73de166");
     const key_pair = try Ed25519.KeyPair.create(seed);
@@ -507,8 +504,6 @@ test "ed25519 signature" {
 }
 
 test "ed25519 batch verification" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var i: usize = 0;
     while (i < 100) : (i += 1) {
         const key_pair = try Ed25519.KeyPair.create(null);
@@ -538,8 +533,6 @@ test "ed25519 batch verification" {
 }
 
 test "ed25519 test vectors" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const Vec = struct {
         msg_hex: *const [64:0]u8,
         public_key_hex: *const [64:0]u8,
@@ -642,8 +635,6 @@ test "ed25519 test vectors" {
 }
 
 test "ed25519 with blind keys" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const BlindKeyPair = Ed25519.key_blinding.BlindKeyPair;
 
     // Create a standard Ed25519 key pair
@@ -667,8 +658,6 @@ test "ed25519 with blind keys" {
 }
 
 test "ed25519 signatures with streaming" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const kp = try Ed25519.KeyPair.create(null);
 
     var signer = try kp.signer(null);
lib/std/crypto/Certificate/Bundle.zig
@@ -318,8 +318,6 @@ const MapContext = struct {
 test "scan for OS-provided certificates" {
     if (builtin.os.tag == .wasi) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var bundle: Bundle = .{};
     defer bundle.deinit(std.testing.allocator);
 
lib/std/crypto/pcurves/p256.zig
@@ -478,7 +478,5 @@ pub const AffineCoordinates = struct {
 };
 
 test {
-    if (@import("builtin").zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     _ = @import("tests/p256.zig");
 }
lib/std/crypto/aes.zig
@@ -28,8 +28,6 @@ pub const Aes128 = impl.Aes128;
 pub const Aes256 = impl.Aes256;
 
 test "ctr" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     // NIST SP 800-38A pp 55-58
     const ctr = @import("modes.zig").ctr;
 
lib/std/crypto/aes_gcm.zig
@@ -1,5 +1,4 @@
 const std = @import("std");
-const builtin = @import("builtin");
 const assert = std.debug.assert;
 const crypto = std.crypto;
 const debug = std.debug;
@@ -42,7 +41,7 @@ fn AesGcm(comptime Aes: anytype) type {
             mac.pad();
 
             mem.writeInt(u32, j[nonce_length..][0..4], 2, .big);
-            modes.ctr(@TypeOf(aes), aes, c, m, j, std.builtin.Endian.big);
+            modes.ctr(@TypeOf(aes), aes, c, m, j, .big);
             mac.update(c[0..m.len][0..]);
             mac.pad();
 
@@ -104,7 +103,7 @@ fn AesGcm(comptime Aes: anytype) type {
             }
 
             mem.writeInt(u32, j[nonce_length..][0..4], 2, .big);
-            modes.ctr(@TypeOf(aes), aes, m, c, j, std.builtin.Endian.big);
+            modes.ctr(@TypeOf(aes), aes, m, c, j, .big);
         }
     };
 }
@@ -113,8 +112,6 @@ const htest = @import("test.zig");
 const testing = std.testing;
 
 test "Aes256Gcm - Empty message and no associated data" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const key: [Aes256Gcm.key_length]u8 = [_]u8{0x69} ** Aes256Gcm.key_length;
     const nonce: [Aes256Gcm.nonce_length]u8 = [_]u8{0x42} ** Aes256Gcm.nonce_length;
     const ad = "";
@@ -127,8 +124,6 @@ test "Aes256Gcm - Empty message and no associated data" {
 }
 
 test "Aes256Gcm - Associated data only" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const key: [Aes256Gcm.key_length]u8 = [_]u8{0x69} ** Aes256Gcm.key_length;
     const nonce: [Aes256Gcm.nonce_length]u8 = [_]u8{0x42} ** Aes256Gcm.nonce_length;
     const m = "";
@@ -141,8 +136,6 @@ test "Aes256Gcm - Associated data only" {
 }
 
 test "Aes256Gcm - Message only" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const key: [Aes256Gcm.key_length]u8 = [_]u8{0x69} ** Aes256Gcm.key_length;
     const nonce: [Aes256Gcm.nonce_length]u8 = [_]u8{0x42} ** Aes256Gcm.nonce_length;
     const m = "Test with message only";
@@ -160,8 +153,6 @@ test "Aes256Gcm - Message only" {
 }
 
 test "Aes256Gcm - Message and associated data" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const key: [Aes256Gcm.key_length]u8 = [_]u8{0x69} ** Aes256Gcm.key_length;
     const nonce: [Aes256Gcm.nonce_length]u8 = [_]u8{0x42} ** Aes256Gcm.nonce_length;
     const m = "Test with message";
lib/std/crypto/argon2.zig
@@ -896,8 +896,6 @@ test "kdf" {
 }
 
 test "phc format hasher" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const allocator = std.testing.allocator;
     const password = "testpass";
 
@@ -913,8 +911,6 @@ test "phc format hasher" {
 }
 
 test "password hash and password verify" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const allocator = std.testing.allocator;
     const password = "testpass";
 
lib/std/crypto/bcrypt.zig
@@ -1,5 +1,4 @@
 const std = @import("std");
-const builtin = @import("builtin");
 const base64 = std.base64;
 const crypto = std.crypto;
 const debug = std.debug;
@@ -754,8 +753,6 @@ pub fn strVerify(
 }
 
 test "bcrypt codec" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var salt: [salt_length]u8 = undefined;
     crypto.random.bytes(&salt);
     var salt_str: [salt_str_length]u8 = undefined;
@@ -766,8 +763,6 @@ test "bcrypt codec" {
 }
 
 test "bcrypt crypt format" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var hash_options = HashOptions{
         .params = .{ .rounds_log = 5 },
         .encoding = .crypt,
@@ -808,8 +803,6 @@ test "bcrypt crypt format" {
 }
 
 test "bcrypt phc format" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var hash_options = HashOptions{
         .params = .{ .rounds_log = 5 },
         .encoding = .phc,
lib/std/crypto/Certificate.zig
@@ -614,18 +614,18 @@ const Date = struct {
 };
 
 pub fn parseTimeDigits(text: *const [2]u8, min: u8, max: u8) !u8 {
-    const nn: @Vector(2, u16) = .{ text[0], text[1] };
-    const zero: @Vector(2, u16) = .{ '0', '0' };
-    const mm: @Vector(2, u16) = .{ 10, 1 };
-    const result = @reduce(.Add, (nn -% zero) *% mm);
+    const result = if (use_vectors) result: {
+        const nn: @Vector(2, u16) = .{ text[0], text[1] };
+        const zero: @Vector(2, u16) = .{ '0', '0' };
+        const mm: @Vector(2, u16) = .{ 10, 1 };
+        break :result @reduce(.Add, (nn -% zero) *% mm);
+    } else std.fmt.parseInt(u8, text, 10) catch return error.CertificateTimeInvalid;
     if (result < min) return error.CertificateTimeInvalid;
     if (result > max) return error.CertificateTimeInvalid;
     return @truncate(result);
 }
 
 test parseTimeDigits {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const expectEqual = std.testing.expectEqual;
     try expectEqual(@as(u8, 0), try parseTimeDigits("00", 0, 99));
     try expectEqual(@as(u8, 99), try parseTimeDigits("99", 0, 99));
@@ -638,17 +638,17 @@ test parseTimeDigits {
 }
 
 pub fn parseYear4(text: *const [4]u8) !u16 {
-    const nnnn: @Vector(4, u32) = .{ text[0], text[1], text[2], text[3] };
-    const zero: @Vector(4, u32) = .{ '0', '0', '0', '0' };
-    const mmmm: @Vector(4, u32) = .{ 1000, 100, 10, 1 };
-    const result = @reduce(.Add, (nnnn -% zero) *% mmmm);
+    const result = if (use_vectors) result: {
+        const nnnn: @Vector(4, u32) = .{ text[0], text[1], text[2], text[3] };
+        const zero: @Vector(4, u32) = .{ '0', '0', '0', '0' };
+        const mmmm: @Vector(4, u32) = .{ 1000, 100, 10, 1 };
+        break :result @reduce(.Add, (nnnn -% zero) *% mmmm);
+    } else std.fmt.parseInt(u16, text, 10) catch return error.CertificateTimeInvalid;
     if (result > 9999) return error.CertificateTimeInvalid;
     return @truncate(result);
 }
 
 test parseYear4 {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const expectEqual = std.testing.expectEqual;
     try expectEqual(@as(u16, 0), try parseYear4("0000"));
     try expectEqual(@as(u16, 9999), try parseYear4("9999"));
@@ -1124,4 +1124,4 @@ pub const rsa = struct {
     }
 };
 
-const builtin = @import("builtin");
+const use_vectors = @import("builtin").zig_backend != .stage2_x86_64;
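Note: use_vectors turns the SIMD digit parsing off for the self-hosted x86_64 backend, which cannot lower these vector operations yet. For valid digit input the fallback computes the same value through std.fmt.parseInt; a rough sketch of the scalar equivalent for the two-digit case (the function name is illustrative, not part of the file):

    fn parseTwoDigitsScalar(text: *const [2]u8) !u8 {
        // Same value as @reduce(.Add, (nn -% zero) *% mm) for digit input:
        // (text[0] - '0') * 10 + (text[1] - '0')
        return std.fmt.parseInt(u8, text, 10) catch return error.CertificateTimeInvalid;
    }

The two paths can differ on malformed bytes: the vector path relies only on the min/max bounds to reject garbage, while parseInt rejects any non-digit character outright.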
lib/std/crypto/cmac.zig
@@ -1,5 +1,4 @@
 const std = @import("std");
-const builtin = @import("builtin");
 const crypto = std.crypto;
 const mem = std.mem;
 
@@ -94,8 +93,6 @@ pub fn Cmac(comptime BlockCipher: type) type {
 const testing = std.testing;
 
 test "CmacAes128 - Example 1: len = 0" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const key = [_]u8{
         0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c,
     };
@@ -109,8 +106,6 @@ test "CmacAes128 - Example 1: len = 0" {
 }
 
 test "CmacAes128 - Example 2: len = 16" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const key = [_]u8{
         0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c,
     };
@@ -126,8 +121,6 @@ test "CmacAes128 - Example 2: len = 16" {
 }
 
 test "CmacAes128 - Example 3: len = 40" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const key = [_]u8{
         0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c,
     };
@@ -145,8 +138,6 @@ test "CmacAes128 - Example 3: len = 40" {
 }
 
 test "CmacAes128 - Example 4: len = 64" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const key = [_]u8{
         0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c,
     };
lib/std/crypto/ecdsa.zig
@@ -373,7 +373,6 @@ pub fn Ecdsa(comptime Curve: type, comptime Hash: type) type {
 
 test "ECDSA - Basic operations over EcdsaP384Sha384" {
     if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
 
     const Scheme = EcdsaP384Sha384;
     const kp = try Scheme.KeyPair.create(null);
@@ -407,7 +406,6 @@ test "ECDSA - Basic operations over Secp256k1" {
 
 test "ECDSA - Basic operations over EcdsaP384Sha256" {
     if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
 
     const Scheme = Ecdsa(crypto.ecc.P384, crypto.hash.sha2.Sha256);
     const kp = try Scheme.KeyPair.create(null);
@@ -424,7 +422,6 @@ test "ECDSA - Basic operations over EcdsaP384Sha256" {
 
 test "ECDSA - Verifying a existing signature with EcdsaP384Sha256" {
     if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
 
     const Scheme = Ecdsa(crypto.ecc.P384, crypto.hash.sha2.Sha256);
     // zig fmt: off
@@ -469,7 +466,6 @@ const TestVector = struct {
 
 test "ECDSA - Test vectors from Project Wycheproof" {
     if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
 
     const vectors = [_]TestVector{
         .{ .key = "042927b10512bae3eddcfe467828128bad2903269919f7086069c8c4df6c732838c7787964eaac00e5921fb1498a60f4606766b3d9685001558d1a974e7341513e", .msg = "313233343030", .sig = "304402202ba3a8be6b94d5ec80a6d9d1190a436effe50d85a1eee859b8cc6af9bd5c2e1802204cd60b855d442f5b3c7b11eb6c4e0ae7525fe710fab9aa7c77a67f79e6fadd76", .result = .valid },
@@ -884,7 +880,6 @@ fn tvTry(vector: TestVector) !void {
 
 test "ECDSA - Sec1 encoding/decoding" {
     if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
 
     const Scheme = EcdsaP384Sha384;
     const kp = try Scheme.KeyPair.create(null);
lib/std/crypto/ghash_polyval.zig
@@ -422,8 +422,6 @@ fn Hash(comptime endian: std.builtin.Endian, comptime shift_key: bool) type {
 const htest = @import("test.zig");
 
 test "ghash" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const key = [_]u8{0x42} ** 16;
     const m = [_]u8{0x69} ** 256;
 
@@ -441,8 +439,6 @@ test "ghash" {
 }
 
 test "ghash2" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var key: [16]u8 = undefined;
     var i: usize = 0;
     while (i < key.len) : (i += 1) {
lib/std/crypto/phc_encoding.zig
@@ -1,7 +1,6 @@
 // https://github.com/P-H-C/phc-string-format
 
 const std = @import("std");
-const builtin = @import("builtin");
 const fmt = std.fmt;
 const io = std.io;
 const mem = std.mem;
@@ -264,8 +263,6 @@ fn kvSplit(str: []const u8) !struct { key: []const u8, value: []const u8 } {
 }
 
 test "phc format - encoding/decoding" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const Input = struct {
         str: []const u8,
         HashResult: type,
lib/std/crypto/sha2.zig
@@ -238,7 +238,7 @@ fn Sha2x32(comptime params: Sha2Params32) type {
                         return;
                     },
                     // C backend doesn't currently support passing vectors to inline asm.
-                    .x86_64 => if (builtin.zig_backend != .stage2_c and comptime std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .sha, .avx2 })) {
+                    .x86_64 => if (builtin.zig_backend != .stage2_c and builtin.zig_backend != .stage2_x86_64 and comptime std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .sha, .avx2 })) {
                         var x: v4u32 = [_]u32{ d.s[5], d.s[4], d.s[1], d.s[0] };
                         var y: v4u32 = [_]u32{ d.s[7], d.s[6], d.s[3], d.s[2] };
                         const s_v = @as(*[16]v4u32, @ptrCast(&s));
lib/std/fmt/parse_float.zig
@@ -83,8 +83,6 @@ test "fmt.parseFloat #11169" {
 }
 
 test "fmt.parseFloat hex.special" {
-    if (@import("builtin").zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testing.expect(math.isNan(try parseFloat(f32, "nAn")));
     try testing.expect(math.isPositiveInf(try parseFloat(f32, "iNf")));
     try testing.expect(math.isPositiveInf(try parseFloat(f32, "+Inf")));
lib/std/hash/xxhash.zig
@@ -2,6 +2,7 @@ const std = @import("std");
 const builtin = @import("builtin");
 const mem = std.mem;
 const expectEqual = std.testing.expectEqual;
+const native_endian = builtin.cpu.arch.endian();
 
 const rotl = std.math.rotl;
 
@@ -472,7 +473,7 @@ pub const XxHash3 = struct {
     }
 
     inline fn swap(x: anytype) @TypeOf(x) {
-        return if (builtin.cpu.arch.endian() == .big) @byteSwap(x) else x;
+        return if (native_endian == .big) @byteSwap(x) else x;
     }
 
     inline fn disableAutoVectorization(x: anytype) void {
lib/std/http/Client.zig
@@ -9,6 +9,7 @@ const net = std.net;
 const Uri = std.Uri;
 const Allocator = mem.Allocator;
 const assert = std.debug.assert;
+const use_vectors = builtin.zig_backend != .stage2_x86_64;
 
 const Client = @This();
 const proto = @import("protocol.zig");
@@ -408,7 +409,7 @@ pub const Response = struct {
             else => return error.HttpHeadersInvalid,
         };
         if (first_line[8] != ' ') return error.HttpHeadersInvalid;
-        const status = @as(http.Status, @enumFromInt(parseInt3(first_line[9..12].*)));
+        const status: http.Status = @enumFromInt(parseInt3(first_line[9..12]));
         const reason = mem.trimLeft(u8, first_line[12..], " ");
 
         res.version = version;
@@ -481,20 +482,24 @@ pub const Response = struct {
     }
 
     inline fn int64(array: *const [8]u8) u64 {
-        return @as(u64, @bitCast(array.*));
+        return @bitCast(array.*);
     }
 
-    fn parseInt3(nnn: @Vector(3, u8)) u10 {
-        const zero: @Vector(3, u8) = .{ '0', '0', '0' };
-        const mmm: @Vector(3, u10) = .{ 100, 10, 1 };
-        return @reduce(.Add, @as(@Vector(3, u10), nnn -% zero) *% mmm);
+    fn parseInt3(text: *const [3]u8) u10 {
+        if (use_vectors) {
+            const nnn: @Vector(3, u8) = text.*;
+            const zero: @Vector(3, u8) = .{ '0', '0', '0' };
+            const mmm: @Vector(3, u10) = .{ 100, 10, 1 };
+            return @reduce(.Add, @as(@Vector(3, u10), nnn -% zero) *% mmm);
+        }
+        return std.fmt.parseInt(u10, text, 10) catch unreachable;
     }
 
     test parseInt3 {
         const expectEqual = testing.expectEqual;
-        try expectEqual(@as(u10, 0), parseInt3("000".*));
-        try expectEqual(@as(u10, 418), parseInt3("418".*));
-        try expectEqual(@as(u10, 999), parseInt3("999".*));
+        try expectEqual(@as(u10, 0), parseInt3("000"));
+        try expectEqual(@as(u10, 418), parseInt3("418"));
+        try expectEqual(@as(u10, 999), parseInt3("999"));
     }
 
     version: http.Version,
@@ -1588,7 +1593,8 @@ test {
 
     if (builtin.os.tag == .wasi) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64 and
+        !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .avx)) return error.SkipZigTest;
 
     std.testing.refAllDecls(@This());
 }
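Note: parseInt3 now takes a pointer to a fixed three-byte array instead of a by-value vector, with the SIMD math kept behind use_vectors and a parseInt fallback beside it. Worked through on "418", the vector path computes, lane by lane:

    // {'4','1','8'} -% {'0','0','0'} = {4, 1, 8}
    // {4, 1, 8} *% {100, 10, 1}      = {400, 10, 8}
    // @reduce(.Add, ...)             = 400 + 10 + 8 = 418

The refAllDecls skip at the bottom is also narrowed: it now fires only on the self-hosted x86_64 backend when the target lacks AVX, instead of unconditionally.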
lib/std/http/protocol.zig
@@ -1,8 +1,10 @@
 const std = @import("../std.zig");
+const builtin = @import("builtin");
 const testing = std.testing;
 const mem = std.mem;
 
 const assert = std.debug.assert;
+const use_vectors = builtin.zig_backend != .stage2_x86_64;
 
 pub const State = enum {
     /// Begin header parsing states.
@@ -83,7 +85,7 @@ pub const HeadersParser = struct {
     /// first byte of content is located at `bytes[result]`.
     pub fn findHeadersEnd(r: *HeadersParser, bytes: []const u8) u32 {
         const vector_len: comptime_int = @max(std.simd.suggestVectorSize(u8) orelse 1, 8);
-        const len = @as(u32, @intCast(bytes.len));
+        const len: u32 = @intCast(bytes.len);
         var index: u32 = 0;
 
         while (true) {
@@ -175,18 +177,27 @@ pub const HeadersParser = struct {
                         continue;
                     },
                     else => {
-                        const Vector = @Vector(vector_len, u8);
-                        // const BoolVector = @Vector(vector_len, bool);
-                        const BitVector = @Vector(vector_len, u1);
-                        const SizeVector = @Vector(vector_len, u8);
-
                         const chunk = bytes[index..][0..vector_len];
-                        const v: Vector = chunk.*;
-                        const matches_r = @as(BitVector, @bitCast(v == @as(Vector, @splat('\r'))));
-                        const matches_n = @as(BitVector, @bitCast(v == @as(Vector, @splat('\n'))));
-                        const matches_or: SizeVector = matches_r | matches_n;
-
-                        const matches = @reduce(.Add, matches_or);
+                        const matches = if (use_vectors) matches: {
+                            const Vector = @Vector(vector_len, u8);
+                            // const BoolVector = @Vector(vector_len, bool);
+                            const BitVector = @Vector(vector_len, u1);
+                            const SizeVector = @Vector(vector_len, u8);
+
+                            const v: Vector = chunk.*;
+                            const matches_r: BitVector = @bitCast(v == @as(Vector, @splat('\r')));
+                            const matches_n: BitVector = @bitCast(v == @as(Vector, @splat('\n')));
+                            const matches_or: SizeVector = matches_r | matches_n;
+
+                            break :matches @reduce(.Add, matches_or);
+                        } else matches: {
+                            var matches: u8 = 0;
+                            for (chunk) |byte| switch (byte) {
+                                '\r', '\n' => matches += 1,
+                                else => {},
+                            };
+                            break :matches matches;
+                        };
                         switch (matches) {
                             0 => {},
                             1 => switch (chunk[vector_len - 1]) {
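Note: the scalar fallback counts the same quantity as the SIMD reduce, namely how many bytes of the current vector_len-sized chunk equal '\r' or '\n', so the switch (matches) that follows behaves identically whichever branch produced the count. An illustrative standalone check (assumes std is in scope; not part of the file):

    test "matches count sketch" {
        const chunk = "ab\r\ncd\r\n";
        var matches: u8 = 0;
        for (chunk) |byte| switch (byte) {
            '\r', '\n' => matches += 1,
            else => {},
        };
        try std.testing.expectEqual(@as(u8, 4), matches);
    }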
lib/std/http/Server.zig
@@ -736,8 +736,6 @@ test "HTTP server handles a chunked transfer coding request" {
         return error.SkipZigTest;
     }
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const native_endian = comptime builtin.cpu.arch.endian();
     if (builtin.zig_backend == .stage2_llvm and native_endian == .big) {
         // https://github.com/ziglang/zig/issues/13782
lib/std/math/big/int.zig
@@ -1318,7 +1318,7 @@ pub const Mutable = struct {
     ///
     /// `limbs_buffer` is used for temporary storage.
     /// The amount required is given by `calcPowLimbsBufferLen`.
-    pub fn pow(r: *Mutable, a: Const, b: u32, limbs_buffer: []Limb) !void {
+    pub fn pow(r: *Mutable, a: Const, b: u32, limbs_buffer: []Limb) void {
         assert(r.limbs.ptr != a.limbs.ptr); // illegal aliasing
 
         // Handle all the trivial cases first
@@ -3213,7 +3213,7 @@ pub const Managed = struct {
             var m = try Managed.initCapacity(rma.allocator, needed_limbs);
             errdefer m.deinit();
             var m_mut = m.toMutable();
-            try m_mut.pow(a.toConst(), b, limbs_buffer);
+            m_mut.pow(a.toConst(), b, limbs_buffer);
             m.setMetadata(m_mut.positive, m_mut.len);
 
             rma.deinit();
@@ -3221,7 +3221,7 @@ pub const Managed = struct {
         } else {
             try rma.ensureCapacity(needed_limbs);
             var rma_mut = rma.toMutable();
-            try rma_mut.pow(a.toConst(), b, limbs_buffer);
+            rma_mut.pow(a.toConst(), b, limbs_buffer);
             rma.setMetadata(rma_mut.positive, rma_mut.len);
         }
     }
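Note: Mutable.pow writes into preallocated r.limbs and the caller-supplied limbs_buffer (sized via calcPowLimbsBufferLen), so it has no failure path and the stray !void is dropped; Managed.pow keeps its error union because it still allocates. A hedged usage sketch (allocator and operands are placeholders):

    var a = try Managed.initSet(allocator, 3);
    defer a.deinit();
    var r = try Managed.init(allocator);
    defer r.deinit();
    try r.pow(&a, 5); // Managed.pow may allocate, hence still try
    // A Mutable with enough limbs now calls pow with no try at all.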
lib/std/math/big/int_test.zig
@@ -2568,8 +2568,6 @@ test "big.int const to managed" {
 }
 
 test "big.int pow" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     {
         var a = try Managed.initSet(testing.allocator, -3);
         defer a.deinit();
@@ -2763,8 +2761,6 @@ fn popCountTest(val: *const Managed, bit_count: usize, expected: usize) !void {
 }
 
 test "big int conversion read/write twos complement" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var a = try Managed.initSet(testing.allocator, (1 << 493) - 1);
     defer a.deinit();
     var b = try Managed.initSet(testing.allocator, (1 << 493) - 1);
@@ -2863,8 +2859,6 @@ test "big int write twos complement +/- zero" {
 }
 
 test "big int conversion write twos complement with padding" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var a = try Managed.initSet(testing.allocator, 0x01_ffffffff_ffffffff_ffffffff);
     defer a.deinit();
 
lib/std/net/test.zig
@@ -60,7 +60,7 @@ test "parse and render IPv6 addresses" {
 }
 
 test "invalid but parseable IPv6 scope ids" {
-    if (builtin.os.tag != .linux or comptime !builtin.os.tag.isDarwin()) {
+    if (builtin.os.tag != .linux and comptime !builtin.os.tag.isDarwin()) {
         // Currently, resolveIp6 with alphanumerical scope IDs only works on Linux.
         // TODO Make this test pass on other operating systems.
         return error.SkipZigTest;
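Note: the old condition was a tautology. On Linux the left operand is false but the right is true; on Darwin the left is true but the right is false; with "or", either way skips, so the test never ran anywhere. With "and", the skip fires only on platforms that are neither Linux nor Darwin, matching the comment's intent:

    // Linux:  false or true = skip (old)    false and true = run (new)
    // Darwin: true or false = skip (old)    true and false = run (new)
    // other:  true or true  = skip          true and true  = skip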
lib/std/os/test.zig
@@ -375,8 +375,6 @@ fn testThreadIdFn(thread_id: *Thread.Id) void {
 test "std.Thread.getCurrentId" {
     if (builtin.single_threaded) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var thread_current_id: Thread.Id = undefined;
     const thread = try Thread.spawn(.{}, testThreadIdFn, .{&thread_current_id});
     thread.join();
@@ -420,8 +418,6 @@ test "cpu count" {
 test "thread local storage" {
     if (builtin.single_threaded) return error.SkipZigTest;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const thread1 = try Thread.spawn(.{}, testTls, .{});
     const thread2 = try Thread.spawn(.{}, testTls, .{});
     try testTls();
lib/std/rand/test.zig
@@ -1,5 +1,4 @@
 const std = @import("../std.zig");
-const builtin = @import("builtin");
 const math = std.math;
 const DefaultPrng = std.rand.DefaultPrng;
 const Random = std.rand.Random;
@@ -200,8 +199,6 @@ fn testRandomIntLessThan() !void {
 }
 
 test "Random intAtMost" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     @setEvalBranchQuota(10000);
     try testRandomIntAtMost();
     try comptime testRandomIntAtMost();
@@ -242,8 +239,6 @@ fn testRandomIntAtMost() !void {
 }
 
 test "Random Biased" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var prng = DefaultPrng.init(0);
     const random = prng.random();
     // Not thoroughly checking the logic here.
@@ -452,8 +447,6 @@ test "CSPRNG" {
 }
 
 test "Random weightedIndex" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     // Make sure weightedIndex works for various integers and floats
     inline for (.{ u64, i4, f32, f64 }) |T| {
         var prng = DefaultPrng.init(0);
lib/std/Thread/Condition.zig
@@ -324,8 +324,6 @@ test "Condition - wait and signal" {
         return error.SkipZigTest;
     }
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const num_threads = 4;
 
     const MultiWait = struct {
@@ -371,8 +369,6 @@ test "Condition - signal" {
         return error.SkipZigTest;
     }
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const num_threads = 4;
 
     const SignalTest = struct {
@@ -440,8 +436,6 @@ test "Condition - multi signal" {
         return error.SkipZigTest;
     }
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const num_threads = 4;
     const num_iterations = 4;
 
@@ -504,8 +498,6 @@ test "Condition - broadcasting" {
         return error.SkipZigTest;
     }
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const num_threads = 10;
 
     const BroadcastTest = struct {
@@ -573,8 +565,6 @@ test "Condition - broadcasting - wake all threads" {
         return error.SkipZigTest;
     }
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     var num_runs: usize = 1;
     const num_threads = 10;
 
lib/std/Thread/Mutex.zig
@@ -289,8 +289,6 @@ test "Mutex - many contended" {
         return error.SkipZigTest;
     }
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const num_threads = 4;
     const num_increments = 1000;
 
lib/std/Thread/RwLock.zig
@@ -297,8 +297,6 @@ test "RwLock - concurrent access" {
     if (builtin.single_threaded)
         return;
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const num_writers: usize = 2;
     const num_readers: usize = 4;
     const num_writes: usize = 10000;
lib/std/Thread/Semaphore.zig
@@ -39,8 +39,6 @@ test "Thread.Semaphore" {
         return error.SkipZigTest;
     }
 
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const TestContext = struct {
         sem: *Semaphore,
         n: *i32,
lib/std/zig/tokenizer.zig
@@ -1,5 +1,4 @@
 const std = @import("../std.zig");
-const builtin = @import("builtin");
 
 pub const Token = struct {
     tag: Tag,
@@ -1450,8 +1449,6 @@ test "chars" {
 }
 
 test "invalid token characters" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testTokenize("#", &.{.invalid});
     try testTokenize("`", &.{.invalid});
     try testTokenize("'c", &.{.invalid});
@@ -1571,8 +1568,6 @@ test "pipe and then invalid" {
 }
 
 test "line comment and doc comment" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testTokenize("//", &.{});
     try testTokenize("// a / b", &.{});
     try testTokenize("// /", &.{});
@@ -1647,8 +1642,6 @@ test "range literals" {
 }
 
 test "number literals decimal" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testTokenize("0", &.{.number_literal});
     try testTokenize("1", &.{.number_literal});
     try testTokenize("2", &.{.number_literal});
@@ -1897,8 +1890,6 @@ test "invalid token with unfinished escape right before eof" {
 }
 
 test "saturating operators" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testTokenize("<<", &.{.angle_bracket_angle_bracket_left});
     try testTokenize("<<|", &.{.angle_bracket_angle_bracket_left_pipe});
     try testTokenize("<<|=", &.{.angle_bracket_angle_bracket_left_pipe_equal});
lib/std/base64.zig
@@ -355,8 +355,6 @@ pub const Base64DecoderWithIgnore = struct {
 };
 
 test "base64" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     @setEvalBranchQuota(8000);
     try testBase64();
     try comptime testAllApis(standard, "comptime", "Y29tcHRpbWU=");
@@ -377,8 +375,6 @@ test "base64 padding dest overflow" {
 }
 
 test "base64 url_safe_no_pad" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     @setEvalBranchQuota(8000);
     try testBase64UrlSafeNoPad();
     try comptime testAllApis(url_safe_no_pad, "comptime", "Y29tcHRpbWU");
lib/std/bit_set.zig
@@ -1638,7 +1638,6 @@ fn testStaticBitSet(comptime Set: type) !void {
 
 test "IntegerBitSet" {
     if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
 
     try testStaticBitSet(IntegerBitSet(0));
     try testStaticBitSet(IntegerBitSet(1));
@@ -1651,8 +1650,6 @@ test "IntegerBitSet" {
 }
 
 test "ArrayBitSet" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     inline for (.{ 0, 1, 2, 31, 32, 33, 63, 64, 65, 254, 500, 3000 }) |size| {
         try testStaticBitSet(ArrayBitSet(u8, size));
         try testStaticBitSet(ArrayBitSet(u16, size));
lib/std/math.zig
@@ -492,8 +492,6 @@ pub fn shl(comptime T: type, a: T, shift_amt: anytype) T {
 }
 
 test "shl" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     if (builtin.zig_backend == .stage2_llvm and builtin.cpu.arch == .aarch64) {
         // https://github.com/ziglang/zig/issues/12012
         return error.SkipZigTest;
@@ -539,8 +537,6 @@ pub fn shr(comptime T: type, a: T, shift_amt: anytype) T {
 }
 
 test "shr" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     if (builtin.zig_backend == .stage2_llvm and builtin.cpu.arch == .aarch64) {
         // https://github.com/ziglang/zig/issues/12012
         return error.SkipZigTest;
@@ -587,8 +583,6 @@ pub fn rotr(comptime T: type, x: T, r: anytype) T {
 }
 
 test "rotr" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     if (builtin.zig_backend == .stage2_llvm and builtin.cpu.arch == .aarch64) {
         // https://github.com/ziglang/zig/issues/12012
         return error.SkipZigTest;
@@ -634,8 +628,6 @@ pub fn rotl(comptime T: type, x: T, r: anytype) T {
 }
 
 test "rotl" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     if (builtin.zig_backend == .stage2_llvm and builtin.cpu.arch == .aarch64) {
         // https://github.com/ziglang/zig/issues/12012
         return error.SkipZigTest;
@@ -764,8 +756,6 @@ pub fn divTrunc(comptime T: type, numerator: T, denominator: T) !T {
 }
 
 test "divTrunc" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testDivTrunc();
     try comptime testDivTrunc();
 }
@@ -790,8 +780,6 @@ pub fn divFloor(comptime T: type, numerator: T, denominator: T) !T {
 }
 
 test "divFloor" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testDivFloor();
     try comptime testDivFloor();
 }
@@ -829,8 +817,6 @@ pub fn divCeil(comptime T: type, numerator: T, denominator: T) !T {
 }
 
 test "divCeil" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testDivCeil();
     try comptime testDivCeil();
 }
@@ -875,8 +861,6 @@ pub fn divExact(comptime T: type, numerator: T, denominator: T) !T {
 }
 
 test "divExact" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testDivExact();
     try comptime testDivExact();
 }
@@ -903,8 +887,6 @@ pub fn mod(comptime T: type, numerator: T, denominator: T) !T {
 }
 
 test "mod" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testMod();
     try comptime testMod();
 }
@@ -931,8 +913,6 @@ pub fn rem(comptime T: type, numerator: T, denominator: T) !T {
 }
 
 test "rem" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     try testRem();
     try comptime testRem();
 }
@@ -1285,7 +1265,8 @@ pub fn lerp(a: anytype, b: anytype, t: anytype) @TypeOf(a, b, t) {
 }
 
 test "lerp" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64 and
+        !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .fma)) return error.SkipZigTest;
 
     try testing.expectEqual(@as(f64, 75), lerp(50, 100, 0.5));
     try testing.expectEqual(@as(f32, 43.75), lerp(50, 25, 0.25));
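Note: most of these math skips are simply deleted, but lerp stays gated on the fma CPU feature because std.math.lerp is built on @mulAdd, which the self-hosted backend lowers through the hardware FMA instructions. Assumed shape of the function being exercised (a sketch):

    pub fn lerp(a: anytype, b: anytype, t: anytype) @TypeOf(a, b, t) {
        return @mulAdd(@TypeOf(a, b, t), b - a, t, a);
    }

The avx and fma gates here and in http/Client.zig follow the same pattern: skip only when the backend would need an instruction the target does not have.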
lib/std/mem.zig
@@ -315,8 +315,6 @@ pub fn zeroes(comptime T: type) T {
 }
 
 test "zeroes" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     const C_struct = extern struct {
         x: u32,
         y: u32 align(128),
@@ -4342,8 +4340,6 @@ pub fn alignInSlice(slice: anytype, comptime new_alignment: usize) ?AlignedSlice
 }
 
 test "read/write(Var)PackedInt" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     switch (builtin.cpu.arch) {
         // This test generates too much code to execute on WASI.
         // LLVM backend fails with "too many locals: locals exceed maximum"
lib/std/once.zig
@@ -46,8 +46,6 @@ fn incr() void {
 }
 
 test "Once executes its function just once" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
-
     if (builtin.single_threaded) {
         global_once.call();
         global_once.call();
src/arch/x86_64/bits.zig
@@ -237,7 +237,7 @@ pub const Register = enum(u7) {
         return @intCast(@intFromEnum(reg) - base);
     }
 
-    pub fn bitSize(reg: Register) u64 {
+    pub fn bitSize(reg: Register) u10 {
         return switch (@intFromEnum(reg)) {
             // zig fmt: off
             @intFromEnum(Register.rax)  ... @intFromEnum(Register.r15)   => 64,
src/arch/x86_64/CodeGen.zig
@@ -388,7 +388,7 @@ pub const MCValue = union(enum) {
         };
     }
 
-    fn mem(mcv: MCValue, size: Memory.Size) Memory {
+    fn mem(mcv: MCValue, function: *Self, size: Memory.Size) !Memory {
         return switch (mcv) {
             .none,
             .unreach,
@@ -409,7 +409,6 @@ pub const MCValue = union(enum) {
             .lea_frame,
             .reserved_frame,
             .air_ref,
-            .load_symbol,
             .lea_symbol,
             => unreachable,
             .memory => |addr| if (math.cast(i32, @as(i64, @bitCast(addr)))) |small_addr| .{
@@ -433,6 +432,19 @@ pub const MCValue = union(enum) {
                     .disp = frame_addr.off,
                 } },
             },
+            .load_symbol => |sym_off| {
+                assert(sym_off.off == 0);
+                return .{
+                    .base = .{ .reloc = .{
+                        .atom_index = try function.owner.getSymbolIndex(function),
+                        .sym_index = sym_off.sym,
+                    } },
+                    .mod = .{ .rm = .{
+                        .size = size,
+                        .disp = sym_off.off,
+                    } },
+                };
+            },
         };
     }
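Note: mem gains a function parameter and an error union because the new .load_symbol case has to materialize a relocation: it asks the owner for its atom index (function.owner.getSymbolIndex(function), which can fail) and encodes the target symbol as the memory base. Every call site in the hunks below changes shape accordingly:

    // before:  mcv.mem(.qword)
    // after:   try mcv.mem(self, .qword)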
 
@@ -722,12 +734,14 @@ const InstTracking = struct {
 
 const FrameAlloc = struct {
     abi_size: u31,
+    spill_pad: u3,
     abi_align: Alignment,
     ref_count: u16,
 
-    fn init(alloc_abi: struct { size: u64, alignment: Alignment }) FrameAlloc {
+    fn init(alloc_abi: struct { size: u64, pad: u3 = 0, alignment: Alignment }) FrameAlloc {
         return .{
             .abi_size = @intCast(alloc_abi.size),
+            .spill_pad = alloc_abi.pad,
             .abi_align = alloc_abi.alignment,
             .ref_count = 0,
         };
@@ -738,6 +752,20 @@ const FrameAlloc = struct {
             .alignment = ty.abiAlignment(mod),
         });
     }
+    fn initSpill(ty: Type, mod: *Module) FrameAlloc {
+        const abi_size = ty.abiSize(mod);
+        const spill_size = if (abi_size < 8)
+            math.ceilPowerOfTwoAssert(u64, abi_size)
+        else
+            std.mem.alignForward(u64, abi_size, 8);
+        return init(.{
+            .size = spill_size,
+            .pad = @intCast(spill_size - abi_size),
+            .alignment = ty.abiAlignment(mod).maxStrict(
+                Alignment.fromNonzeroByteUnits(@min(spill_size, 8)),
+            ),
+        });
+    }
 };
 
 const StackAllocation = struct {
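Note: initSpill sizes spill slots so that full-width register stores stay in bounds: sizes below 8 bytes round up to the next power of two, larger sizes round up to a multiple of 8, and spill_pad records the slack. Worked examples (illustrative):

    // abi_size = 3  -> spill_size = 4  (next pow2),     pad = 1, align >= 4
    // abi_size = 8  -> spill_size = 8,                  pad = 0, align >= 8
    // abi_size = 12 -> spill_size = 16 (multiple of 8), pad = 4, align >= 8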
@@ -1668,8 +1696,7 @@ fn gen(self: *Self) InnerError!void {
                 // The address where to store the return value for the caller is in a
                 // register which the callee is free to clobber. Therefore, we purposely
                 // spill it to stack immediately.
-                const frame_index =
-                    try self.allocFrameIndex(FrameAlloc.initType(Type.usize, mod));
+                const frame_index = try self.allocFrameIndex(FrameAlloc.initSpill(Type.usize, mod));
                 try self.genSetMem(
                     .{ .frame = frame_index },
                     0,
@@ -2434,7 +2461,7 @@ fn allocRegOrMemAdvanced(self: *Self, ty: Type, inst: ?Air.Inst.Index, reg_ok: b
         }
     }
 
-    const frame_index = try self.allocFrameIndex(FrameAlloc.initType(ty, mod));
+    const frame_index = try self.allocFrameIndex(FrameAlloc.initSpill(ty, mod));
     return .{ .load_frame = .{ .index = frame_index } };
 }
 
@@ -2445,7 +2472,10 @@ fn regClassForType(self: *Self, ty: Type) RegisterManager.RegisterBitSet {
             80 => abi.RegisterClass.x87,
             else => abi.RegisterClass.sse,
         },
-        .Vector => abi.RegisterClass.sse,
+        .Vector => switch (ty.childType(mod).toIntern()) {
+            .bool_type => abi.RegisterClass.gp,
+            else => abi.RegisterClass.sse,
+        },
         else => abi.RegisterClass.gp,
     };
 }
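Note: a @Vector(N, bool) value is represented as an N-bit mask rather than as N SSE lanes, so bool vectors now allocate to general-purpose registers; keeping them in RegisterClass.sse would force needless cross-domain moves.

    // e.g. @Vector(8, bool) ~ an 8-bit mask in a GP byte register, not an xmm lane set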
@@ -2699,7 +2729,7 @@ fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void {
                 .{ .v_ss, .cvtsd2 },
                 dst_reg,
                 dst_reg,
-                src_mcv.mem(.qword),
+                try src_mcv.mem(self, .qword),
             ) else try self.asmRegisterRegisterRegister(
                 .{ .v_ss, .cvtsd2 },
                 dst_reg,
@@ -2711,7 +2741,7 @@ fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void {
             ) else if (src_mcv.isMemory()) try self.asmRegisterMemory(
                 .{ ._ss, .cvtsd2 },
                 dst_reg,
-                src_mcv.mem(.qword),
+                try src_mcv.mem(self, .qword),
             ) else try self.asmRegisterRegister(
                 .{ ._ss, .cvtsd2 },
                 dst_reg,
@@ -2798,7 +2828,7 @@ fn airFpext(self: *Self, inst: Air.Inst.Index) !void {
                 .{ .v_sd, .cvtss2 },
                 dst_reg,
                 dst_reg,
-                src_mcv.mem(.dword),
+                try src_mcv.mem(self, .dword),
             ) else try self.asmRegisterRegisterRegister(
                 .{ .v_sd, .cvtss2 },
                 dst_reg,
@@ -2810,7 +2840,7 @@ fn airFpext(self: *Self, inst: Air.Inst.Index) !void {
             ) else if (src_mcv.isMemory()) try self.asmRegisterMemory(
                 .{ ._sd, .cvtss2 },
                 dst_reg,
-                src_mcv.mem(.dword),
+                try src_mcv.mem(self, .dword),
             ) else try self.asmRegisterRegister(
                 .{ ._sd, .cvtss2 },
                 dst_reg,
@@ -2851,8 +2881,8 @@ fn airIntCast(self: *Self, inst: Air.Inst.Index) !void {
         };
 
         const dst_mcv = if (dst_int_info.bits <= src_storage_bits and
-            std.math.divCeil(u16, dst_int_info.bits, 64) catch unreachable ==
-            std.math.divCeil(u32, src_storage_bits, 64) catch unreachable and
+            math.divCeil(u16, dst_int_info.bits, 64) catch unreachable ==
+            math.divCeil(u32, src_storage_bits, 64) catch unreachable and
             self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) src_mcv else dst: {
             const dst_mcv = try self.allocRegOrMem(inst, true);
             try self.genCopy(min_ty, dst_mcv, src_mcv);
@@ -2869,22 +2899,28 @@ fn airIntCast(self: *Self, inst: Air.Inst.Index) !void {
             break :result .{ .register = registerAlias(dst_mcv.getReg().?, abi_size) };
         }
 
-        const src_limbs_len = std.math.divCeil(u16, src_int_info.bits, 64) catch unreachable;
-        const dst_limbs_len = std.math.divCeil(u16, dst_int_info.bits, 64) catch unreachable;
+        const src_limbs_len = math.divCeil(u16, src_int_info.bits, 64) catch unreachable;
+        const dst_limbs_len = math.divCeil(u16, dst_int_info.bits, 64) catch unreachable;
 
-        const high_mcv = dst_mcv.address().offset((src_limbs_len - 1) * 8).deref();
-        const high_reg = try self.copyToTmpRegister(switch (src_int_info.signedness) {
-            .signed => Type.isize,
-            .unsigned => Type.usize,
-        }, high_mcv);
+        const high_mcv: MCValue = if (dst_mcv.isMemory())
+            dst_mcv.address().offset((src_limbs_len - 1) * 8).deref()
+        else
+            .{ .register = dst_mcv.register_pair[1] };
+        const high_reg = if (high_mcv.isRegister())
+            high_mcv.getReg().?
+        else
+            try self.copyToTmpRegister(switch (src_int_info.signedness) {
+                .signed => Type.isize,
+                .unsigned => Type.usize,
+            }, high_mcv);
         const high_lock = self.register_manager.lockRegAssumeUnused(high_reg);
         defer self.register_manager.unlockReg(high_lock);
 
         const high_bits = src_int_info.bits % 64;
         if (high_bits > 0) {
-            const high_ty = try mod.intType(extend, high_bits);
-            try self.truncateRegister(high_ty, high_reg);
-            try self.genCopy(Type.usize, high_mcv, .{ .register = high_reg });
+            try self.truncateRegister(src_ty, high_reg);
+            const high_ty = if (dst_int_info.bits >= 64) Type.usize else dst_ty;
+            try self.genCopy(high_ty, high_mcv, .{ .register = high_reg });
         }
 
         if (dst_limbs_len > src_limbs_len) try self.genInlineMemset(
@@ -2995,14 +3031,14 @@ fn airTrunc(self: *Self, inst: Air.Inst.Index) !void {
                     .{ .vp_, .@"and" },
                     dst_reg,
                     dst_reg,
-                    splat_addr_mcv.deref().mem(Memory.Size.fromSize(splat_abi_size)),
+                    try splat_addr_mcv.deref().mem(self, Memory.Size.fromSize(splat_abi_size)),
                 );
                 try self.asmRegisterRegisterRegister(mir_tag, dst_reg, dst_reg, dst_reg);
             } else {
                 try self.asmRegisterMemory(
                     .{ .p_, .@"and" },
                     dst_reg,
-                    splat_addr_mcv.deref().mem(Memory.Size.fromSize(splat_abi_size)),
+                    try splat_addr_mcv.deref().mem(self, Memory.Size.fromSize(splat_abi_size)),
                 );
                 try self.asmRegisterRegister(mir_tag, dst_reg, dst_reg);
             }
@@ -3048,7 +3084,7 @@ fn airSlice(self: *Self, inst: Air.Inst.Index) !void {
     const len = try self.resolveInst(bin_op.rhs);
     const len_ty = self.typeOf(bin_op.rhs);
 
-    const frame_index = try self.allocFrameIndex(FrameAlloc.initType(slice_ty, mod));
+    const frame_index = try self.allocFrameIndex(FrameAlloc.initSpill(slice_ty, mod));
     try self.genSetMem(.{ .frame = frame_index }, 0, ptr_ty, ptr);
     try self.genSetMem(
         .{ .frame = frame_index },
@@ -3068,8 +3104,36 @@ fn airUnOp(self: *Self, inst: Air.Inst.Index, tag: Air.Inst.Tag) !void {
 }
 
 fn airBinOp(self: *Self, inst: Air.Inst.Index, tag: Air.Inst.Tag) !void {
+    const mod = self.bin_file.options.module.?;
     const bin_op = self.air.instructions.items(.data)[inst].bin_op;
     const dst_mcv = try self.genBinOp(inst, tag, bin_op.lhs, bin_op.rhs);
+
+    const dst_ty = self.typeOfIndex(inst);
+    if (dst_ty.isAbiInt(mod)) {
+        const abi_size: u32 = @intCast(dst_ty.abiSize(mod));
+        const bit_size: u32 = @intCast(dst_ty.bitSize(mod));
+        if (abi_size * 8 > bit_size) {
+            const dst_lock = switch (dst_mcv) {
+                .register => |dst_reg| self.register_manager.lockRegAssumeUnused(dst_reg),
+                else => null,
+            };
+            defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
+
+            if (dst_mcv.isRegister()) {
+                try self.truncateRegister(dst_ty, dst_mcv.getReg().?);
+            } else {
+                const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
+                const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+                defer self.register_manager.unlockReg(tmp_lock);
+
+                const hi_ty = try mod.intType(.unsigned, @intCast((dst_ty.bitSize(mod) - 1) % 64 + 1));
+                const hi_mcv = dst_mcv.address().offset(@intCast(bit_size / 64 * 8)).deref();
+                try self.genSetReg(tmp_reg, hi_ty, hi_mcv);
+                try self.truncateRegister(dst_ty, tmp_reg);
+                try self.genCopy(hi_ty, hi_mcv, .{ .register = tmp_reg });
+            }
+        }
+    }
     return self.finishAir(inst, dst_mcv, .{ bin_op.lhs, bin_op.rhs, .none });
 }
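Note: when an integer type's ABI size over-allocates storage, the bits above bit_size are left undefined by a plain binary op, so the result is canonicalized immediately: a register result is truncated in place, and a memory result has its top limb pulled into a scratch register, truncated, and stored back. Sketched on a u33 register result (illustrative, unsigned case):

    // u33 add: bits 33..63 of the 64-bit register may hold carry garbage;
    // truncateRegister masks them off (conceptually: x &= (1 << 33) - 1)
    // so that later reads of the full limb see a canonical value.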
 
@@ -3176,7 +3240,7 @@ fn airMulDivBinOp(self: *Self, inst: Air.Inst.Index) !void {
                     if (mat_lhs_mcv.isMemory()) try self.asmRegisterMemory(
                         .{ ._, .mov },
                         tmp_reg,
-                        mat_lhs_mcv.address().offset(8).deref().mem(.qword),
+                        try mat_lhs_mcv.address().offset(8).deref().mem(self, .qword),
                     ) else try self.asmRegisterRegister(
                         .{ ._, .mov },
                         tmp_reg,
@@ -3200,7 +3264,7 @@ fn airMulDivBinOp(self: *Self, inst: Air.Inst.Index) !void {
                     if (mat_rhs_mcv.isMemory()) try self.asmRegisterMemory(
                         .{ ._, .xor },
                         tmp_reg,
-                        mat_rhs_mcv.address().offset(8).deref().mem(.qword),
+                        try mat_rhs_mcv.address().offset(8).deref().mem(self, .qword),
                     ) else try self.asmRegisterRegister(
                         .{ ._, .xor },
                         tmp_reg,
@@ -3300,12 +3364,12 @@ fn airMulDivBinOp(self: *Self, inst: Air.Inst.Index) !void {
                             try self.asmRegisterMemory(
                                 .{ ._, .add },
                                 tmp_regs[0],
-                                mat_rhs_mcv.mem(.qword),
+                                try mat_rhs_mcv.mem(self, .qword),
                             );
                             try self.asmRegisterMemory(
                                 .{ ._, .adc },
                                 tmp_regs[1],
-                                mat_rhs_mcv.address().offset(8).deref().mem(.qword),
+                                try mat_rhs_mcv.address().offset(8).deref().mem(self, .qword),
                             );
                         } else for (
                             [_]Mir.Inst.Tag{ .add, .adc },
@@ -3534,7 +3598,7 @@ fn airMulSat(self: *Self, inst: Air.Inst.Index) !void {
             if (mat_lhs_mcv.isMemory()) try self.asmRegisterMemory(
                 .{ ._, .mov },
                 tmp_reg,
-                mat_lhs_mcv.address().offset(8).deref().mem(.qword),
+                try mat_lhs_mcv.address().offset(8).deref().mem(self, .qword),
             ) else try self.asmRegisterRegister(
                 .{ ._, .mov },
                 tmp_reg,
@@ -3558,7 +3622,7 @@ fn airMulSat(self: *Self, inst: Air.Inst.Index) !void {
             if (mat_rhs_mcv.isMemory()) try self.asmRegisterMemory(
                 .{ ._, .xor },
                 tmp_reg,
-                mat_rhs_mcv.address().offset(8).deref().mem(.qword),
+                try mat_rhs_mcv.address().offset(8).deref().mem(self, .qword),
             ) else try self.asmRegisterRegister(
                 .{ ._, .xor },
                 tmp_reg,
@@ -3567,7 +3631,7 @@ fn airMulSat(self: *Self, inst: Air.Inst.Index) !void {
 
             try self.asmRegisterImmediate(.{ ._r, .sa }, tmp_reg, Immediate.u(63));
             try self.asmRegister(.{ ._, .not }, tmp_reg);
-            try self.asmMemoryImmediate(.{ ._, .cmp }, overflow.mem(.dword), Immediate.s(0));
+            try self.asmMemoryImmediate(.{ ._, .cmp }, try overflow.mem(self, .dword), Immediate.s(0));
             try self.freeValue(overflow);
             try self.asmCmovccRegisterRegister(.ne, dst_mcv.register_pair[0], tmp_reg);
             try self.asmRegisterImmediate(.{ ._c, .bt }, tmp_reg, Immediate.u(63));
@@ -3665,7 +3729,7 @@ fn airAddSubWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
                     }
 
                     const frame_index =
-                        try self.allocFrameIndex(FrameAlloc.initType(tuple_ty, mod));
+                        try self.allocFrameIndex(FrameAlloc.initSpill(tuple_ty, mod));
                     try self.genSetMem(
                         .{ .frame = frame_index },
                         @intCast(tuple_ty.structFieldOffset(1, mod)),
@@ -3682,7 +3746,7 @@ fn airAddSubWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
                 }
 
                 const frame_index =
-                    try self.allocFrameIndex(FrameAlloc.initType(tuple_ty, mod));
+                    try self.allocFrameIndex(FrameAlloc.initSpill(tuple_ty, mod));
                 try self.genSetFrameTruncatedOverflowCompare(tuple_ty, frame_index, partial_mcv, cc);
                 break :result .{ .load_frame = .{ .index = frame_index } };
             },
@@ -3738,7 +3802,7 @@ fn airShlWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
                     }
 
                     const frame_index =
-                        try self.allocFrameIndex(FrameAlloc.initType(tuple_ty, mod));
+                        try self.allocFrameIndex(FrameAlloc.initSpill(tuple_ty, mod));
                     try self.genSetMem(
                         .{ .frame = frame_index },
                         @intCast(tuple_ty.structFieldOffset(1, mod)),
@@ -3755,7 +3819,7 @@ fn airShlWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
                 }
 
                 const frame_index =
-                    try self.allocFrameIndex(FrameAlloc.initType(tuple_ty, mod));
+                    try self.allocFrameIndex(FrameAlloc.initSpill(tuple_ty, mod));
                 try self.genSetFrameTruncatedOverflowCompare(tuple_ty, frame_index, partial_mcv, cc);
                 break :result .{ .load_frame = .{ .index = frame_index } };
             },
@@ -3874,7 +3938,7 @@ fn airMulWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
                     );
                     try self.asmMemoryImmediate(
                         .{ ._, .cmp },
-                        overflow.mem(self.memSize(Type.c_int)),
+                        try overflow.mem(self, self.memSize(Type.c_int)),
                         Immediate.s(0),
                     );
                     try self.genSetMem(
@@ -3926,14 +3990,19 @@ fn airMulWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
                     };
                     defer if (mat_rhs_lock) |lock| self.register_manager.unlockReg(lock);
 
-                    if (mat_lhs_mcv.isMemory())
-                        try self.asmRegisterMemory(.{ ._, .mov }, .rax, mat_lhs_mcv.mem(.qword))
-                    else
-                        try self.asmRegisterRegister(.{ ._, .mov }, .rax, mat_lhs_mcv.register_pair[0]);
+                    if (mat_lhs_mcv.isMemory()) try self.asmRegisterMemory(
+                        .{ ._, .mov },
+                        .rax,
+                        try mat_lhs_mcv.mem(self, .qword),
+                    ) else try self.asmRegisterRegister(
+                        .{ ._, .mov },
+                        .rax,
+                        mat_lhs_mcv.register_pair[0],
+                    );
                     if (mat_rhs_mcv.isMemory()) try self.asmRegisterMemory(
                         .{ ._, .mov },
                         tmp_regs[0],
-                        mat_rhs_mcv.address().offset(8).deref().mem(.qword),
+                        try mat_rhs_mcv.address().offset(8).deref().mem(self, .qword),
                     ) else try self.asmRegisterRegister(
                         .{ ._, .mov },
                         tmp_regs[0],
@@ -3944,7 +4013,7 @@ fn airMulWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
                     try self.asmRegisterRegister(.{ .i_, .mul }, tmp_regs[0], .rax);
                     try self.asmSetccRegister(.o, tmp_regs[2].to8());
                     if (mat_rhs_mcv.isMemory())
-                        try self.asmMemory(.{ ._, .mul }, mat_rhs_mcv.mem(.qword))
+                        try self.asmMemory(.{ ._, .mul }, try mat_rhs_mcv.mem(self, .qword))
                     else
                         try self.asmRegister(.{ ._, .mul }, mat_rhs_mcv.register_pair[0]);
                     try self.asmRegisterRegister(.{ ._, .add }, .rdx, tmp_regs[0]);
@@ -3953,7 +4022,7 @@ fn airMulWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
                     if (mat_lhs_mcv.isMemory()) try self.asmRegisterMemory(
                         .{ ._, .mov },
                         tmp_regs[0],
-                        mat_lhs_mcv.address().offset(8).deref().mem(.qword),
+                        try mat_lhs_mcv.address().offset(8).deref().mem(self, .qword),
                     ) else try self.asmRegisterRegister(
                         .{ ._, .mov },
                         tmp_regs[0],
@@ -3967,14 +4036,15 @@ fn airMulWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
                         tmp_regs[3].to8(),
                     );
                     try self.asmRegisterRegister(.{ ._, .@"or" }, tmp_regs[1].to8(), tmp_regs[2].to8());
-                    if (mat_rhs_mcv.isMemory())
-                        try self.asmRegisterMemory(.{ .i_, .mul }, tmp_regs[0], mat_rhs_mcv.mem(.qword))
-                    else
-                        try self.asmRegisterRegister(
-                            .{ .i_, .mul },
-                            tmp_regs[0],
-                            mat_rhs_mcv.register_pair[0],
-                        );
+                    if (mat_rhs_mcv.isMemory()) try self.asmRegisterMemory(
+                        .{ .i_, .mul },
+                        tmp_regs[0],
+                        try mat_rhs_mcv.mem(self, .qword),
+                    ) else try self.asmRegisterRegister(
+                        .{ .i_, .mul },
+                        tmp_regs[0],
+                        mat_rhs_mcv.register_pair[0],
+                    );
                     try self.asmSetccRegister(.o, tmp_regs[2].to8());
                     try self.asmRegisterRegister(.{ ._, .@"or" }, tmp_regs[1].to8(), tmp_regs[2].to8());
                     try self.asmRegisterRegister(.{ ._, .add }, .rdx, tmp_regs[0]);
@@ -4020,8 +4090,7 @@ fn airMulWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
                     self.eflags_inst = inst;
                     break :result .{ .register_overflow = .{ .reg = reg, .eflags = cc } };
                 } else {
-                    const frame_index =
-                        try self.allocFrameIndex(FrameAlloc.initType(tuple_ty, mod));
+                    const frame_index = try self.allocFrameIndex(FrameAlloc.initSpill(tuple_ty, mod));
                     try self.genSetFrameTruncatedOverflowCompare(tuple_ty, frame_index, partial_mcv, cc);
                     break :result .{ .load_frame = .{ .index = frame_index } };
                 },
@@ -4032,8 +4101,7 @@ fn airMulWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
                             src_ty.fmt(mod), dst_ty.fmt(mod),
                         });
 
-                    const frame_index =
-                        try self.allocFrameIndex(FrameAlloc.initType(tuple_ty, mod));
+                    const frame_index = try self.allocFrameIndex(FrameAlloc.initSpill(tuple_ty, mod));
                     if (dst_info.bits >= lhs_active_bits + rhs_active_bits) {
                         try self.genSetMem(
                             .{ .frame = frame_index },
@@ -4106,7 +4174,7 @@ fn genIntMulDivOpMir(self: *Self, tag: Mir.Inst.FixedTag, ty: Type, lhs: MCValue
         .register => |reg| try self.asmRegister(tag, registerAlias(reg, abi_size)),
         .memory, .indirect, .load_frame => try self.asmMemory(
             tag,
-            mat_rhs.mem(Memory.Size.fromSize(abi_size)),
+            try mat_rhs.mem(self, Memory.Size.fromSize(abi_size)),
         ),
         else => unreachable,
     }
@@ -4160,8 +4228,8 @@ fn genInlineIntDivFloor(self: *Self, ty: Type, lhs: MCValue, rhs: MCValue) !MCVa
     );
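+    // cmov has no 8-bit register form, so use at least word-sized aliases.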
     try self.asmCmovccRegisterRegister(
         .z,
-        registerAlias(divisor, abi_size),
-        registerAlias(.rdx, abi_size),
+        registerAlias(divisor, @max(abi_size, 2)),
+        registerAlias(.rdx, @max(abi_size, 2)),
     );
     try self.genBinOpMir(.{ ._, .add }, ty, .{ .register = divisor }, .{ .register = .rax });
     return MCValue{ .register = divisor };
@@ -4171,47 +4239,268 @@ fn airShlShrBinOp(self: *Self, inst: Air.Inst.Index) !void {
     const mod = self.bin_file.options.module.?;
     const bin_op = self.air.instructions.items(.data)[inst].bin_op;
 
-    try self.spillRegisters(&.{.rcx});
-
-    const tag = self.air.instructions.items(.tag)[inst];
-    try self.register_manager.getReg(.rcx, null);
-    const lhs = try self.resolveInst(bin_op.lhs);
-    const rhs = try self.resolveInst(bin_op.rhs);
+    const air_tags = self.air.instructions.items(.tag);
+    const tag = air_tags[inst];
     const lhs_ty = self.typeOf(bin_op.lhs);
     const rhs_ty = self.typeOf(bin_op.rhs);
+    const result: MCValue = result: {
+        switch (lhs_ty.zigTypeTag(mod)) {
+            .Int => {
+                try self.spillRegisters(&.{.rcx});
+                try self.register_manager.getReg(.rcx, null);
+                const lhs_mcv = try self.resolveInst(bin_op.lhs);
+                const rhs_mcv = try self.resolveInst(bin_op.rhs);
 
-    const dst_mcv = try self.genShiftBinOp(tag, inst, lhs, rhs, lhs_ty, rhs_ty);
-    switch (tag) {
-        .shr, .shr_exact, .shl_exact => {},
-        .shl => switch (dst_mcv) {
-            .register => |dst_reg| try self.truncateRegister(lhs_ty, dst_reg),
-            .register_pair => |dst_regs| try self.truncateRegister(lhs_ty, dst_regs[1]),
-            .load_frame => |frame_addr| {
-                const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
-                const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
-                defer self.register_manager.unlockReg(tmp_lock);
+                const dst_mcv = try self.genShiftBinOp(tag, inst, lhs_mcv, rhs_mcv, lhs_ty, rhs_ty);
+                switch (tag) {
+                    .shr, .shr_exact, .shl_exact => {},
+                    .shl => switch (dst_mcv) {
+                        .register => |dst_reg| try self.truncateRegister(lhs_ty, dst_reg),
+                        .register_pair => |dst_regs| try self.truncateRegister(lhs_ty, dst_regs[1]),
+                        .load_frame => |frame_addr| {
+                            const tmp_reg =
+                                try self.register_manager.allocReg(null, abi.RegisterClass.gp);
+                            const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+                            defer self.register_manager.unlockReg(tmp_lock);
 
-                const lhs_bits: u31 = @intCast(lhs_ty.bitSize(mod));
-                const tmp_ty = if (lhs_bits > 64) Type.usize else lhs_ty;
-                const off = frame_addr.off + lhs_bits / 64 * 8;
-                try self.genSetReg(
-                    tmp_reg,
-                    tmp_ty,
-                    .{ .load_frame = .{ .index = frame_addr.index, .off = off } },
-                );
-                try self.truncateRegister(lhs_ty, tmp_reg);
-                try self.genSetMem(
-                    .{ .frame = frame_addr.index },
-                    off,
-                    tmp_ty,
-                    .{ .register = tmp_reg },
-                );
+                            const lhs_bits: u31 = @intCast(lhs_ty.bitSize(mod));
+                            const tmp_ty = if (lhs_bits > 64) Type.usize else lhs_ty;
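+                            // Offset of the limb that holds the most significant bit.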
+                            const off = frame_addr.off + (lhs_bits - 1) / 64 * 8;
+                            try self.genSetReg(
+                                tmp_reg,
+                                tmp_ty,
+                                .{ .load_frame = .{ .index = frame_addr.index, .off = off } },
+                            );
+                            try self.truncateRegister(lhs_ty, tmp_reg);
+                            try self.genSetMem(
+                                .{ .frame = frame_addr.index },
+                                off,
+                                tmp_ty,
+                                .{ .register = tmp_reg },
+                            );
+                        },
+                        else => {},
+                    },
+                    else => unreachable,
+                }
+                break :result dst_mcv;
+            },
+            .Vector => switch (lhs_ty.childType(mod).zigTypeTag(mod)) {
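+                // Pick a packed shift (sra/srl/sll) by element width, vector length,
+                // signedness, and available SSE/AVX features.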
+                .Int => if (@as(?Mir.Inst.FixedTag, switch (lhs_ty.childType(mod).intInfo(mod).bits) {
+                    else => null,
+                    16 => switch (lhs_ty.vectorLen(mod)) {
+                        else => null,
+                        1...8 => switch (tag) {
+                            else => unreachable,
+                            .shr, .shr_exact => switch (lhs_ty.childType(mod).intInfo(mod).signedness) {
+                                .signed => if (self.hasFeature(.avx))
+                                    .{ .vp_w, .sra }
+                                else
+                                    .{ .p_w, .sra },
+                                .unsigned => if (self.hasFeature(.avx))
+                                    .{ .vp_w, .srl }
+                                else
+                                    .{ .p_w, .srl },
+                            },
+                            .shl, .shl_exact => if (self.hasFeature(.avx))
+                                .{ .vp_w, .sll }
+                            else
+                                .{ .p_w, .sll },
+                        },
+                        9...16 => switch (tag) {
+                            else => unreachable,
+                            .shr, .shr_exact => switch (lhs_ty.childType(mod).intInfo(mod).signedness) {
+                                .signed => if (self.hasFeature(.avx2)) .{ .vp_w, .sra } else null,
+                                .unsigned => if (self.hasFeature(.avx2)) .{ .vp_w, .srl } else null,
+                            },
+                            .shl, .shl_exact => if (self.hasFeature(.avx2)) .{ .vp_w, .sll } else null,
+                        },
+                    },
+                    32 => switch (lhs_ty.vectorLen(mod)) {
+                        else => null,
+                        1...4 => switch (tag) {
+                            else => unreachable,
+                            .shr, .shr_exact => switch (lhs_ty.childType(mod).intInfo(mod).signedness) {
+                                .signed => if (self.hasFeature(.avx))
+                                    .{ .vp_d, .sra }
+                                else
+                                    .{ .p_d, .sra },
+                                .unsigned => if (self.hasFeature(.avx))
+                                    .{ .vp_d, .srl }
+                                else
+                                    .{ .p_d, .srl },
+                            },
+                            .shl, .shl_exact => if (self.hasFeature(.avx))
+                                .{ .vp_d, .sll }
+                            else
+                                .{ .p_d, .sll },
+                        },
+                        5...8 => switch (tag) {
+                            else => unreachable,
+                            .shr, .shr_exact => switch (lhs_ty.childType(mod).intInfo(mod).signedness) {
+                                .signed => if (self.hasFeature(.avx2)) .{ .vp_d, .sra } else null,
+                                .unsigned => if (self.hasFeature(.avx2)) .{ .vp_d, .srl } else null,
+                            },
+                            .shl, .shl_exact => if (self.hasFeature(.avx2)) .{ .vp_d, .sll } else null,
+                        },
+                    },
+                    64 => switch (lhs_ty.vectorLen(mod)) {
+                        else => null,
+                        1...2 => switch (tag) {
+                            else => unreachable,
+                            .shr, .shr_exact => switch (lhs_ty.childType(mod).intInfo(mod).signedness) {
+                                .signed => if (self.hasFeature(.avx))
+                                    .{ .vp_q, .sra }
+                                else
+                                    .{ .p_q, .sra },
+                                .unsigned => if (self.hasFeature(.avx))
+                                    .{ .vp_q, .srl }
+                                else
+                                    .{ .p_q, .srl },
+                            },
+                            .shl, .shl_exact => if (self.hasFeature(.avx))
+                                .{ .vp_q, .sll }
+                            else
+                                .{ .p_q, .sll },
+                        },
+                        3...4 => switch (tag) {
+                            else => unreachable,
+                            .shr, .shr_exact => switch (lhs_ty.childType(mod).intInfo(mod).signedness) {
+                                .signed => if (self.hasFeature(.avx2)) .{ .vp_q, .sra } else null,
+                                .unsigned => if (self.hasFeature(.avx2)) .{ .vp_q, .srl } else null,
+                            },
+                            .shl, .shl_exact => if (self.hasFeature(.avx2)) .{ .vp_q, .sll } else null,
+                        },
+                    },
+                })) |mir_tag| if (try self.air.value(bin_op.rhs, mod)) |rhs_val| {
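+                    // A comptime-known splatted shift amount can be encoded as an immediate.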
+                    switch (mod.intern_pool.indexToKey(rhs_val.toIntern())) {
+                        .aggregate => |rhs_aggregate| switch (rhs_aggregate.storage) {
+                            .repeated_elem => |rhs_elem| {
+                                const abi_size: u32 = @intCast(lhs_ty.abiSize(mod));
+
+                                const lhs_mcv = try self.resolveInst(bin_op.lhs);
+                                const dst_reg, const lhs_reg = if (lhs_mcv.isRegister() and
+                                    self.reuseOperand(inst, bin_op.lhs, 0, lhs_mcv))
+                                    .{lhs_mcv.getReg().?} ** 2
+                                else if (lhs_mcv.isRegister() and self.hasFeature(.avx)) .{
+                                    try self.register_manager.allocReg(inst, abi.RegisterClass.sse),
+                                    lhs_mcv.getReg().?,
+                                } else .{(try self.copyToRegisterWithInstTracking(
+                                    inst,
+                                    lhs_ty,
+                                    lhs_mcv,
+                                )).register} ** 2;
+                                const reg_locks =
+                                    self.register_manager.lockRegs(2, .{ dst_reg, lhs_reg });
+                                defer for (reg_locks) |reg_lock| if (reg_lock) |lock|
+                                    self.register_manager.unlockReg(lock);
+
+                                const shift_imm =
+                                    Immediate.u(@intCast(rhs_elem.toValue().toUnsignedInt(mod)));
+                                if (self.hasFeature(.avx)) try self.asmRegisterRegisterImmediate(
+                                    mir_tag,
+                                    registerAlias(dst_reg, abi_size),
+                                    registerAlias(lhs_reg, abi_size),
+                                    shift_imm,
+                                ) else {
+                                    assert(dst_reg.id() == lhs_reg.id());
+                                    try self.asmRegisterImmediate(
+                                        mir_tag,
+                                        registerAlias(dst_reg, abi_size),
+                                        shift_imm,
+                                    );
+                                }
+                                break :result .{ .register = dst_reg };
+                            },
+                            else => {},
+                        },
+                        else => {},
+                    }
+                } else if (Air.refToIndex(bin_op.rhs)) |rhs_inst| switch (air_tags[rhs_inst]) {
+                    .splat => {
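+                        // Runtime splat: materialize the shift count in an xmm register
+                        // and use the register form of the packed shift.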
+                        const abi_size: u32 = @intCast(lhs_ty.abiSize(mod));
+
+                        const lhs_mcv = try self.resolveInst(bin_op.lhs);
+                        const dst_reg, const lhs_reg = if (lhs_mcv.isRegister() and
+                            self.reuseOperand(inst, bin_op.lhs, 0, lhs_mcv))
+                            .{lhs_mcv.getReg().?} ** 2
+                        else if (lhs_mcv.isRegister() and self.hasFeature(.avx)) .{
+                            try self.register_manager.allocReg(inst, abi.RegisterClass.sse),
+                            lhs_mcv.getReg().?,
+                        } else .{(try self.copyToRegisterWithInstTracking(
+                            inst,
+                            lhs_ty,
+                            lhs_mcv,
+                        )).register} ** 2;
+                        const reg_locks = self.register_manager.lockRegs(2, .{ dst_reg, lhs_reg });
+                        defer for (reg_locks) |reg_lock| if (reg_lock) |lock|
+                            self.register_manager.unlockReg(lock);
+
+                        const shift_reg =
+                            try self.copyToTmpRegister(rhs_ty, .{ .air_ref = bin_op.rhs });
+                        const shift_lock = self.register_manager.lockRegAssumeUnused(shift_reg);
+                        defer self.register_manager.unlockReg(shift_lock);
+
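+                        // Zero all but the low byte of the splatted count: packed shifts
+                        // take the count from the low 64 bits of the operand.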
+                        const mask_ty = try mod.vectorType(.{ .len = 16, .child = .u8_type });
+                        const mask_mcv = try self.genTypedValue(.{
+                            .ty = mask_ty,
+                            .val = (try mod.intern(.{ .aggregate = .{
+                                .ty = mask_ty.toIntern(),
+                                .storage = .{ .elems = &([1]InternPool.Index{
+                                    (try rhs_ty.childType(mod).maxIntScalar(mod, Type.u8)).toIntern(),
+                                } ++ [1]InternPool.Index{
+                                    (try mod.intValue(Type.u8, 0)).toIntern(),
+                                } ** 15) },
+                            } })).toValue(),
+                        });
+                        const mask_addr_reg =
+                            try self.copyToTmpRegister(Type.usize, mask_mcv.address());
+                        const mask_addr_lock = self.register_manager.lockRegAssumeUnused(mask_addr_reg);
+                        defer self.register_manager.unlockReg(mask_addr_lock);
+
+                        if (self.hasFeature(.avx)) {
+                            try self.asmRegisterRegisterMemory(
+                                .{ .vp_, .@"and" },
+                                shift_reg.to128(),
+                                shift_reg.to128(),
+                                .{
+                                    .base = .{ .reg = mask_addr_reg },
+                                    .mod = .{ .rm = .{ .size = .xword } },
+                                },
+                            );
+                            try self.asmRegisterRegisterRegister(
+                                mir_tag,
+                                registerAlias(dst_reg, abi_size),
+                                registerAlias(lhs_reg, abi_size),
+                                shift_reg.to128(),
+                            );
+                        } else {
+                            try self.asmRegisterMemory(
+                                .{ .p_, .@"and" },
+                                shift_reg.to128(),
+                                .{
+                                    .base = .{ .reg = mask_addr_reg },
+                                    .mod = .{ .rm = .{ .size = .xword } },
+                                },
+                            );
+                            assert(dst_reg.id() == lhs_reg.id());
+                            try self.asmRegisterRegister(
+                                mir_tag,
+                                registerAlias(dst_reg, abi_size),
+                                shift_reg.to128(),
+                            );
+                        }
+                        break :result .{ .register = dst_reg };
+                    },
+                    else => {},
+                },
+                else => {},
             },
             else => {},
-        },
-        else => unreachable,
-    }
-    return self.finishAir(inst, dst_mcv, .{ bin_op.lhs, bin_op.rhs, .none });
+        }
+        return self.fail("TODO implement airShlShrBinOp for {}", .{lhs_ty.fmt(mod)});
+    };
+    return self.finishAir(inst, result, .{ bin_op.lhs, bin_op.rhs, .none });
 }
 
 fn airShlSat(self: *Self, inst: Air.Inst.Index) !void {
@@ -4230,12 +4519,18 @@ fn airOptionalPayload(self: *Self, inst: Air.Inst.Index) !void {
 
         const opt_mcv = try self.resolveInst(ty_op.operand);
         if (self.reuseOperand(inst, ty_op.operand, 0, opt_mcv)) {
-            switch (opt_mcv) {
-                .register => |reg| try self.truncateRegister(pl_ty, reg),
-                .register_overflow => |ro| try self.truncateRegister(pl_ty, ro.reg),
+            const pl_mcv: MCValue = switch (opt_mcv) {
+                .register_overflow => |ro| pl: {
+                    self.eflags_inst = null; // actually stop tracking the overflow part
+                    break :pl .{ .register = ro.reg };
+                },
+                else => opt_mcv,
+            };
+            switch (pl_mcv) {
+                .register => |pl_reg| try self.truncateRegister(pl_ty, pl_reg),
                 else => {},
             }
-            break :result opt_mcv;
+            break :result pl_mcv;
         }
 
         const pl_mcv = try self.allocRegOrMem(inst, true);
@@ -4472,8 +4767,9 @@ fn genUnwrapErrUnionPayloadMir(
                 const eu_lock = self.register_manager.lockReg(reg);
                 defer if (eu_lock) |lock| self.register_manager.unlockReg(lock);
 
-                const result_mcv: MCValue = if (maybe_inst) |inst|
-                    try self.copyToRegisterWithInstTracking(inst, err_union_ty, err_union)
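+                // The error union is manipulated in a GP register; a payload that belongs
+                // in another register class is copied out below.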
+                const payload_in_gp = self.regClassForType(payload_ty).supersetOf(abi.RegisterClass.gp);
+                const result_mcv: MCValue = if (payload_in_gp and maybe_inst != null)
+                    try self.copyToRegisterWithInstTracking(maybe_inst.?, err_union_ty, err_union)
                 else
                     .{ .register = try self.copyToTmpRegister(err_union_ty, err_union) };
                 if (payload_off > 0) try self.genShiftBinOpMir(
@@ -4482,7 +4778,12 @@ fn genUnwrapErrUnionPayloadMir(
                     result_mcv,
                     .{ .immediate = @as(u6, @intCast(payload_off * 8)) },
                 ) else try self.truncateRegister(payload_ty, result_mcv.register);
-                break :result result_mcv;
+                break :result if (payload_in_gp)
+                    result_mcv
+                else if (maybe_inst) |inst|
+                    try self.copyToRegisterWithInstTracking(inst, payload_ty, result_mcv)
+                else
+                    .{ .register = try self.copyToTmpRegister(payload_ty, result_mcv) };
             },
             else => return self.fail("TODO implement genUnwrapErrUnionPayloadMir for {}", .{err_union}),
         }
@@ -4593,7 +4894,7 @@ fn airWrapErrUnionPayload(self: *Self, inst: Air.Inst.Index) !void {
     const result: MCValue = result: {
         if (!pl_ty.hasRuntimeBitsIgnoreComptime(mod)) break :result .{ .immediate = 0 };
 
-        const frame_index = try self.allocFrameIndex(FrameAlloc.initType(eu_ty, mod));
+        const frame_index = try self.allocFrameIndex(FrameAlloc.initSpill(eu_ty, mod));
         const pl_off: i32 = @intCast(errUnionPayloadOffset(pl_ty, mod));
         const err_off: i32 = @intCast(errUnionErrorOffset(pl_ty, mod));
         try self.genSetMem(.{ .frame = frame_index }, pl_off, pl_ty, operand);
@@ -4615,7 +4916,7 @@ fn airWrapErrUnionErr(self: *Self, inst: Air.Inst.Index) !void {
     const result: MCValue = result: {
         if (!pl_ty.hasRuntimeBitsIgnoreComptime(mod)) break :result try self.resolveInst(ty_op.operand);
 
-        const frame_index = try self.allocFrameIndex(FrameAlloc.initType(eu_ty, mod));
+        const frame_index = try self.allocFrameIndex(FrameAlloc.initSpill(eu_ty, mod));
         const pl_off: i32 = @intCast(errUnionPayloadOffset(pl_ty, mod));
         const err_off: i32 = @intCast(errUnionErrorOffset(pl_ty, mod));
         try self.genSetMem(.{ .frame = frame_index }, pl_off, pl_ty, .undef);
@@ -4770,14 +5071,19 @@ fn genSliceElemPtr(self: *Self, lhs: Air.Inst.Ref, rhs: Air.Inst.Ref) !MCValue {
 fn airSliceElemVal(self: *Self, inst: Air.Inst.Index) !void {
     const mod = self.bin_file.options.module.?;
     const bin_op = self.air.instructions.items(.data)[inst].bin_op;
-    const slice_ty = self.typeOf(bin_op.lhs);
 
-    const slice_ptr_field_type = slice_ty.slicePtrFieldType(mod);
-    const elem_ptr = try self.genSliceElemPtr(bin_op.lhs, bin_op.rhs);
-    const dst_mcv = try self.allocRegOrMem(inst, false);
-    try self.load(dst_mcv, slice_ptr_field_type, elem_ptr);
+    const result: MCValue = result: {
+        const elem_ty = self.typeOfIndex(inst);
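+        // Zero-bit elements have no runtime representation, so there is nothing to load.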
+        if (!elem_ty.hasRuntimeBitsIgnoreComptime(mod)) break :result .none;
 
-    return self.finishAir(inst, dst_mcv, .{ bin_op.lhs, bin_op.rhs, .none });
+        const slice_ty = self.typeOf(bin_op.lhs);
+        const slice_ptr_field_type = slice_ty.slicePtrFieldType(mod);
+        const elem_ptr = try self.genSliceElemPtr(bin_op.lhs, bin_op.rhs);
+        const dst_mcv = try self.allocRegOrMem(inst, false);
+        try self.load(dst_mcv, slice_ptr_field_type, elem_ptr);
+        break :result dst_mcv;
+    };
+    return self.finishAir(inst, result, .{ bin_op.lhs, bin_op.rhs, .none });
 }
 
 fn airSliceElemPtr(self: *Self, inst: Air.Inst.Index) !void {
@@ -4810,11 +5116,10 @@ fn airArrayElemVal(self: *Self, inst: Air.Inst.Index) !void {
     };
     defer if (index_lock) |lock| self.register_manager.unlockReg(lock);
 
-    const offset_reg = try self.elemOffset(index_ty, index, elem_abi_size);
-    const offset_reg_lock = self.register_manager.lockRegAssumeUnused(offset_reg);
-    defer self.register_manager.unlockReg(offset_reg_lock);
-
     const addr_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
+    const addr_lock = self.register_manager.lockRegAssumeUnused(addr_reg);
+    defer self.register_manager.unlockReg(addr_lock);
+
     switch (array) {
         .register => {
             const frame_index = try self.allocFrameIndex(FrameAlloc.initType(array_ty, mod));
@@ -4843,6 +5148,10 @@ fn airArrayElemVal(self: *Self, inst: Air.Inst.Index) !void {
         else => return self.fail("TODO implement array_elem_val when array is {}", .{array}),
     }
 
+    const offset_reg = try self.elemOffset(index_ty, index, elem_abi_size);
+    const offset_lock = self.register_manager.lockRegAssumeUnused(offset_reg);
+    defer self.register_manager.unlockReg(offset_lock);
+
     // TODO: we could allocate a register here, but we would need to account for the
     // addr register and potentially an offset register.
     try self.spillEflagsIfOccupied();
@@ -5093,7 +5402,7 @@ fn airClz(self: *Self, inst: Air.Inst.Index) !void {
                     .{ ._, .sub },
                     dst_ty,
                     dst_mcv,
-                    .{ .immediate = 8 + self.regExtraBits(src_ty) },
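+                    // A 32-bit lzcnt sees 32 - src_bits extra leading zeros; subtract them.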
+                    .{ .immediate = 32 - src_bits },
                 );
             } else if (src_bits <= 64) {
                 try self.genBinOpMir(.{ ._, .lzcnt }, src_ty, dst_mcv, mat_src_mcv);
@@ -5361,7 +5670,9 @@ fn airPopCount(self: *Self, inst: Air.Inst.Index) !void {
             mat_src_mcv
         else
             .{ .register = mat_src_mcv.register_pair[0] }, false);
-        try self.genPopCount(tmp_regs[1], Type.usize, if (mat_src_mcv.isMemory())
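+        // The upper limb holds only the remaining significant bits, so count it with a
+        // type of exactly that width.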
+        const src_info = src_ty.intInfo(mod);
+        const hi_ty = try mod.intType(src_info.signedness, (src_info.bits - 1) % 64 + 1);
+        try self.genPopCount(tmp_regs[1], hi_ty, if (mat_src_mcv.isMemory())
             mat_src_mcv.address().offset(8).deref()
         else
             .{ .register = mat_src_mcv.register_pair[1] }, false);
@@ -5383,9 +5694,13 @@ fn genPopCount(
     const src_abi_size: u32 = @intCast(src_ty.abiSize(mod));
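+    // popcnt has no 8-bit form: for byte-sized sources, zero-extend into dst and use the 32-bit form.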
     if (self.hasFeature(.popcnt)) return self.genBinOpMir(
         .{ ._, .popcnt },
-        if (src_abi_size > 1) src_ty else Type.u16,
+        if (src_abi_size > 1) src_ty else Type.u32,
         .{ .register = dst_reg },
-        src_mcv,
+        if (src_abi_size > 1) src_mcv else src: {
+            if (!dst_contains_src) try self.genSetReg(dst_reg, src_ty, src_mcv);
+            try self.truncateRegister(try src_ty.toUnsigned(mod), dst_reg);
+            break :src .{ .register = dst_reg };
+        },
     );
 
     const mask = @as(u64, math.maxInt(u64)) >> @intCast(64 - src_abi_size * 8);
@@ -5517,9 +5832,9 @@ fn genByteSwap(
                 try self.asmRegisterMemory(
                     .{ ._, .movbe },
                     dst_regs[0],
-                    src_mcv.address().offset(8).deref().mem(.qword),
+                    try src_mcv.address().offset(8).deref().mem(self, .qword),
                 );
-                try self.asmRegisterMemory(.{ ._, .movbe }, dst_regs[1], src_mcv.mem(.qword));
+                try self.asmRegisterMemory(.{ ._, .movbe }, dst_regs[1], try src_mcv.mem(self, .qword));
             } else for (dst_regs, src_mcv.register_pair) |dst_reg, src_reg| {
                 try self.asmRegisterRegister(.{ ._, .mov }, dst_reg.to64(), src_reg.to64());
                 try self.asmRegister(.{ ._, .bswap }, dst_reg.to64());
@@ -5762,7 +6077,7 @@ fn floatSign(self: *Self, inst: Air.Inst.Index, operand: Air.Inst.Ref, ty: Type)
             else => unreachable,
         } });
         const sign_mem: Memory = if (sign_mcv.isMemory())
-            sign_mcv.mem(Memory.Size.fromSize(abi_size))
+            try sign_mcv.mem(self, Memory.Size.fromSize(abi_size))
         else
             .{
                 .base = .{ .reg = try self.copyToTmpRegister(Type.usize, sign_mcv.address()) },
@@ -5945,7 +6260,7 @@ fn genRound(self: *Self, ty: Type, dst_reg: Register, src_mcv: MCValue, mode: Ro
             mir_tag,
             dst_alias,
             dst_alias,
-            src_mcv.mem(Memory.Size.fromSize(abi_size)),
+            try src_mcv.mem(self, Memory.Size.fromSize(abi_size)),
             Immediate.u(@as(u5, @bitCast(mode))),
         ) else try self.asmRegisterRegisterRegisterImmediate(
             mir_tag,
@@ -5960,7 +6275,7 @@ fn genRound(self: *Self, ty: Type, dst_reg: Register, src_mcv: MCValue, mode: Ro
         else => if (src_mcv.isMemory()) try self.asmRegisterMemoryImmediate(
             mir_tag,
             dst_alias,
-            src_mcv.mem(Memory.Size.fromSize(abi_size)),
+            try src_mcv.mem(self, Memory.Size.fromSize(abi_size)),
             Immediate.u(@as(u5, @bitCast(mode))),
         ) else try self.asmRegisterRegisterImmediate(
             mir_tag,
@@ -6000,7 +6315,7 @@ fn airAbs(self: *Self, inst: Air.Inst.Index) !void {
                         .memory, .indirect, .load_frame => try self.asmCmovccRegisterMemory(
                             .l,
                             registerAlias(dst_mcv.register, cmov_abi_size),
-                            src_mcv.mem(Memory.Size.fromSize(cmov_abi_size)),
+                            try src_mcv.mem(self, Memory.Size.fromSize(cmov_abi_size)),
                         ),
                         else => {
                             const val_reg = try self.copyToTmpRegister(ty, src_mcv);
@@ -6100,7 +6415,7 @@ fn airAbs(self: *Self, inst: Air.Inst.Index) !void {
         if (src_mcv.isMemory()) try self.asmRegisterMemory(
             mir_tag,
             dst_alias,
-            src_mcv.mem(self.memSize(ty)),
+            try src_mcv.mem(self, self.memSize(ty)),
         ) else try self.asmRegisterRegister(
             mir_tag,
             dst_alias,
@@ -6206,7 +6521,7 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void {
                             if (src_mcv.isMemory()) try self.asmRegisterMemory(
                                 .{ .v_ps, .cvtph2 },
                                 wide_reg,
-                                src_mcv.mem(Memory.Size.fromSize(
+                                try src_mcv.mem(self, Memory.Size.fromSize(
                                     @intCast(@divExact(wide_reg.bitSize(), 16)),
                                 )),
                             ) else try self.asmRegisterRegister(
@@ -6254,7 +6569,7 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void {
                 mir_tag,
                 dst_reg,
                 dst_reg,
-                src_mcv.mem(Memory.Size.fromSize(abi_size)),
+                try src_mcv.mem(self, Memory.Size.fromSize(abi_size)),
             ) else try self.asmRegisterRegisterRegister(
                 mir_tag,
                 dst_reg,
@@ -6267,7 +6582,7 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void {
             else => if (src_mcv.isMemory()) try self.asmRegisterMemory(
                 mir_tag,
                 dst_reg,
-                src_mcv.mem(Memory.Size.fromSize(abi_size)),
+                try src_mcv.mem(self, Memory.Size.fromSize(abi_size)),
             ) else try self.asmRegisterRegister(
                 mir_tag,
                 dst_reg,
@@ -6332,7 +6647,7 @@ fn reuseOperandAdvanced(
         return false;
 
     switch (mcv) {
-        .register, .register_pair => for (mcv.getRegs()) |reg| {
+        .register, .register_pair, .register_overflow => for (mcv.getRegs()) |reg| {
             // If it's in the registers table, need to associate the register(s) with the
             // new instruction.
             if (maybe_tracked_inst) |tracked_inst| {
@@ -6346,6 +6661,10 @@ fn reuseOperandAdvanced(
         .load_frame => |frame_addr| if (frame_addr.index.isNamed()) return false,
         else => return false,
     }
+    switch (mcv) {
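+        // Reusing a flags-based value hands ownership of the live eflags to the tracked instruction.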
+        .eflags, .register_overflow => self.eflags_inst = maybe_tracked_inst,
+        else => {},
+    }
 
     // Prevent the operand deaths processing code from deallocating it.
     self.liveness.clearOperandDeath(inst, op_index);
@@ -6363,11 +6682,36 @@ fn packedLoad(self: *Self, dst_mcv: MCValue, ptr_ty: Type, ptr_mcv: MCValue) Inn
     if (!val_ty.hasRuntimeBitsIgnoreComptime(mod)) return;
     const val_abi_size: u32 = @intCast(val_ty.abiSize(mod));
 
+    if (ptr_info.packed_offset.bit_offset % 8 == 0) {
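+        // Byte-aligned packed fields reduce to an ordinary load plus clearing any
+        // padding bits in the top limb.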
+        try self.load(
+            dst_mcv,
+            ptr_ty,
+            ptr_mcv.offset(@intCast(@divExact(ptr_info.packed_offset.bit_offset, 8))),
+        );
+        const val_bit_size: u32 = @intCast(val_ty.bitSize(mod));
+        if (val_abi_size * 8 > val_bit_size) {
+            if (dst_mcv.isRegister()) {
+                try self.truncateRegister(val_ty, dst_mcv.getReg().?);
+            } else {
+                const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
+                const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+                defer self.register_manager.unlockReg(tmp_lock);
+
+                const hi_mcv = dst_mcv.address().offset(@intCast(val_bit_size / 64 * 8)).deref();
+                try self.genSetReg(tmp_reg, Type.usize, hi_mcv);
+                try self.truncateRegister(val_ty, tmp_reg);
+                try self.genCopy(Type.usize, hi_mcv, .{ .register = tmp_reg });
+            }
+        }
+        return;
+    }
+
     if (val_abi_size > 8) return self.fail("TODO implement packed load of {}", .{val_ty.fmt(mod)});
 
     const limb_abi_size: u32 = @min(val_abi_size, 8);
     const limb_abi_bits = limb_abi_size * 8;
-    const val_byte_off: i32 = @intCast(ptr_info.packed_offset.bit_offset / limb_abi_bits * limb_abi_size);
+    const val_byte_off: i32 =
+        @intCast(ptr_info.packed_offset.bit_offset / limb_abi_bits * limb_abi_size);
     const val_bit_off = ptr_info.packed_offset.bit_offset % limb_abi_bits;
     const val_extra_bits = self.regExtraBits(val_ty);
 
@@ -6530,7 +6874,7 @@ fn packedStore(self: *Self, ptr_ty: Type, ptr_mcv: MCValue, src_mcv: MCValue) In
             .base = .{ .reg = ptr_reg },
             .mod = .{ .rm = .{
                 .size = Memory.Size.fromSize(limb_abi_size),
-                .disp = src_byte_off + limb_i * limb_abi_bits,
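+                // The displacement counts bytes, so scale by the limb size in bytes.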
+                .disp = src_byte_off + limb_i * limb_abi_size,
             } },
         };
 
@@ -6575,6 +6919,22 @@ fn packedStore(self: *Self, ptr_ty: Type, ptr_mcv: MCValue, src_mcv: MCValue) In
                 limb_mem,
                 registerAlias(tmp_reg, limb_abi_size),
             );
+        } else if (src_bit_size <= 128 and src_bit_off == 0) {
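+            // Byte-aligned sources up to 128 bits: mask each limb and OR it into memory.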
+            const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
+            const tmp_mcv = MCValue{ .register = tmp_reg };
+            const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+            defer self.register_manager.unlockReg(tmp_lock);
+
+            try self.genSetReg(tmp_reg, limb_ty, switch (limb_i) {
+                0 => src_mcv,
+                else => src_mcv.address().offset(limb_i * limb_abi_size).deref(),
+            });
+            try self.genBinOpMir(.{ ._, .@"and" }, limb_ty, tmp_mcv, .{ .immediate = part_mask });
+            try self.asmMemoryRegister(
+                .{ ._, .@"or" },
+                limb_mem,
+                registerAlias(tmp_reg, limb_abi_size),
+            );
         } else return self.fail("TODO: implement packed store of {}", .{src_ty.fmt(mod)});
     }
 }
@@ -6808,17 +7168,17 @@ fn airStructFieldVal(self: *Self, inst: Air.Inst.Index) !void {
             .register_overflow => |ro| {
                 switch (index) {
                     // Get wrapped value for overflow operation.
-                    0 => break :result if (self.liveness.operandDies(inst, 0))
-                        .{ .register = ro.reg }
-                    else
-                        try self.copyToRegisterWithInstTracking(
-                            inst,
-                            Type.usize,
-                            .{ .register = ro.reg },
-                        ),
+                    0 => if (self.reuseOperand(inst, extra.struct_operand, 0, src_mcv)) {
+                        self.eflags_inst = null; // actually stop tracking the overflow part
+                        break :result .{ .register = ro.reg };
+                    } else break :result try self.copyToRegisterWithInstTracking(
+                        inst,
+                        Type.usize,
+                        .{ .register = ro.reg },
+                    ),
                     // Get overflow bit.
-                    1 => if (self.liveness.operandDies(inst, 0)) {
-                        self.eflags_inst = inst;
+                    1 => if (self.reuseOperandAdvanced(inst, extra.struct_operand, 0, src_mcv, null)) {
+                        self.eflags_inst = inst; // actually keep tracking the overflow part
                         break :result .{ .eflags = ro.eflags };
                     } else {
                         const dst_reg = try self.register_manager.allocReg(inst, abi.RegisterClass.gp);
@@ -6833,11 +7193,12 @@ fn airStructFieldVal(self: *Self, inst: Air.Inst.Index) !void {
                 if (field_off % 8 == 0) {
                     const off_mcv =
                         src_mcv.address().offset(@intCast(@divExact(field_off, 8))).deref();
+                    const field_bit_size = field_ty.bitSize(mod);
 
                     if (field_abi_size <= 8) {
                         const int_ty = try mod.intType(
                             if (field_ty.isAbiInt(mod)) field_ty.intInfo(mod).signedness else .unsigned,
-                            @intCast(field_ty.bitSize(mod)),
+                            @intCast(field_bit_size),
                         );
 
                         const dst_reg = try self.register_manager.allocReg(
@@ -6856,10 +7217,24 @@ fn airStructFieldVal(self: *Self, inst: Air.Inst.Index) !void {
                             try self.copyToRegisterWithInstTracking(inst, field_ty, dst_mcv);
                     }
 
-                    if (self.reuseOperand(inst, operand, 0, src_mcv)) break :result off_mcv;
-
-                    const dst_mcv = try self.allocRegOrMem(inst, true);
-                    try self.genCopy(field_ty, dst_mcv, off_mcv);
+                    const dst_mcv = if (self.reuseOperand(inst, operand, 0, src_mcv))
+                        off_mcv
+                    else dst: {
+                        const dst_mcv = try self.allocRegOrMem(inst, true);
+                        try self.genCopy(field_ty, dst_mcv, off_mcv);
+                        break :dst dst_mcv;
+                    };
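+                    // Clear padding bits above the field's bit width in the in-memory copy.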
+                    if (field_abi_size * 8 > field_bit_size and dst_mcv.isMemory()) {
+                        const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
+                        const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+                        defer self.register_manager.unlockReg(tmp_lock);
+
+                        const hi_mcv =
+                            dst_mcv.address().offset(@intCast(field_bit_size / 64 * 8)).deref();
+                        try self.genSetReg(tmp_reg, Type.usize, hi_mcv);
+                        try self.truncateRegister(field_ty, tmp_reg);
+                        try self.genCopy(Type.usize, hi_mcv, .{ .register = tmp_reg });
+                    }
                     break :result dst_mcv;
                 }
 
@@ -7013,7 +7388,25 @@ fn genUnOp(self: *Self, maybe_inst: ?Air.Inst.Index, tag: Air.Inst.Tag, src_air:
                 } else try self.genUnOpMir(.{ ._, .not }, limb_ty, limb_mcv);
             }
         },
-        .neg => try self.genUnOpMir(.{ ._, .neg }, src_ty, dst_mcv),
+        .neg => {
+            try self.genUnOpMir(.{ ._, .neg }, src_ty, dst_mcv);
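+            // neg can set bits beyond the integer's width; truncate the result to clear them.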
+            const abi_size: u16 = @intCast(src_ty.abiSize(mod));
+            const bit_size = src_ty.intInfo(mod).bits;
+            if (abi_size * 8 > bit_size) {
+                if (dst_mcv.isRegister()) {
+                    try self.truncateRegister(src_ty, dst_mcv.getReg().?);
+                } else {
+                    const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
+                    const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+                    defer self.register_manager.unlockReg(tmp_lock);
+
+                    const hi_mcv = dst_mcv.address().offset(@intCast(bit_size / 64 * 8)).deref();
+                    try self.genSetReg(tmp_reg, Type.usize, hi_mcv);
+                    try self.truncateRegister(src_ty, tmp_reg);
+                    try self.genCopy(Type.usize, hi_mcv, .{ .register = tmp_reg });
+                }
+            }
+        },
         else => unreachable,
     }
     return dst_mcv;
@@ -7054,7 +7447,7 @@ fn genUnOpMir(self: *Self, mir_tag: Mir.Inst.FixedTag, dst_ty: Type, dst_mcv: MC
         },
         .indirect, .load_frame => try self.asmMemory(
             mir_tag,
-            dst_mcv.mem(Memory.Size.fromSize(abi_size)),
+            try dst_mcv.mem(self, Memory.Size.fromSize(abi_size)),
         ),
     }
 }
@@ -7552,27 +7945,27 @@ fn genMulDivBinOp(
         defer self.register_manager.unlockReg(tmp_lock);
 
         if (mat_lhs_mcv.isMemory())
-            try self.asmRegisterMemory(.{ ._, .mov }, .rax, mat_lhs_mcv.mem(.qword))
+            try self.asmRegisterMemory(.{ ._, .mov }, .rax, try mat_lhs_mcv.mem(self, .qword))
         else
             try self.asmRegisterRegister(.{ ._, .mov }, .rax, mat_lhs_mcv.register_pair[0]);
         if (mat_rhs_mcv.isMemory()) try self.asmRegisterMemory(
             .{ ._, .mov },
             tmp_reg,
-            mat_rhs_mcv.address().offset(8).deref().mem(.qword),
+            try mat_rhs_mcv.address().offset(8).deref().mem(self, .qword),
         ) else try self.asmRegisterRegister(.{ ._, .mov }, tmp_reg, mat_rhs_mcv.register_pair[1]);
         try self.asmRegisterRegister(.{ .i_, .mul }, tmp_reg, .rax);
         if (mat_rhs_mcv.isMemory())
-            try self.asmMemory(.{ ._, .mul }, mat_rhs_mcv.mem(.qword))
+            try self.asmMemory(.{ ._, .mul }, try mat_rhs_mcv.mem(self, .qword))
         else
             try self.asmRegister(.{ ._, .mul }, mat_rhs_mcv.register_pair[0]);
         try self.asmRegisterRegister(.{ ._, .add }, .rdx, tmp_reg);
         if (mat_lhs_mcv.isMemory()) try self.asmRegisterMemory(
             .{ ._, .mov },
             tmp_reg,
-            mat_lhs_mcv.address().offset(8).deref().mem(.qword),
+            try mat_lhs_mcv.address().offset(8).deref().mem(self, .qword),
         ) else try self.asmRegisterRegister(.{ ._, .mov }, tmp_reg, mat_lhs_mcv.register_pair[1]);
         if (mat_rhs_mcv.isMemory())
-            try self.asmRegisterMemory(.{ .i_, .mul }, tmp_reg, mat_rhs_mcv.mem(.qword))
+            try self.asmRegisterMemory(.{ .i_, .mul }, tmp_reg, try mat_rhs_mcv.mem(self, .qword))
         else
             try self.asmRegisterRegister(.{ .i_, .mul }, tmp_reg, mat_rhs_mcv.register_pair[0]);
         try self.asmRegisterRegister(.{ ._, .add }, .rdx, tmp_reg);
@@ -7833,7 +8226,7 @@ fn genBinOp(
                                 .{ .vp_w, .insr },
                                 dst_reg,
                                 dst_reg,
-                                rhs_mcv.mem(.word),
+                                try rhs_mcv.mem(self, .word),
                                 Immediate.u(1),
                             ) else try self.asmRegisterRegisterRegister(
                                 .{ .vp_, .unpcklwd },
@@ -7858,7 +8251,7 @@ fn genBinOp(
                                 mir_tag,
                                 dst_reg,
                                 dst_reg,
-                                src_mcv.mem(Memory.Size.fromBitSize(float_bits)),
+                                try src_mcv.mem(self, Memory.Size.fromBitSize(float_bits)),
                             ) else try self.asmRegisterRegisterRegister(
                                 mir_tag,
                                 dst_reg,
@@ -7877,7 +8270,7 @@ fn genBinOp(
                             if (src_mcv.isMemory()) try self.asmRegisterMemory(
                                 mir_tag,
                                 dst_reg,
-                                src_mcv.mem(Memory.Size.fromBitSize(float_bits)),
+                                try src_mcv.mem(self, Memory.Size.fromBitSize(float_bits)),
                             ) else try self.asmRegisterRegister(
                                 mir_tag,
                                 dst_reg,
@@ -7919,12 +8312,18 @@ fn genBinOp(
         };
     }
 
-    if ((lhs_ty.scalarType(mod).isRuntimeFloat() and
+    const sse_op = switch (lhs_ty.zigTypeTag(mod)) {
+        else => false,
+        .Float => true,
+        .Vector => switch (lhs_ty.childType(mod).toIntern()) {
+            .bool_type => false,
+            else => true,
+        },
+    };
+    if (sse_op and ((lhs_ty.scalarType(mod).isRuntimeFloat() and
         lhs_ty.scalarType(mod).floatBits(self.target.*) == 80) or
-        lhs_ty.abiSize(mod) > @as(u6, if (self.hasFeature(.avx)) 32 else 16))
-        return self.fail("TODO implement genBinOp for {s} {}", .{
-            @tagName(air_tag), lhs_ty.fmt(mod),
-        });
+        lhs_ty.abiSize(mod) > @as(u6, if (self.hasFeature(.avx)) 32 else 16)))
+        return self.fail("TODO implement genBinOp for {s} {}", .{ @tagName(air_tag), lhs_ty.fmt(mod) });
 
     const maybe_mask_reg = switch (air_tag) {
         else => null,
@@ -7941,10 +8340,16 @@ fn genBinOp(
         if (maybe_mask_reg) |mask_reg| self.register_manager.lockRegAssumeUnused(mask_reg) else null;
     defer if (mask_lock) |lock| self.register_manager.unlockReg(lock);
 
-    const ordered_air = if (lhs_ty.isVector(mod) and lhs_ty.childType(mod).isAbiInt(mod) and
-        switch (air_tag) {
-        .cmp_lt, .cmp_gte => true,
-        else => false,
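+    // Swap the operands when the hardware only provides the opposite comparison
+    // direction (pcmpgt for ints, cmplt/cmple predicates for floats).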
+    const ordered_air = if (lhs_ty.isVector(mod) and switch (lhs_ty.childType(mod).zigTypeTag(mod)) {
+        .Int => switch (air_tag) {
+            .cmp_lt, .cmp_gte => true,
+            else => false,
+        },
+        .Float => switch (air_tag) {
+            .cmp_gte, .cmp_gt => true,
+            else => false,
+        },
+        else => unreachable,
     }) .{ .lhs = rhs_air, .rhs = lhs_air } else .{ .lhs = lhs_air, .rhs = rhs_air };
 
     const lhs_mcv = try self.resolveInst(ordered_air.lhs);
@@ -7971,14 +8376,12 @@ fn genBinOp(
         .xor,
         .min,
         .max,
+        .cmp_eq,
+        .cmp_neq,
         => true,
 
         else => false,
     };
-    const vec_op = switch (lhs_ty.zigTypeTag(mod)) {
-        else => false,
-        .Float, .Vector => true,
-    };
 
     const lhs_locks: [2]?RegisterLock = switch (lhs_mcv) {
         .register => |lhs_reg| .{ self.register_manager.lockRegAssumeUnused(lhs_reg), null },
@@ -8000,23 +8403,23 @@ fn genBinOp(
     var flipped = false;
     var copied_to_dst = true;
     const dst_mcv: MCValue = dst: {
+        const tracked_inst = switch (air_tag) {
+            else => maybe_inst,
+            .cmp_lt, .cmp_lte, .cmp_eq, .cmp_gte, .cmp_gt, .cmp_neq => null,
+        };
         if (maybe_inst) |inst| {
-            const tracked_inst = switch (air_tag) {
-                else => inst,
-                .cmp_lt, .cmp_lte, .cmp_eq, .cmp_gte, .cmp_gt, .cmp_neq => null,
-            };
-            if ((!vec_op or lhs_mcv.isRegister()) and
+            if ((!sse_op or lhs_mcv.isRegister()) and
                 self.reuseOperandAdvanced(inst, ordered_air.lhs, 0, lhs_mcv, tracked_inst))
                 break :dst lhs_mcv;
-            if (is_commutative and (!vec_op or rhs_mcv.isRegister()) and
+            if (is_commutative and (!sse_op or rhs_mcv.isRegister()) and
                 self.reuseOperandAdvanced(inst, ordered_air.rhs, 1, rhs_mcv, tracked_inst))
             {
                 flipped = true;
                 break :dst rhs_mcv;
             }
         }
-        const dst_mcv = try self.allocRegOrMemAdvanced(lhs_ty, maybe_inst, true);
-        if (vec_op and lhs_mcv.isRegister() and self.hasFeature(.avx))
+        const dst_mcv = try self.allocRegOrMemAdvanced(lhs_ty, tracked_inst, true);
+        if (sse_op and lhs_mcv.isRegister() and self.hasFeature(.avx))
             copied_to_dst = false
         else
             try self.genCopy(lhs_ty, dst_mcv, lhs_mcv);
@@ -8046,7 +8449,7 @@ fn genBinOp(
     };
     defer for (src_locks) |src_lock| if (src_lock) |lock| self.register_manager.unlockReg(lock);
 
-    if (!vec_op) {
+    if (!sse_op) {
         switch (air_tag) {
             .add,
             .add_wrap,
@@ -8130,17 +8533,25 @@ fn genBinOp(
 
                     try self.asmRegisterRegister(.{ ._, .mov }, tmp_reg, dst_regs[1]);
                     if (src_mcv.isMemory()) {
-                        try self.asmRegisterMemory(.{ ._, .cmp }, dst_regs[0], src_mcv.mem(.qword));
+                        try self.asmRegisterMemory(
+                            .{ ._, .cmp },
+                            dst_regs[0],
+                            try src_mcv.mem(self, .qword),
+                        );
                         try self.asmRegisterMemory(
                             .{ ._, .sbb },
                             tmp_reg,
-                            src_mcv.address().offset(8).deref().mem(.qword),
+                            try src_mcv.address().offset(8).deref().mem(self, .qword),
+                        );
+                        try self.asmCmovccRegisterMemory(
+                            cc,
+                            dst_regs[0],
+                            try src_mcv.mem(self, .qword),
                         );
-                        try self.asmCmovccRegisterMemory(cc, dst_regs[0], src_mcv.mem(.qword));
                         try self.asmCmovccRegisterMemory(
                             cc,
                             dst_regs[1],
-                            src_mcv.address().offset(8).deref().mem(.qword),
+                            try src_mcv.address().offset(8).deref().mem(self, .qword),
                         );
                     } else {
                         try self.asmRegisterRegister(
@@ -8292,7 +8703,7 @@ fn genBinOp(
                     .{ .vp_w, .insr },
                     dst_reg,
                     dst_reg,
-                    src_mcv.mem(.word),
+                    try src_mcv.mem(self, .word),
                     Immediate.u(1),
                 ) else try self.asmRegisterRegisterRegister(
                     .{ .vp_, .unpcklwd },
@@ -8738,7 +9149,7 @@ fn genBinOp(
                                 .{ .vp_w, .insr },
                                 dst_reg,
                                 dst_reg,
-                                src_mcv.mem(.word),
+                                try src_mcv.mem(self, .word),
                                 Immediate.u(1),
                             ) else try self.asmRegisterRegisterRegister(
                                 .{ .vp_, .unpcklwd },
@@ -8784,7 +9195,7 @@ fn genBinOp(
                             if (src_mcv.isMemory()) try self.asmRegisterMemoryImmediate(
                                 .{ .vp_d, .insr },
                                 dst_reg,
-                                src_mcv.mem(.dword),
+                                try src_mcv.mem(self, .dword),
                                 Immediate.u(1),
                             ) else try self.asmRegisterRegisterRegister(
                                 .{ .v_ps, .unpckl },
@@ -8836,7 +9247,7 @@ fn genBinOp(
                             if (src_mcv.isMemory()) try self.asmRegisterMemory(
                                 .{ .v_ps, .cvtph2 },
                                 tmp_reg,
-                                src_mcv.mem(.qword),
+                                try src_mcv.mem(self, .qword),
                             ) else try self.asmRegisterRegister(
                                 .{ .v_ps, .cvtph2 },
                                 tmp_reg,
@@ -8879,7 +9290,7 @@ fn genBinOp(
                             if (src_mcv.isMemory()) try self.asmRegisterMemory(
                                 .{ .v_ps, .cvtph2 },
                                 tmp_reg,
-                                src_mcv.mem(.xword),
+                                try src_mcv.mem(self, .xword),
                             ) else try self.asmRegisterRegister(
                                 .{ .v_ps, .cvtph2 },
                                 tmp_reg,
@@ -8925,6 +9336,13 @@ fn genBinOp(
                         => if (self.hasFeature(.avx)) .{ .v_ss, .div } else .{ ._ss, .div },
                         .max => if (self.hasFeature(.avx)) .{ .v_ss, .max } else .{ ._ss, .max },
                         .min => if (self.hasFeature(.avx)) .{ .v_ss, .min } else .{ ._ss, .min },
+                        .cmp_lt,
+                        .cmp_lte,
+                        .cmp_eq,
+                        .cmp_gte,
+                        .cmp_gt,
+                        .cmp_neq,
+                        => if (self.hasFeature(.avx)) .{ .v_ss, .cmp } else .{ ._ss, .cmp },
                         else => unreachable,
                     },
                     2...4 => switch (air_tag) {
@@ -8938,7 +9356,14 @@ fn genBinOp(
                         => if (self.hasFeature(.avx)) .{ .v_ps, .div } else .{ ._ps, .div },
                         .max => if (self.hasFeature(.avx)) .{ .v_ps, .max } else .{ ._ps, .max },
                         .min => if (self.hasFeature(.avx)) .{ .v_ps, .min } else .{ ._ps, .min },
-                        else => unreachable,
+                        .cmp_lt,
+                        .cmp_lte,
+                        .cmp_eq,
+                        .cmp_gte,
+                        .cmp_gt,
+                        .cmp_neq,
+                        => if (self.hasFeature(.avx)) .{ .v_ps, .cmp } else .{ ._ps, .cmp },
+                        else => unreachable,
                     },
                     5...8 => if (self.hasFeature(.avx)) switch (air_tag) {
                         .add => .{ .v_ps, .add },
@@ -8947,6 +9372,7 @@ fn genBinOp(
                         .div_float, .div_trunc, .div_floor, .div_exact => .{ .v_ps, .div },
                         .max => .{ .v_ps, .max },
                         .min => .{ .v_ps, .min },
+                        .cmp_lt, .cmp_lte, .cmp_eq, .cmp_gte, .cmp_gt, .cmp_neq => .{ .v_ps, .cmp },
                         else => unreachable,
                     } else null,
                     else => null,
@@ -8963,6 +9389,13 @@ fn genBinOp(
                         => if (self.hasFeature(.avx)) .{ .v_sd, .div } else .{ ._sd, .div },
                         .max => if (self.hasFeature(.avx)) .{ .v_sd, .max } else .{ ._sd, .max },
                         .min => if (self.hasFeature(.avx)) .{ .v_sd, .min } else .{ ._sd, .min },
+                        .cmp_lt,
+                        .cmp_lte,
+                        .cmp_eq,
+                        .cmp_gte,
+                        .cmp_gt,
+                        .cmp_neq,
+                        => if (self.hasFeature(.avx)) .{ .v_sd, .cmp } else .{ ._sd, .cmp },
                         else => unreachable,
                     },
                     2 => switch (air_tag) {
@@ -8976,6 +9409,13 @@ fn genBinOp(
                         => if (self.hasFeature(.avx)) .{ .v_pd, .div } else .{ ._pd, .div },
                         .max => if (self.hasFeature(.avx)) .{ .v_pd, .max } else .{ ._pd, .max },
                         .min => if (self.hasFeature(.avx)) .{ .v_pd, .min } else .{ ._pd, .min },
+                        .cmp_lt,
+                        .cmp_lte,
+                        .cmp_eq,
+                        .cmp_gte,
+                        .cmp_gt,
+                        .cmp_neq,
+                        => if (self.hasFeature(.avx)) .{ .v_pd, .cmp } else .{ ._pd, .cmp },
                         else => unreachable,
                     },
                     3...4 => if (self.hasFeature(.avx)) switch (air_tag) {
@@ -8984,6 +9424,7 @@ fn genBinOp(
                         .mul => .{ .v_pd, .mul },
                         .div_float, .div_trunc, .div_floor, .div_exact => .{ .v_pd, .div },
                         .max => .{ .v_pd, .max },
                         .min => .{ .v_pd, .min },
+                        .cmp_lt, .cmp_lte, .cmp_eq, .cmp_gte, .cmp_gt, .cmp_neq => .{ .v_pd, .cmp },
                         else => unreachable,
                     } else null,
@@ -9004,43 +9445,96 @@ fn genBinOp(
     const lhs_copy_lock = if (lhs_copy_reg) |reg| self.register_manager.lockReg(reg) else null;
     defer if (lhs_copy_lock) |lock| self.register_manager.unlockReg(lock);
 
-    if (self.hasFeature(.avx)) {
-        const lhs_reg =
-            if (copied_to_dst) dst_reg else registerAlias(lhs_mcv.getReg().?, abi_size);
-        if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory(
-            mir_tag,
-            dst_reg,
-            lhs_reg,
-            src_mcv.mem(switch (lhs_ty.zigTypeTag(mod)) {
-                else => Memory.Size.fromSize(abi_size),
-                .Vector => Memory.Size.fromBitSize(dst_reg.bitSize()),
-            }),
-        ) else try self.asmRegisterRegisterRegister(
-            mir_tag,
-            dst_reg,
-            lhs_reg,
-            registerAlias(if (src_mcv.isRegister())
-                src_mcv.getReg().?
-            else
-                try self.copyToTmpRegister(rhs_ty, src_mcv), abi_size),
-        );
-    } else {
-        assert(copied_to_dst);
-        if (src_mcv.isMemory()) try self.asmRegisterMemory(
-            mir_tag,
-            dst_reg,
-            src_mcv.mem(switch (lhs_ty.zigTypeTag(mod)) {
-                else => Memory.Size.fromSize(abi_size),
-                .Vector => Memory.Size.fromBitSize(dst_reg.bitSize()),
-            }),
-        ) else try self.asmRegisterRegister(
-            mir_tag,
-            dst_reg,
-            registerAlias(if (src_mcv.isRegister())
-                src_mcv.getReg().?
-            else
-                try self.copyToTmpRegister(rhs_ty, src_mcv), abi_size),
-        );
+    switch (mir_tag[1]) {
+        else => if (self.hasFeature(.avx)) {
+            const lhs_reg =
+                if (copied_to_dst) dst_reg else registerAlias(lhs_mcv.getReg().?, abi_size);
+            if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory(
+                mir_tag,
+                dst_reg,
+                lhs_reg,
+                try src_mcv.mem(self, switch (lhs_ty.zigTypeTag(mod)) {
+                    else => Memory.Size.fromSize(abi_size),
+                    .Vector => Memory.Size.fromBitSize(dst_reg.bitSize()),
+                }),
+            ) else try self.asmRegisterRegisterRegister(
+                mir_tag,
+                dst_reg,
+                lhs_reg,
+                registerAlias(if (src_mcv.isRegister())
+                    src_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(rhs_ty, src_mcv), abi_size),
+            );
+        } else {
+            assert(copied_to_dst);
+            if (src_mcv.isMemory()) try self.asmRegisterMemory(
+                mir_tag,
+                dst_reg,
+                try src_mcv.mem(self, switch (lhs_ty.zigTypeTag(mod)) {
+                    else => Memory.Size.fromSize(abi_size),
+                    .Vector => Memory.Size.fromBitSize(dst_reg.bitSize()),
+                }),
+            ) else try self.asmRegisterRegister(
+                mir_tag,
+                dst_reg,
+                registerAlias(if (src_mcv.isRegister())
+                    src_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(rhs_ty, src_mcv), abi_size),
+            );
+        },
+        .cmp => {
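+            // cmp{ss,ps,sd,pd} select their predicate via imm8:
+            // 0 = EQ, 1 = LT, 2 = LE, 4 = NEQ. gt/gte reuse the lt/lte
+            // predicates, which assumes those operands were commuted earlier.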
+            const imm = Immediate.u(switch (air_tag) {
+                .cmp_eq => 0,
+                .cmp_lt, .cmp_gt => 1,
+                .cmp_lte, .cmp_gte => 2,
+                .cmp_neq => 4,
+                else => unreachable,
+            });
+            if (self.hasFeature(.avx)) {
+                const lhs_reg =
+                    if (copied_to_dst) dst_reg else registerAlias(lhs_mcv.getReg().?, abi_size);
+                if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
+                    mir_tag,
+                    dst_reg,
+                    lhs_reg,
+                    try src_mcv.mem(self, switch (lhs_ty.zigTypeTag(mod)) {
+                        else => Memory.Size.fromSize(abi_size),
+                        .Vector => Memory.Size.fromBitSize(dst_reg.bitSize()),
+                    }),
+                    imm,
+                ) else try self.asmRegisterRegisterRegisterImmediate(
+                    mir_tag,
+                    dst_reg,
+                    lhs_reg,
+                    registerAlias(if (src_mcv.isRegister())
+                        src_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(rhs_ty, src_mcv), abi_size),
+                    imm,
+                );
+            } else {
+                assert(copied_to_dst);
+                if (src_mcv.isMemory()) try self.asmRegisterMemoryImmediate(
+                    mir_tag,
+                    dst_reg,
+                    try src_mcv.mem(self, switch (lhs_ty.zigTypeTag(mod)) {
+                        else => Memory.Size.fromSize(abi_size),
+                        .Vector => Memory.Size.fromBitSize(dst_reg.bitSize()),
+                    }),
+                    imm,
+                ) else try self.asmRegisterRegisterImmediate(
+                    mir_tag,
+                    dst_reg,
+                    registerAlias(if (src_mcv.isRegister())
+                        src_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(rhs_ty, src_mcv), abi_size),
+                    imm,
+                );
+            }
+        },
     }
 
     switch (air_tag) {
@@ -9281,48 +9775,46 @@ fn genBinOp(
                 );
             }
         },
-        .cmp_lt,
-        .cmp_lte,
-        .cmp_eq,
-        .cmp_gte,
-        .cmp_gt,
-        .cmp_neq,
-        => {
-            switch (air_tag) {
-                .cmp_lt,
-                .cmp_eq,
-                .cmp_gt,
-                => {},
-                .cmp_lte,
-                .cmp_gte,
-                .cmp_neq,
-                => {
-                    const unsigned_ty = try lhs_ty.toUnsigned(mod);
-                    const not_mcv = try self.genTypedValue(.{
-                        .ty = lhs_ty,
-                        .val = try unsigned_ty.maxInt(mod, unsigned_ty),
-                    });
-                    const not_mem: Memory = if (not_mcv.isMemory())
-                        not_mcv.mem(Memory.Size.fromSize(abi_size))
-                    else
-                        .{ .base = .{
-                            .reg = try self.copyToTmpRegister(Type.usize, not_mcv.address()),
-                        }, .mod = .{ .rm = .{ .size = Memory.Size.fromSize(abi_size) } } };
-                    switch (mir_tag[0]) {
-                        .vp_b, .vp_d, .vp_q, .vp_w => try self.asmRegisterRegisterMemory(
-                            .{ .vp_, .xor },
-                            dst_reg,
-                            dst_reg,
-                            not_mem,
-                        ),
-                        .p_b, .p_d, .p_q, .p_w => try self.asmRegisterMemory(
-                            .{ .p_, .xor },
-                            dst_reg,
-                            not_mem,
-                        ),
-                        else => unreachable,
-                    }
+        .cmp_lt, .cmp_lte, .cmp_eq, .cmp_gte, .cmp_gt, .cmp_neq => {
+            switch (lhs_ty.childType(mod).zigTypeTag(mod)) {
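+                // Integer vectors only get eq/gt/lt compares, so lte/gte/neq
+                // are fixed up below with an xor against all-ones; float
+                // compares already encoded their predicate in the cmp imm8.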
+                .Int => switch (air_tag) {
+                    .cmp_lt,
+                    .cmp_eq,
+                    .cmp_gt,
+                    => {},
+                    .cmp_lte,
+                    .cmp_gte,
+                    .cmp_neq,
+                    => {
+                        const unsigned_ty = try lhs_ty.toUnsigned(mod);
+                        const not_mcv = try self.genTypedValue(.{
+                            .ty = lhs_ty,
+                            .val = try unsigned_ty.maxInt(mod, unsigned_ty),
+                        });
+                        const not_mem: Memory = if (not_mcv.isMemory())
+                            try not_mcv.mem(self, Memory.Size.fromSize(abi_size))
+                        else
+                            .{ .base = .{
+                                .reg = try self.copyToTmpRegister(Type.usize, not_mcv.address()),
+                            }, .mod = .{ .rm = .{ .size = Memory.Size.fromSize(abi_size) } } };
+                        switch (mir_tag[0]) {
+                            .vp_b, .vp_d, .vp_q, .vp_w => try self.asmRegisterRegisterMemory(
+                                .{ .vp_, .xor },
+                                dst_reg,
+                                dst_reg,
+                                not_mem,
+                            ),
+                            .p_b, .p_d, .p_q, .p_w => try self.asmRegisterMemory(
+                                .{ .p_, .xor },
+                                dst_reg,
+                                not_mem,
+                            ),
+                            else => unreachable,
+                        }
+                    },
+                    else => unreachable,
                 },
+                .Float => {},
                 else => unreachable,
             }
 
@@ -9331,8 +9823,12 @@ fn genBinOp(
             defer self.register_manager.unlockReg(gp_lock);
 
             try self.asmRegisterRegister(switch (mir_tag[0]) {
-                .vp_b, .vp_d, .vp_q, .vp_w => .{ .vp_b, .movmsk },
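+                // movmsk packs each lane's sign bit into a GP register,
+                // collapsing the compare mask computed above into a scalar.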
+                ._pd, ._sd => .{ ._pd, .movmsk },
+                ._ps, ._ss => .{ ._ps, .movmsk },
                 .p_b, .p_d, .p_q, .p_w => .{ .p_b, .movmsk },
+                .v_pd, .v_sd => .{ .v_pd, .movmsk },
+                .v_ps, .v_ss => .{ .v_ps, .movmsk },
+                .vp_b, .vp_d, .vp_q, .vp_w => .{ .vp_b, .movmsk },
                 else => unreachable,
             }, gp_reg.to32(), dst_reg);
             return .{ .register = gp_reg };
@@ -9459,13 +9955,13 @@ fn genBinOpMir(
                     .load_frame,
                     .lea_frame,
                     => {
-                        blk: {
-                            return self.asmRegisterMemory(mir_limb_tag, dst_alias, switch (src_mcv) {
+                        direct: {
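+                            // Use the operand's memory form directly when the
+                            // displacement fits in an i32; otherwise fall
+                            // through and address it via a register below.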
+                            try self.asmRegisterMemory(mir_limb_tag, dst_alias, switch (src_mcv) {
                                 .memory => |addr| .{
                                     .base = .{ .reg = .ds },
                                     .mod = .{ .rm = .{
                                         .size = Memory.Size.fromSize(limb_abi_size),
-                                        .disp = math.cast(i32, addr + off) orelse break :blk,
+                                        .disp = math.cast(i32, addr + off) orelse break :direct,
                                     } },
                                 },
                                 .indirect => |reg_off| .{
@@ -9482,8 +9978,9 @@ fn genBinOpMir(
                                         .disp = frame_addr.off + off,
                                     } },
                                 },
-                                else => break :blk,
+                                else => break :direct,
                             });
+                            continue;
                         }
 
                         switch (src_mcv) {
@@ -10180,7 +10677,7 @@ fn genCall(self: *Self, info: union(enum) {
         .none, .unreach => {},
         .indirect => |reg_off| {
             const ret_ty = fn_info.return_type.toType();
-            const frame_index = try self.allocFrameIndex(FrameAlloc.initType(ret_ty, mod));
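+            // initSpill reserves a spill-compatible slot, matching the
+            // full-register stores genSetMem may now emit into frames.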
+            const frame_index = try self.allocFrameIndex(FrameAlloc.initSpill(ret_ty, mod));
             try self.genSetReg(reg_off.reg, Type.usize, .{
                 .lea_frame = .{ .index = frame_index, .off = -reg_off.off },
             });
@@ -10306,19 +10803,20 @@ fn genCall(self: *Self, info: union(enum) {
 fn airRet(self: *Self, inst: Air.Inst.Index) !void {
     const mod = self.bin_file.options.module.?;
     const un_op = self.air.instructions.items(.data)[inst].un_op;
-    const operand = try self.resolveInst(un_op);
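+    // The operand is threaded through as an .air_ref below, deferring
+    // resolveInst until each destination is known.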
 
     const ret_ty = self.fn_type.fnReturnType(mod);
     switch (self.ret_mcv.short) {
         .none => {},
-        .register, .register_pair => try self.genCopy(ret_ty, self.ret_mcv.short, operand),
+        .register,
+        .register_pair,
+        => try self.genCopy(ret_ty, self.ret_mcv.short, .{ .air_ref = un_op }),
         .indirect => |reg_off| {
             try self.register_manager.getReg(reg_off.reg, null);
             const lock = self.register_manager.lockRegAssumeUnused(reg_off.reg);
             defer self.register_manager.unlockReg(lock);
 
             try self.genSetReg(reg_off.reg, Type.usize, self.ret_mcv.long);
-            try self.genSetMem(.{ .reg = reg_off.reg }, reg_off.off, ret_ty, operand);
+            try self.genSetMem(.{ .reg = reg_off.reg }, reg_off.off, ret_ty, .{ .air_ref = un_op });
         },
         else => unreachable,
     }
@@ -10593,7 +11091,7 @@ fn airCmp(self: *Self, inst: Air.Inst.Index, op: math.CompareOperator) !void {
                                 const locks = self.register_manager.lockRegsAssumeUnused(2, regs);
                                 defer for (locks) |lock| self.register_manager.unlockReg(lock);
 
-                                const limbs_len = std.math.divCeil(u16, abi_size, 8) catch unreachable;
+                                const limbs_len = math.divCeil(u16, abi_size, 8) catch unreachable;
                                 var limb_i: u16 = 0;
                                 while (limb_i < limbs_len) : (limb_i += 1) {
                                     const off = limb_i * 8;
@@ -10688,7 +11186,7 @@ fn airCmp(self: *Self, inst: Air.Inst.Index, op: math.CompareOperator) !void {
                             .{ .vp_w, .insr },
                             tmp1_reg,
                             dst_reg.to128(),
-                            src_mcv.mem(.word),
+                            try src_mcv.mem(self, .word),
                             Immediate.u(1),
                         ) else try self.asmRegisterRegisterRegister(
                             .{ .vp_, .unpcklwd },
@@ -10892,8 +11390,8 @@ fn genCondBrMir(self: *Self, ty: Type, mcv: MCValue) !Mir.Inst.Index {
         },
         .register => |reg| {
             try self.spillEflagsIfOccupied();
-            try self.asmRegisterImmediate(.{ ._, .@"test" }, reg, Immediate.u(1));
-            return self.asmJccReloc(.e, undefined);
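+            // Only a bool's low byte is defined, so test the 8-bit alias and
+            // take the jump when the tested bit is clear (.z is the same
+            // condition as .e but reads as a zero test).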
+            try self.asmRegisterImmediate(.{ ._, .@"test" }, reg.to8(), Immediate.u(1));
+            return self.asmJccReloc(.z, undefined);
         },
         .immediate,
         .load_frame,
@@ -11433,12 +11931,12 @@ fn airBr(self: *Self, inst: Air.Inst.Index) !void {
         if (self.reuseOperandAdvanced(inst, br.operand, 0, src_mcv, br.block_inst)) {
             if (first_br) break :result src_mcv;
 
-            if (block_tracking.getReg()) |block_reg|
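+            // Block results may occupy a register pair, so reserve and free
+            // every backing register, not just the first.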
+            for (block_tracking.getRegs()) |block_reg|
                 try self.register_manager.getReg(block_reg, br.block_inst);
             // .long = .none to avoid merging operand and block result stack frames.
             var current_tracking = InstTracking{ .long = .none, .short = src_mcv };
             try current_tracking.materializeUnsafe(self, br.block_inst, block_tracking.*);
-            if (src_mcv.getReg()) |src_reg| self.register_manager.freeReg(src_reg);
+            for (src_mcv.getRegs()) |src_reg| self.register_manager.freeReg(src_reg);
             break :result block_tracking.short;
         }
 
@@ -12177,16 +12675,87 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo
         .general_purpose, .segment => return .{ .move = .{ ._, .mov } },
         .x87 => return .x87_load_store,
         .mmx => {},
-        .sse => {
-            switch (ty.zigTypeTag(mod)) {
-                else => {
-                    const classes = mem.sliceTo(&abi.classifySystemV(ty, mod, .other), .none);
-                    assert(std.mem.indexOfNone(abi.Class, classes, &.{
-                        .integer, .sse, .float, .float_combine,
-                    }) == null);
-                    const abi_size = ty.abiSize(mod);
-                    if (abi_size < 4 or
-                        std.mem.indexOfScalar(abi.Class, classes, .integer) != null) switch (abi_size) {
+        .sse => switch (ty.zigTypeTag(mod)) {
+            else => {
+                const classes = mem.sliceTo(&abi.classifySystemV(ty, mod, .other), .none);
+                assert(std.mem.indexOfNone(abi.Class, classes, &.{
+                    .integer, .sse, .float, .float_combine,
+                }) == null);
+                const abi_size = ty.abiSize(mod);
+                if (abi_size < 4 or
+                    std.mem.indexOfScalar(abi.Class, classes, .integer) != null) switch (abi_size) {
+                    1 => if (self.hasFeature(.avx)) return .{ .vex_insert_extract = .{
+                        .insert = .{ .vp_b, .insr },
+                        .extract = .{ .vp_b, .extr },
+                    } } else if (self.hasFeature(.sse4_2)) return .{ .insert_extract = .{
+                        .insert = .{ .p_b, .insr },
+                        .extract = .{ .p_b, .extr },
+                    } },
+                    2 => return if (self.hasFeature(.avx)) .{ .vex_insert_extract = .{
+                        .insert = .{ .vp_w, .insr },
+                        .extract = .{ .vp_w, .extr },
+                    } } else .{ .insert_extract = .{
+                        .insert = .{ .p_w, .insr },
+                        .extract = .{ .p_w, .extr },
+                    } },
+                    3...4 => return .{ .move = if (self.hasFeature(.avx))
+                        .{ .v_d, .mov }
+                    else
+                        .{ ._d, .mov } },
+                    5...8 => return .{ .move = if (self.hasFeature(.avx))
+                        .{ .v_q, .mov }
+                    else
+                        .{ ._q, .mov } },
+                    9...16 => return .{ .move = if (self.hasFeature(.avx))
+                        if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
+                    else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
+                    17...32 => if (self.hasFeature(.avx))
+                        return .{ .move = if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } },
+                    else => {},
+                } else switch (abi_size) {
+                    4 => return .{ .move = if (self.hasFeature(.avx))
+                        .{ .v_ss, .mov }
+                    else
+                        .{ ._ss, .mov } },
+                    5...8 => return .{ .move = if (self.hasFeature(.avx))
+                        .{ .v_sd, .mov }
+                    else
+                        .{ ._sd, .mov } },
+                    9...16 => return .{ .move = if (self.hasFeature(.avx))
+                        if (aligned) .{ .v_pd, .mova } else .{ .v_pd, .movu }
+                    else if (aligned) .{ ._pd, .mova } else .{ ._pd, .movu } },
+                    17...32 => if (self.hasFeature(.avx)) return .{ .move = if (aligned)
+                        .{ .v_pd, .mova }
+                    else
+                        .{ .v_pd, .movu } },
+                    else => {},
+                }
+            },
+            .Float => switch (ty.floatBits(self.target.*)) {
+                16 => return if (self.hasFeature(.avx)) .{ .vex_insert_extract = .{
+                    .insert = .{ .vp_w, .insr },
+                    .extract = .{ .vp_w, .extr },
+                } } else .{ .insert_extract = .{
+                    .insert = .{ .p_w, .insr },
+                    .extract = .{ .p_w, .extr },
+                } },
+                32 => return .{ .move = if (self.hasFeature(.avx))
+                    .{ .v_ss, .mov }
+                else
+                    .{ ._ss, .mov } },
+                64 => return .{ .move = if (self.hasFeature(.avx))
+                    .{ .v_sd, .mov }
+                else
+                    .{ ._sd, .mov } },
+                128 => return .{ .move = if (self.hasFeature(.avx))
+                    if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
+                else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
+                else => {},
+            },
+            .Vector => switch (ty.childType(mod).zigTypeTag(mod)) {
+                .Bool => {},
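+                // (bool vectors fall through to the TODO failure below)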
+                .Int => switch (ty.childType(mod).intInfo(mod).bits) {
+                    8 => switch (ty.vectorLen(mod)) {
                         1 => if (self.hasFeature(.avx)) return .{ .vex_insert_extract = .{
                             .insert = .{ .vp_b, .insr },
                             .extract = .{ .vp_b, .extr },
@@ -12213,242 +12782,169 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo
                             if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
                         else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
                         17...32 => if (self.hasFeature(.avx))
-                            return .{ .move = if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } },
+                            return .{ .move = if (aligned)
+                                .{ .v_, .movdqa }
+                            else
+                                .{ .v_, .movdqu } },
                         else => {},
-                    } else switch (abi_size) {
-                        4 => return .{ .move = if (self.hasFeature(.avx))
-                            .{ .v_ss, .mov }
+                    },
+                    16 => switch (ty.vectorLen(mod)) {
+                        1 => return if (self.hasFeature(.avx)) .{ .vex_insert_extract = .{
+                            .insert = .{ .vp_w, .insr },
+                            .extract = .{ .vp_w, .extr },
+                        } } else .{ .insert_extract = .{
+                            .insert = .{ .p_w, .insr },
+                            .extract = .{ .p_w, .extr },
+                        } },
+                        2 => return .{ .move = if (self.hasFeature(.avx))
+                            .{ .v_d, .mov }
                         else
-                            .{ ._ss, .mov } },
+                            .{ ._d, .mov } },
+                        3...4 => return .{ .move = if (self.hasFeature(.avx))
+                            .{ .v_q, .mov }
+                        else
+                            .{ ._q, .mov } },
                         5...8 => return .{ .move = if (self.hasFeature(.avx))
-                            .{ .v_sd, .mov }
+                            if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
+                        else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
+                        9...16 => if (self.hasFeature(.avx))
+                            return .{ .move = if (aligned)
+                                .{ .v_, .movdqa }
+                            else
+                                .{ .v_, .movdqu } },
+                        else => {},
+                    },
+                    32 => switch (ty.vectorLen(mod)) {
+                        1 => return .{ .move = if (self.hasFeature(.avx))
+                            .{ .v_d, .mov }
                         else
-                            .{ ._sd, .mov } },
-                        9...16 => return .{ .move = if (self.hasFeature(.avx))
-                            if (aligned) .{ .v_pd, .mova } else .{ .v_pd, .movu }
-                        else if (aligned) .{ ._pd, .mova } else .{ ._pd, .movu } },
-                        17...32 => if (self.hasFeature(.avx)) return .{ .move = if (aligned)
-                            .{ .v_pd, .mova }
+                            .{ ._d, .mov } },
+                        2 => return .{ .move = if (self.hasFeature(.avx))
+                            .{ .v_q, .mov }
                         else
-                            .{ .v_pd, .movu } },
-                        else => {},
-                    }
-                },
-                .Float => switch (ty.floatBits(self.target.*)) {
-                    16 => return if (self.hasFeature(.avx)) .{ .vex_insert_extract = .{
-                        .insert = .{ .vp_w, .insr },
-                        .extract = .{ .vp_w, .extr },
-                    } } else .{ .insert_extract = .{
-                        .insert = .{ .p_w, .insr },
-                        .extract = .{ .p_w, .extr },
-                    } },
-                    32 => return .{ .move = if (self.hasFeature(.avx))
-                        .{ .v_ss, .mov }
-                    else
-                        .{ ._ss, .mov } },
-                    64 => return .{ .move = if (self.hasFeature(.avx))
-                        .{ .v_sd, .mov }
-                    else
-                        .{ ._sd, .mov } },
-                    128 => return .{ .move = if (self.hasFeature(.avx))
-                        if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
-                    else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
-                    else => {},
-                },
-                .Vector => switch (ty.childType(mod).zigTypeTag(mod)) {
-                    .Bool => return .{ .move = .{ ._, .mov } },
-                    .Int => switch (ty.childType(mod).intInfo(mod).bits) {
-                        8 => switch (ty.vectorLen(mod)) {
-                            1 => if (self.hasFeature(.avx)) return .{ .vex_insert_extract = .{
-                                .insert = .{ .vp_b, .insr },
-                                .extract = .{ .vp_b, .extr },
-                            } } else if (self.hasFeature(.sse4_2)) return .{ .insert_extract = .{
-                                .insert = .{ .p_b, .insr },
-                                .extract = .{ .p_b, .extr },
-                            } },
-                            2 => return if (self.hasFeature(.avx)) .{ .vex_insert_extract = .{
-                                .insert = .{ .vp_w, .insr },
-                                .extract = .{ .vp_w, .extr },
-                            } } else .{ .insert_extract = .{
-                                .insert = .{ .p_w, .insr },
-                                .extract = .{ .p_w, .extr },
-                            } },
-                            3...4 => return .{ .move = if (self.hasFeature(.avx))
-                                .{ .v_d, .mov }
-                            else
-                                .{ ._d, .mov } },
-                            5...8 => return .{ .move = if (self.hasFeature(.avx))
-                                .{ .v_q, .mov }
-                            else
-                                .{ ._q, .mov } },
-                            9...16 => return .{ .move = if (self.hasFeature(.avx))
-                                if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
-                            else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
-                            17...32 => if (self.hasFeature(.avx))
-                                return .{ .move = if (aligned)
-                                    .{ .v_, .movdqa }
-                                else
-                                    .{ .v_, .movdqu } },
-                            else => {},
-                        },
-                        16 => switch (ty.vectorLen(mod)) {
-                            1 => return if (self.hasFeature(.avx)) .{ .vex_insert_extract = .{
-                                .insert = .{ .vp_w, .insr },
-                                .extract = .{ .vp_w, .extr },
-                            } } else .{ .insert_extract = .{
-                                .insert = .{ .p_w, .insr },
-                                .extract = .{ .p_w, .extr },
-                            } },
-                            2 => return .{ .move = if (self.hasFeature(.avx))
-                                .{ .v_d, .mov }
-                            else
-                                .{ ._d, .mov } },
-                            3...4 => return .{ .move = if (self.hasFeature(.avx))
-                                .{ .v_q, .mov }
-                            else
-                                .{ ._q, .mov } },
-                            5...8 => return .{ .move = if (self.hasFeature(.avx))
-                                if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
-                            else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
-                            9...16 => if (self.hasFeature(.avx))
-                                return .{ .move = if (aligned)
-                                    .{ .v_, .movdqa }
-                                else
-                                    .{ .v_, .movdqu } },
-                            else => {},
-                        },
-                        32 => switch (ty.vectorLen(mod)) {
-                            1 => return .{ .move = if (self.hasFeature(.avx))
-                                .{ .v_d, .mov }
+                            .{ ._q, .mov } },
+                        3...4 => return .{ .move = if (self.hasFeature(.avx))
+                            if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
+                        else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
+                        5...8 => if (self.hasFeature(.avx))
+                            return .{ .move = if (aligned)
+                                .{ .v_, .movdqa }
                             else
-                                .{ ._d, .mov } },
-                            2 => return .{ .move = if (self.hasFeature(.avx))
-                                .{ .v_q, .mov }
+                                .{ .v_, .movdqu } },
+                        else => {},
+                    },
+                    64 => switch (ty.vectorLen(mod)) {
+                        1 => return .{ .move = if (self.hasFeature(.avx))
+                            .{ .v_q, .mov }
+                        else
+                            .{ ._q, .mov } },
+                        2 => return .{ .move = if (self.hasFeature(.avx))
+                            if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
+                        else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
+                        3...4 => if (self.hasFeature(.avx))
+                            return .{ .move = if (aligned)
+                                .{ .v_, .movdqa }
                             else
-                                .{ ._q, .mov } },
-                            3...4 => return .{ .move = if (self.hasFeature(.avx))
-                                if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
-                            else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
-                            5...8 => if (self.hasFeature(.avx))
-                                return .{ .move = if (aligned)
-                                    .{ .v_, .movdqa }
-                                else
-                                    .{ .v_, .movdqu } },
-                            else => {},
-                        },
-                        64 => switch (ty.vectorLen(mod)) {
-                            1 => return .{ .move = if (self.hasFeature(.avx))
-                                .{ .v_q, .mov }
+                                .{ .v_, .movdqu } },
+                        else => {},
+                    },
+                    128 => switch (ty.vectorLen(mod)) {
+                        1 => return .{ .move = if (self.hasFeature(.avx))
+                            if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
+                        else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
+                        2 => if (self.hasFeature(.avx))
+                            return .{ .move = if (aligned)
+                                .{ .v_, .movdqa }
                             else
-                                .{ ._q, .mov } },
-                            2 => return .{ .move = if (self.hasFeature(.avx))
-                                if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
-                            else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
-                            3...4 => if (self.hasFeature(.avx))
-                                return .{ .move = if (aligned)
-                                    .{ .v_, .movdqa }
-                                else
-                                    .{ .v_, .movdqu } },
-                            else => {},
-                        },
-                        128 => switch (ty.vectorLen(mod)) {
-                            1 => return .{ .move = if (self.hasFeature(.avx))
-                                if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
-                            else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
-                            2 => if (self.hasFeature(.avx))
-                                return .{ .move = if (aligned)
-                                    .{ .v_, .movdqa }
-                                else
-                                    .{ .v_, .movdqu } },
-                            else => {},
-                        },
-                        256 => switch (ty.vectorLen(mod)) {
-                            1 => if (self.hasFeature(.avx))
-                                return .{ .move = if (aligned)
-                                    .{ .v_, .movdqa }
-                                else
-                                    .{ .v_, .movdqu } },
-                            else => {},
-                        },
+                                .{ .v_, .movdqu } },
                         else => {},
                     },
-                    .Float => switch (ty.childType(mod).floatBits(self.target.*)) {
-                        16 => switch (ty.vectorLen(mod)) {
-                            1 => return if (self.hasFeature(.avx)) .{ .vex_insert_extract = .{
-                                .insert = .{ .vp_w, .insr },
-                                .extract = .{ .vp_w, .extr },
-                            } } else .{ .insert_extract = .{
-                                .insert = .{ .p_w, .insr },
-                                .extract = .{ .p_w, .extr },
-                            } },
-                            2 => return .{ .move = if (self.hasFeature(.avx))
-                                .{ .v_d, .mov }
+                    256 => switch (ty.vectorLen(mod)) {
+                        1 => if (self.hasFeature(.avx))
+                            return .{ .move = if (aligned)
+                                .{ .v_, .movdqa }
                             else
-                                .{ ._d, .mov } },
-                            3...4 => return .{ .move = if (self.hasFeature(.avx))
-                                .{ .v_q, .mov }
+                                .{ .v_, .movdqu } },
+                        else => {},
+                    },
+                    else => {},
+                },
+                .Float => switch (ty.childType(mod).floatBits(self.target.*)) {
+                    16 => switch (ty.vectorLen(mod)) {
+                        1 => return if (self.hasFeature(.avx)) .{ .vex_insert_extract = .{
+                            .insert = .{ .vp_w, .insr },
+                            .extract = .{ .vp_w, .extr },
+                        } } else .{ .insert_extract = .{
+                            .insert = .{ .p_w, .insr },
+                            .extract = .{ .p_w, .extr },
+                        } },
+                        2 => return .{ .move = if (self.hasFeature(.avx))
+                            .{ .v_d, .mov }
+                        else
+                            .{ ._d, .mov } },
+                        3...4 => return .{ .move = if (self.hasFeature(.avx))
+                            .{ .v_q, .mov }
+                        else
+                            .{ ._q, .mov } },
+                        5...8 => return .{ .move = if (self.hasFeature(.avx))
+                            if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
+                        else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
+                        9...16 => if (self.hasFeature(.avx))
+                            return .{ .move = if (aligned)
+                                .{ .v_, .movdqa }
                             else
-                                .{ ._q, .mov } },
-                            5...8 => return .{ .move = if (self.hasFeature(.avx))
-                                if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
-                            else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
-                            9...16 => if (self.hasFeature(.avx))
-                                return .{ .move = if (aligned)
-                                    .{ .v_, .movdqa }
-                                else
-                                    .{ .v_, .movdqu } },
-                            else => {},
-                        },
-                        32 => switch (ty.vectorLen(mod)) {
-                            1 => return .{ .move = if (self.hasFeature(.avx))
-                                .{ .v_ss, .mov }
+                                .{ .v_, .movdqu } },
+                        else => {},
+                    },
+                    32 => switch (ty.vectorLen(mod)) {
+                        1 => return .{ .move = if (self.hasFeature(.avx))
+                            .{ .v_ss, .mov }
+                        else
+                            .{ ._ss, .mov } },
+                        2 => return .{ .move = if (self.hasFeature(.avx))
+                            .{ .v_sd, .mov }
+                        else
+                            .{ ._sd, .mov } },
+                        3...4 => return .{ .move = if (self.hasFeature(.avx))
+                            if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu }
+                        else if (aligned) .{ ._ps, .mova } else .{ ._ps, .movu } },
+                        5...8 => if (self.hasFeature(.avx))
+                            return .{ .move = if (aligned)
+                                .{ .v_ps, .mova }
                             else
-                                .{ ._ss, .mov } },
-                            2 => return .{ .move = if (self.hasFeature(.avx))
-                                .{ .v_sd, .mov }
+                                .{ .v_ps, .movu } },
+                        else => {},
+                    },
+                    64 => switch (ty.vectorLen(mod)) {
+                        1 => return .{ .move = if (self.hasFeature(.avx))
+                            .{ .v_sd, .mov }
+                        else
+                            .{ ._sd, .mov } },
+                        2 => return .{ .move = if (self.hasFeature(.avx))
+                            if (aligned) .{ .v_pd, .mova } else .{ .v_pd, .movu }
+                        else if (aligned) .{ ._pd, .mova } else .{ ._pd, .movu } },
+                        3...4 => if (self.hasFeature(.avx))
+                            return .{ .move = if (aligned)
+                                .{ .v_pd, .mova }
                             else
-                                .{ ._sd, .mov } },
-                            3...4 => return .{ .move = if (self.hasFeature(.avx))
-                                if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu }
-                            else if (aligned) .{ ._ps, .mova } else .{ ._ps, .movu } },
-                            5...8 => if (self.hasFeature(.avx))
-                                return .{ .move = if (aligned)
-                                    .{ .v_ps, .mova }
-                                else
-                                    .{ .v_ps, .movu } },
-                            else => {},
-                        },
-                        64 => switch (ty.vectorLen(mod)) {
-                            1 => return .{ .move = if (self.hasFeature(.avx))
-                                .{ .v_sd, .mov }
+                                .{ .v_pd, .movu } },
+                        else => {},
+                    },
+                    128 => switch (ty.vectorLen(mod)) {
+                        1 => return .{ .move = if (self.hasFeature(.avx))
+                            if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
+                        else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
+                        2 => if (self.hasFeature(.avx))
+                            return .{ .move = if (aligned)
+                                .{ .v_, .movdqa }
                             else
-                                .{ ._sd, .mov } },
-                            2 => return .{ .move = if (self.hasFeature(.avx))
-                                if (aligned) .{ .v_pd, .mova } else .{ .v_pd, .movu }
-                            else if (aligned) .{ ._pd, .mova } else .{ ._pd, .movu } },
-                            3...4 => if (self.hasFeature(.avx))
-                                return .{ .move = if (aligned)
-                                    .{ .v_pd, .mova }
-                                else
-                                    .{ .v_pd, .movu } },
-                            else => {},
-                        },
-                        128 => switch (ty.vectorLen(mod)) {
-                            1 => return .{ .move = if (self.hasFeature(.avx))
-                                if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu }
-                            else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } },
-                            2 => if (self.hasFeature(.avx))
-                                return .{ .move = if (aligned)
-                                    .{ .v_, .movdqa }
-                                else
-                                    .{ .v_, .movdqu } },
-                            else => {},
-                        },
+                                .{ .v_, .movdqu } },
                         else => {},
                     },
                     else => {},
                 },
-            }
+                else => {},
+            },
         },
     }
     return self.fail("TODO moveStrategy for {}", .{ty.fmt(mod)});
@@ -12514,32 +13010,18 @@ fn genCopy(self: *Self, ty: Type, dst_mcv: MCValue, src_mcv: MCValue) InnerError
             };
             defer if (src_info) |info| self.register_manager.unlockReg(info.addr_lock);
 
-            const classes = mem.sliceTo(&abi.classifySystemV(ty, mod, .other), .none);
-            for (dst_regs, classes, 0..) |dst_reg, class, dst_reg_i| {
-                const class_ty = switch (class) {
-                    .integer => Type.usize,
-                    .sse, .float, .float_combine => Type.f64,
-                    else => unreachable,
-                };
-                const off: i32 = @intCast(dst_reg_i * 8);
-                switch (src_mcv) {
-                    .register_pair => |src_regs| try self.genSetReg(
-                        dst_reg,
-                        class_ty,
-                        .{ .register = src_regs[dst_reg_i] },
-                    ),
-                    .memory, .indirect, .load_frame => try self.genSetReg(
-                        dst_reg,
-                        class_ty,
-                        src_mcv.address().offset(off).deref(),
-                    ),
-                    .load_symbol, .load_direct, .load_got, .load_tlv => try self.genSetReg(
-                        dst_reg,
-                        class_ty,
-                        .{ .indirect = .{ .reg = src_info.?.addr_reg, .off = off } },
-                    ),
+            var part_disp: i32 = 0;
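+            // Copy each split part at its accumulated byte offset; the parts
+            // may have different types (e.g. integer and sse halves).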
+            for (dst_regs, try self.splitType(ty), 0..) |dst_reg, dst_ty, part_i| {
+                try self.genSetReg(dst_reg, dst_ty, switch (src_mcv) {
+                    .register_pair => |src_regs| .{ .register = src_regs[part_i] },
+                    .memory, .indirect, .load_frame => src_mcv.address().offset(part_disp).deref(),
+                    .load_symbol, .load_direct, .load_got, .load_tlv => .{ .indirect = .{
+                        .reg = src_info.?.addr_reg,
+                        .off = part_disp,
+                    } },
                     else => unreachable,
-                }
+                });
+                part_disp += @intCast(dst_ty.abiSize(mod));
             }
         },
         .indirect => |reg_off| try self.genSetMem(.{ .reg = reg_off.reg }, reg_off.off, ty, src_mcv),
@@ -12584,6 +13066,7 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr
             if (imm == 0) {
                 // 32-bit moves zero-extend to 64-bit, so xoring the 32-bit
                 // register is the fastest way to zero a register.
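+                // xor also clobbers eflags, so spill any live flags first.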
+                try self.spillEflagsIfOccupied();
                 try self.asmRegisterRegister(.{ ._, .xor }, dst_reg.to32(), dst_reg.to32());
             } else if (abi_size > 4 and math.cast(u32, imm) != null) {
                 // 32-bit moves zero-extend to 64-bit.
@@ -12933,44 +13416,65 @@ fn genSetMem(self: *Self, base: Memory.Base, disp: i32, ty: Type, src_mcv: MCVal
         .eflags => |cc| try self.asmSetccMemory(cc, .{ .base = base, .mod = .{
             .rm = .{ .size = .byte, .disp = disp },
         } }),
-        .register => |src_reg| try (try self.moveStrategy(ty, src_reg.class(), switch (base) {
-            .none => ty.abiAlignment(mod).check(@as(u32, @bitCast(disp))),
-            .reg => |reg| switch (reg) {
-                .es, .cs, .ss, .ds => ty.abiAlignment(mod).check(@as(u32, @bitCast(disp))),
-                else => false,
-            },
-            .frame => |frame_index| self.getFrameAddrAlignment(
-                .{ .index = frame_index, .off = disp },
-            ).compare(.gte, ty.abiAlignment(mod)),
-            .reloc => false,
-        })).write(
-            self,
-            .{ .base = base, .mod = .{ .rm = .{
-                .size = self.memSize(ty),
-                .disp = disp,
-            } } },
-            registerAlias(src_reg, abi_size),
-        ),
-        .register_pair => |src_regs| for (src_regs, 0..) |src_reg, src_reg_i| {
-            const part_size: u16 = @min(abi_size - src_reg_i * 8, 8);
-            try (try self.moveStrategy(
-                try mod.intType(.unsigned, part_size * 8),
-                src_reg.class(),
-                switch (base) {
-                    .none => ty.abiAlignment(mod).check(@as(u32, @bitCast(disp))),
-                    .reg => |reg| switch (reg) {
-                        .es, .cs, .ss, .ds => ty.abiAlignment(mod).check(@as(u32, @bitCast(disp))),
-                        else => false,
-                    },
-                    .frame => |frame_index| self.getFrameAddrAlignment(
-                        .{ .index = frame_index, .off = disp },
-                    ).compare(.gte, ty.abiAlignment(mod)),
-                    .reloc => false,
+        .register => |src_reg| {
+            const mem_size = switch (base) {
+                .frame => |base_fi| mem_size: {
+                    assert(disp >= 0);
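+                    // When the store reaches the end of the slot's used bytes,
+                    // the whole spill-padded slot may be written, allowing a
+                    // full-register store.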
+                    const frame_abi_size = self.frame_allocs.items(.abi_size)[@intFromEnum(base_fi)];
+                    const frame_spill_pad = self.frame_allocs.items(.spill_pad)[@intFromEnum(base_fi)];
+                    assert(frame_abi_size - frame_spill_pad - disp >= abi_size);
+                    break :mem_size if (frame_abi_size - frame_spill_pad - disp == abi_size)
+                        frame_abi_size
+                    else
+                        abi_size;
+                },
+                else => abi_size,
+            };
+            const src_alias = registerAlias(src_reg, abi_size);
+            const src_size: u32 = @intCast(switch (src_alias.class()) {
+                .general_purpose, .segment, .x87 => @divExact(src_alias.bitSize(), 8),
+                .mmx, .sse => abi_size,
+            });
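+            // A register wider than the destination is staged through a frame
+            // slot of its own size; only the low bytes are then copied out.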
+            if (src_size > mem_size) {
+                const frame_index = try self.allocFrameIndex(FrameAlloc.init(.{
+                    .size = src_size,
+                    .alignment = Alignment.fromNonzeroByteUnits(src_size),
+                }));
+                const frame_mcv: MCValue = .{ .load_frame = .{ .index = frame_index } };
+                try (try self.moveStrategy(ty, src_alias.class(), true)).write(
+                    self,
+                    .{ .base = .{ .frame = frame_index }, .mod = .{ .rm = .{
+                        .size = Memory.Size.fromSize(src_size),
+                    } } },
+                    src_alias,
+                );
+                try self.genSetMem(base, disp, ty, frame_mcv);
+                try self.freeValue(frame_mcv);
+            } else try (try self.moveStrategy(ty, src_alias.class(), switch (base) {
+                .none => ty.abiAlignment(mod).check(@as(u32, @bitCast(disp))),
+                .reg => |reg| switch (reg) {
+                    .es, .cs, .ss, .ds => ty.abiAlignment(mod).check(@as(u32, @bitCast(disp))),
+                    else => false,
                 },
-            )).write(self, .{ .base = base, .mod = .{ .rm = .{
-                .size = Memory.Size.fromSize(part_size),
-                .disp = disp + @as(i32, @intCast(src_reg_i * 8)),
-            } } }, registerAlias(src_reg, part_size));
+                .frame => |frame_index| self.getFrameAddrAlignment(
+                    .{ .index = frame_index, .off = disp },
+                ).compare(.gte, ty.abiAlignment(mod)),
+                .reloc => false,
+            })).write(
+                self,
+                .{ .base = base, .mod = .{ .rm = .{
+                    .size = self.memSize(ty),
+                    .disp = disp,
+                } } },
+                src_alias,
+            );
+        },
+        .register_pair => |src_regs| {
+            var part_disp: i32 = disp;
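+            // Store each half of the pair through its split part type at its
+            // natural offset.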
+            for (try self.splitType(ty), src_regs) |src_ty, src_reg| {
+                try self.genSetMem(base, part_disp, src_ty, .{ .register = src_reg });
+                part_disp += @intCast(src_ty.abiSize(mod));
+            }
         },
         .register_overflow => |ro| switch (ty.zigTypeTag(mod)) {
             .Struct => {
@@ -13226,50 +13730,43 @@ fn airBitCast(self: *Self, inst: Air.Inst.Index) !void {
         const src_lock = if (src_mcv.getReg()) |reg| self.register_manager.lockReg(reg) else null;
         defer if (src_lock) |lock| self.register_manager.unlockReg(lock);
 
-        const dst_mcv = if (dst_rc.supersetOf(src_rc) and
-            self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
-            src_mcv
-        else dst: {
+        const dst_mcv = if (dst_rc.supersetOf(src_rc) and dst_ty.abiSize(mod) <= src_ty.abiSize(mod) and
+            self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) src_mcv else dst: {
             const dst_mcv = try self.allocRegOrMem(inst, true);
-            try self.genCopy(
-                if (!dst_mcv.isMemory() or src_mcv.isMemory()) dst_ty else src_ty,
-                dst_mcv,
-                src_mcv,
-            );
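+            // Copy through whichever type is smaller so a size-changing
+            // bitcast never touches bytes outside either value.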
+            try self.genCopy(switch (math.order(dst_ty.abiSize(mod), src_ty.abiSize(mod))) {
+                .lt => dst_ty,
+                .eq => if (!dst_mcv.isMemory() or src_mcv.isMemory()) dst_ty else src_ty,
+                .gt => src_ty,
+            }, dst_mcv, src_mcv);
             break :dst dst_mcv;
         };
 
         if (dst_ty.isRuntimeFloat()) break :result dst_mcv;
 
-        const dst_signedness =
-            if (dst_ty.isAbiInt(mod)) dst_ty.intInfo(mod).signedness else .unsigned;
-        if (!src_ty.isRuntimeFloat() or src_ty.floatBits(self.target.*) != 80) {
-            const src_signedness =
-                if (src_ty.isAbiInt(mod)) src_ty.intInfo(mod).signedness else .unsigned;
-            if (dst_signedness == src_signedness) break :result dst_mcv;
-        }
+        if (dst_ty.isAbiInt(mod) and src_ty.isAbiInt(mod) and
+            dst_ty.intInfo(mod).signedness == src_ty.intInfo(mod).signedness) break :result dst_mcv;
 
-        const abi_size: u16 = @intCast(dst_ty.abiSize(mod));
-        const bit_size: u16 = @intCast(dst_ty.bitSize(mod));
-        if (abi_size * 8 <= bit_size) break :result dst_mcv;
+        const abi_size = dst_ty.abiSize(mod);
+        const bit_size = dst_ty.bitSize(mod);
+        if (abi_size * 8 <= bit_size or dst_ty.isVector(mod)) break :result dst_mcv;
 
-        const dst_limbs_len = math.divCeil(i32, bit_size, 64) catch unreachable;
-        const high_reg = if (dst_mcv.isRegister())
-            dst_mcv.getReg().?
+        const dst_limbs_len = math.divCeil(i32, @intCast(bit_size), 64) catch unreachable;
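+        // Locate the most significant limb of the destination so its padding
+        // bits can be truncated below.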
+        const high_mcv: MCValue = switch (dst_mcv) {
+            .register => |dst_reg| .{ .register = dst_reg },
+            .register_pair => |dst_regs| .{ .register = dst_regs[1] },
+            else => dst_mcv.address().offset((dst_limbs_len - 1) * 8).deref(),
+        };
+        const high_reg = if (high_mcv.isRegister())
+            high_mcv.getReg().?
         else
-            try self.copyToTmpRegister(
-                Type.usize,
-                dst_mcv.address().offset((dst_limbs_len - 1) * 8).deref(),
-            );
+            try self.copyToTmpRegister(Type.usize, high_mcv);
         const high_lock = self.register_manager.lockReg(high_reg);
         defer if (high_lock) |lock| self.register_manager.unlockReg(lock);
 
-        const high_ty = try mod.intType(dst_signedness, bit_size % 64);
-
-        try self.truncateRegister(high_ty, high_reg);
-        if (!dst_mcv.isRegister()) try self.genCopy(
-            Type.usize,
-            dst_mcv.address().offset((dst_limbs_len - 1) * 8).deref(),
+        try self.truncateRegister(dst_ty, high_reg);
+        if (!high_mcv.isRegister()) try self.genCopy(
+            if (abi_size <= 8) dst_ty else Type.usize,
+            high_mcv,
             .{ .register = high_reg },
         );
         break :result dst_mcv;
@@ -13287,7 +13784,7 @@ fn airArrayToSlice(self: *Self, inst: Air.Inst.Index) !void {
     const array_ty = ptr_ty.childType(mod);
     const array_len = array_ty.arrayLen(mod);
 
-    const frame_index = try self.allocFrameIndex(FrameAlloc.initType(slice_ty, mod));
+    const frame_index = try self.allocFrameIndex(FrameAlloc.initSpill(slice_ty, mod));
     try self.genSetMem(.{ .frame = frame_index }, 0, ptr_ty, ptr);
     try self.genSetMem(
         .{ .frame = frame_index },
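
The initSpill above backs the two-word slice result; at the source level this is the ordinary array-pointer-to-slice coercion, sketched here (not the commit's test):

    const std = @import("std");

    test "array pointer coerces to a slice" {
        var arr: [3]u8 = .{ 1, 2, 3 };
        const s: []u8 = &arr; // airArrayToSlice stores the pointer, then the length
        try std.testing.expectEqual(@as(usize, 3), s.len);
        try std.testing.expectEqual(@as(u8, 2), s[1]);
    }
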
@@ -13497,7 +13994,7 @@ fn airCmpxchg(self: *Self, inst: Air.Inst.Index) !void {
     const ptr_mcv = try self.resolveInst(extra.ptr);
     const mem_size = Memory.Size.fromSize(val_abi_size);
     const ptr_mem: Memory = switch (ptr_mcv) {
-        .immediate, .register, .register_offset, .lea_frame => ptr_mcv.deref().mem(mem_size),
+        .immediate, .register, .register_offset, .lea_frame => try ptr_mcv.deref().mem(self, mem_size),
         else => .{
             .base = .{ .reg = try self.copyToTmpRegister(ptr_ty, ptr_mcv) },
             .mod = .{ .rm = .{ .size = mem_size } },
@@ -13563,7 +14060,7 @@ fn atomicOp(
     const val_abi_size: u32 = @intCast(val_ty.abiSize(mod));
     const mem_size = Memory.Size.fromSize(val_abi_size);
     const ptr_mem: Memory = switch (ptr_mcv) {
-        .immediate, .register, .register_offset, .lea_frame => ptr_mcv.deref().mem(mem_size),
+        .immediate, .register, .register_offset, .lea_frame => try ptr_mcv.deref().mem(self, mem_size),
         else => .{
             .base = .{ .reg = try self.copyToTmpRegister(ptr_ty, ptr_mcv) },
             .mod = .{ .rm = .{ .size = mem_size } },
@@ -13671,27 +14168,41 @@ fn atomicOp(
                         },
                     };
 
-                    try self.genBinOpMir(.{ ._, .cmp }, val_ty, tmp_mcv, val_mcv);
                     const cmov_abi_size = @max(val_abi_size, 2);
                     switch (val_mcv) {
-                        .register => |val_reg| try self.asmCmovccRegisterRegister(
-                            cc,
-                            registerAlias(tmp_reg, cmov_abi_size),
-                            registerAlias(val_reg, cmov_abi_size),
-                        ),
-                        .memory, .indirect, .load_frame => try self.asmCmovccRegisterMemory(
-                            cc,
-                            registerAlias(tmp_reg, cmov_abi_size),
-                            val_mcv.mem(Memory.Size.fromSize(cmov_abi_size)),
-                        ),
-                        else => {
-                            const val_reg = try self.copyToTmpRegister(val_ty, val_mcv);
+                        .register => |val_reg| {
+                            try self.genBinOpMir(.{ ._, .cmp }, val_ty, tmp_mcv, val_mcv);
                             try self.asmCmovccRegisterRegister(
                                 cc,
                                 registerAlias(tmp_reg, cmov_abi_size),
                                 registerAlias(val_reg, cmov_abi_size),
                             );
                         },
+                        .memory, .indirect, .load_frame => {
+                            try self.genBinOpMir(.{ ._, .cmp }, val_ty, tmp_mcv, val_mcv);
+                            try self.asmCmovccRegisterMemory(
+                                cc,
+                                registerAlias(tmp_reg, cmov_abi_size),
+                                try val_mcv.mem(self, Memory.Size.fromSize(cmov_abi_size)),
+                            );
+                        },
+                        else => {
+                            const mat_reg = try self.copyToTmpRegister(val_ty, val_mcv);
+                            const mat_lock = self.register_manager.lockRegAssumeUnused(mat_reg);
+                            defer self.register_manager.unlockReg(mat_lock);
+
+                            try self.genBinOpMir(
+                                .{ ._, .cmp },
+                                val_ty,
+                                tmp_mcv,
+                                .{ .register = mat_reg },
+                            );
+                            try self.asmCmovccRegisterRegister(
+                                cc,
+                                registerAlias(tmp_reg, cmov_abi_size),
+                                registerAlias(mat_reg, cmov_abi_size),
+                            );
+                        },
                     }
                 },
             };
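
The restructuring above exists because copyToTmpRegister may emit instructions that clobber EFLAGS, so the cmp must be issued only after the operand is materialized, immediately before the cmov. A user-level test that exercises this min/max path (a sketch, not the commit's test):

    const std = @import("std");

    test "atomic rmw min takes the cmp/cmov path" {
        var x: u32 = 5;
        const prev = @atomicRmw(u32, &x, .Min, 3, .SeqCst);
        try std.testing.expectEqual(@as(u32, 5), prev);
        try std.testing.expectEqual(@as(u32, 3), x);
    }
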
@@ -13728,8 +14239,8 @@ fn atomicOp(
                     .reg = try self.copyToTmpRegister(Type.usize, val_mcv.address()),
                 } },
             };
-            const val_lo_mem = val_mem_mcv.mem(.qword);
-            const val_hi_mem = val_mem_mcv.address().offset(8).deref().mem(.qword);
+            const val_lo_mem = try val_mem_mcv.mem(self, .qword);
+            const val_hi_mem = try val_mem_mcv.address().offset(8).deref().mem(self, .qword);
             if (rmw_op != std.builtin.AtomicRmwOp.Xchg) {
                 try self.asmRegisterRegister(.{ ._, .mov }, .rbx, .rax);
                 try self.asmRegisterRegister(.{ ._, .mov }, .rcx, .rdx);
@@ -14000,7 +14511,7 @@ fn airMemcpy(self: *Self, inst: Air.Inst.Index) !void {
             try self.asmRegisterMemoryImmediate(
                 .{ .i_, .mul },
                 len_reg,
-                dst_ptr.address().offset(8).deref().mem(.qword),
+                try dst_ptr.address().offset(8).deref().mem(self, .qword),
                 Immediate.s(@intCast(dst_ptr_ty.childType(mod).abiSize(mod))),
             );
             break :len .{ .register = len_reg };
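
The imul above scales a slice length by the element size to obtain a byte count; a sketch of the user code it serves (not from the commit):

    const std = @import("std");

    test "memcpy of multi-byte elements" {
        var src: [4]u32 = .{ 1, 2, 3, 4 };
        var dst: [4]u32 = undefined;
        @memcpy(dst[0..], src[0..]); // a length of 4 becomes 16 bytes via the imul
        try std.testing.expectEqualSlices(u32, &src, &dst);
    }
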
@@ -14171,28 +14682,162 @@ fn airSplat(self: *Self, inst: Air.Inst.Index) !void {
     const mod = self.bin_file.options.module.?;
     const ty_op = self.air.instructions.items(.data)[inst].ty_op;
     const vector_ty = self.typeOfIndex(inst);
+    const vector_len = vector_ty.vectorLen(mod);
     const dst_rc = self.regClassForType(vector_ty);
-    const scalar_ty = vector_ty.scalarType(mod);
+    const scalar_ty = self.typeOf(ty_op.operand);
 
-    const src_mcv = try self.resolveInst(ty_op.operand);
     const result: MCValue = result: {
         switch (scalar_ty.zigTypeTag(mod)) {
             else => {},
+            .Bool => {
+                const regs =
+                    try self.register_manager.allocRegs(2, .{ inst, null }, abi.RegisterClass.gp);
+                const reg_locks = self.register_manager.lockRegsAssumeUnused(2, regs);
+                defer for (reg_locks) |lock| self.register_manager.unlockReg(lock);
+
+                try self.genSetReg(regs[0], vector_ty, .{ .immediate = 0 });
+                try self.genSetReg(
+                    regs[1],
+                    vector_ty,
+                    .{ .immediate = @as(u64, math.maxInt(u64)) >> @intCast(64 - vector_len) },
+                );
+                const src_mcv = try self.resolveInst(ty_op.operand);
+                const abi_size = @max(math.divCeil(u32, vector_len, 8) catch unreachable, 4);
+                try self.asmCmovccRegisterRegister(
+                    switch (src_mcv) {
+                        .eflags => |cc| cc,
+                        .register => |src_reg| cc: {
+                            try self.asmRegisterImmediate(
+                                .{ ._, .@"test" },
+                                src_reg.to8(),
+                                Immediate.u(1),
+                            );
+                            break :cc .nz;
+                        },
+                        else => cc: {
+                            try self.asmMemoryImmediate(
+                                .{ ._, .@"test" },
+                                try src_mcv.mem(self, .byte),
+                                Immediate.u(1),
+                            );
+                            break :cc .nz;
+                        },
+                    },
+                    registerAlias(regs[0], abi_size),
+                    registerAlias(regs[1], abi_size),
+                );
+                break :result .{ .register = regs[0] };
+            },
+            .Int => if (self.hasFeature(.avx2)) avx2: {
+                const mir_tag = @as(?Mir.Inst.FixedTag, switch (scalar_ty.intInfo(mod).bits) {
+                    else => null,
+                    1...8 => switch (vector_len) {
+                        else => null,
+                        1...32 => .{ .vp_b, .broadcast },
+                    },
+                    9...16 => switch (vector_len) {
+                        else => null,
+                        1...16 => .{ .vp_w, .broadcast },
+                    },
+                    17...32 => switch (vector_len) {
+                        else => null,
+                        1...8 => .{ .vp_d, .broadcast },
+                    },
+                    33...64 => switch (vector_len) {
+                        else => null,
+                        1...4 => .{ .vp_q, .broadcast },
+                    },
+                    65...128 => switch (vector_len) {
+                        else => null,
+                        1...2 => .{ .vp_i128, .broadcast },
+                    },
+                }) orelse break :avx2;
+
+                const dst_reg = try self.register_manager.allocReg(inst, abi.RegisterClass.sse);
+                const dst_lock = self.register_manager.lockRegAssumeUnused(dst_reg);
+                defer self.register_manager.unlockReg(dst_lock);
+
+                const src_mcv = try self.resolveInst(ty_op.operand);
+                if (src_mcv.isMemory()) try self.asmRegisterMemory(
+                    mir_tag,
+                    registerAlias(dst_reg, @intCast(vector_ty.abiSize(mod))),
+                    try src_mcv.mem(self, self.memSize(scalar_ty)),
+                ) else {
+                    if (mir_tag[0] == .vp_i128) break :avx2;
+                    try self.genSetReg(dst_reg, scalar_ty, src_mcv);
+                    try self.asmRegisterRegister(
+                        mir_tag,
+                        registerAlias(dst_reg, @intCast(vector_ty.abiSize(mod))),
+                        registerAlias(dst_reg, @intCast(scalar_ty.abiSize(mod))),
+                    );
+                }
+                break :result .{ .register = dst_reg };
+            } else {
+                const dst_reg = try self.register_manager.allocReg(inst, abi.RegisterClass.sse);
+                const dst_lock = self.register_manager.lockRegAssumeUnused(dst_reg);
+                defer self.register_manager.unlockReg(dst_lock);
+
+                try self.genSetReg(dst_reg, scalar_ty, .{ .air_ref = ty_op.operand });
+                if (vector_len == 1) break :result .{ .register = dst_reg };
+
+                const dst_alias = registerAlias(dst_reg, @intCast(vector_ty.abiSize(mod)));
+                const scalar_bits = scalar_ty.intInfo(mod).bits;
+                if (switch (scalar_bits) {
+                    1...8 => true,
+                    9...128 => false,
+                    else => unreachable,
+                }) if (self.hasFeature(.avx)) try self.asmRegisterRegisterRegister(
+                    .{ .vp_, .unpcklbw },
+                    dst_alias,
+                    dst_alias,
+                    dst_alias,
+                ) else try self.asmRegisterRegister(
+                    .{ .p_, .unpcklbw },
+                    dst_alias,
+                    dst_alias,
+                );
+                if (switch (scalar_bits) {
+                    1...8 => vector_len > 2,
+                    9...16 => true,
+                    17...128 => false,
+                    else => unreachable,
+                }) try self.asmRegisterRegisterImmediate(
+                    .{ if (self.hasFeature(.avx)) .vp_w else .p_w, .shufl },
+                    dst_alias,
+                    dst_alias,
+                    Immediate.u(0),
+                );
+                if (switch (scalar_bits) {
+                    1...8 => vector_len > 4,
+                    9...16 => vector_len > 2,
+                    17...64 => true,
+                    65...128 => false,
+                    else => unreachable,
+                }) try self.asmRegisterRegisterImmediate(
+                    .{ if (self.hasFeature(.avx)) .vp_d else .p_d, .shuf },
+                    dst_alias,
+                    dst_alias,
+                    Immediate.u(if (scalar_bits <= 64) 0b00_00_00_00 else 0b01_00_01_00),
+                );
+                break :result .{ .register = dst_reg };
+            },
             .Float => switch (scalar_ty.floatBits(self.target.*)) {
-                32 => switch (vector_ty.vectorLen(mod)) {
+                32 => switch (vector_len) {
                     1 => {
+                        const src_mcv = try self.resolveInst(ty_op.operand);
                         if (self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) break :result src_mcv;
                         const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
                         try self.genSetReg(dst_reg, scalar_ty, src_mcv);
                         break :result .{ .register = dst_reg };
                     },
                     2...4 => {
+                        const src_mcv = try self.resolveInst(ty_op.operand);
                         if (self.hasFeature(.avx)) {
                             const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
                             if (src_mcv.isMemory()) try self.asmRegisterMemory(
                                 .{ .v_ss, .broadcast },
                                 dst_reg.to128(),
-                                src_mcv.mem(.dword),
+                                try src_mcv.mem(self, .dword),
                             ) else {
                                 const src_reg = if (src_mcv.isRegister())
                                     src_mcv.getReg().?
@@ -14224,11 +14869,12 @@ fn airSplat(self: *Self, inst: Air.Inst.Index) !void {
                         }
                     },
                     5...8 => if (self.hasFeature(.avx)) {
+                        const src_mcv = try self.resolveInst(ty_op.operand);
                         const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
                         if (src_mcv.isMemory()) try self.asmRegisterMemory(
                             .{ .v_ss, .broadcast },
                             dst_reg.to256(),
-                            src_mcv.mem(.dword),
+                            try src_mcv.mem(self, .dword),
                         ) else {
                             const src_reg = if (src_mcv.isRegister())
                                 src_mcv.getReg().?
@@ -14259,20 +14905,22 @@ fn airSplat(self: *Self, inst: Air.Inst.Index) !void {
                     },
                     else => {},
                 },
-                64 => switch (vector_ty.vectorLen(mod)) {
+                64 => switch (vector_len) {
                     1 => {
+                        const src_mcv = try self.resolveInst(ty_op.operand);
                         if (self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) break :result src_mcv;
                         const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
                         try self.genSetReg(dst_reg, scalar_ty, src_mcv);
                         break :result .{ .register = dst_reg };
                     },
                     2 => {
+                        const src_mcv = try self.resolveInst(ty_op.operand);
                         const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
                         if (self.hasFeature(.sse3)) {
                             if (src_mcv.isMemory()) try self.asmRegisterMemory(
                                 if (self.hasFeature(.avx)) .{ .v_, .movddup } else .{ ._, .movddup },
                                 dst_reg.to128(),
-                                src_mcv.mem(.qword),
+                                try src_mcv.mem(self, .qword),
                             ) else try self.asmRegisterRegister(
                                 if (self.hasFeature(.avx)) .{ .v_, .movddup } else .{ ._, .movddup },
                                 dst_reg.to128(),
@@ -14292,11 +14940,12 @@ fn airSplat(self: *Self, inst: Air.Inst.Index) !void {
                         );
                     },
                     3...4 => if (self.hasFeature(.avx)) {
+                        const src_mcv = try self.resolveInst(ty_op.operand);
                         const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
                         if (src_mcv.isMemory()) try self.asmRegisterMemory(
                             .{ .v_sd, .broadcast },
                             dst_reg.to256(),
-                            src_mcv.mem(.qword),
+                            try src_mcv.mem(self, .qword),
                         ) else {
                             const src_reg = if (src_mcv.isRegister())
                                 src_mcv.getReg().?
@@ -14325,19 +14974,21 @@ fn airSplat(self: *Self, inst: Air.Inst.Index) !void {
                     },
                     else => {},
                 },
-                128 => switch (vector_ty.vectorLen(mod)) {
+                128 => switch (vector_len) {
                     1 => {
+                        const src_mcv = try self.resolveInst(ty_op.operand);
                         if (self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) break :result src_mcv;
                         const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
                         try self.genSetReg(dst_reg, scalar_ty, src_mcv);
                         break :result .{ .register = dst_reg };
                     },
                     2 => if (self.hasFeature(.avx)) {
+                        const src_mcv = try self.resolveInst(ty_op.operand);
                         const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
                         if (src_mcv.isMemory()) try self.asmRegisterMemory(
                             .{ .v_f128, .broadcast },
                             dst_reg.to256(),
-                            src_mcv.mem(.xword),
+                            try src_mcv.mem(self, .xword),
                         ) else {
                             const src_reg = if (src_mcv.isRegister())
                                 src_mcv.getReg().?
@@ -14389,7 +15040,7 @@ fn airReduce(self: *Self, inst: Air.Inst.Index) !void {
             try self.spillEflagsIfOccupied();
 
             const operand_mcv = try self.resolveInst(reduce.operand);
-            const mask_len = (std.math.cast(u6, operand_ty.vectorLen(mod)) orelse
+            const mask_len = (math.cast(u6, operand_ty.vectorLen(mod)) orelse
                 return self.fail("TODO implement airReduce for {}", .{operand_ty.fmt(mod)}));
             const mask = (@as(u64, 1) << mask_len) - 1;
             const abi_size: u32 = @intCast(operand_ty.abiSize(mod));
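
The mask computed above keeps only the vector's live bits before the test instruction, since bool vectors are bit-packed here. The source-level operation it implements, roughly (a sketch):

    const std = @import("std");

    test "reduce Or over a bool vector" {
        var v: @Vector(4, bool) = .{ false, true, false, false };
        try std.testing.expect(@reduce(.Or, v));
    }
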
@@ -14397,7 +15048,7 @@ fn airReduce(self: *Self, inst: Air.Inst.Index) !void {
                 .Or => {
                     if (operand_mcv.isMemory()) try self.asmMemoryImmediate(
                         .{ ._, .@"test" },
-                        operand_mcv.mem(Memory.Size.fromSize(abi_size)),
+                        try operand_mcv.mem(self, Memory.Size.fromSize(abi_size)),
                         Immediate.u(mask),
                     ) else {
                         const operand_reg = registerAlias(if (operand_mcv.isRegister())
@@ -14445,8 +15096,7 @@ fn airAggregateInit(self: *Self, inst: Air.Inst.Index) !void {
     const result: MCValue = result: {
         switch (result_ty.zigTypeTag(mod)) {
             .Struct => {
-                const frame_index =
-                    try self.allocFrameIndex(FrameAlloc.initType(result_ty, mod));
+                const frame_index = try self.allocFrameIndex(FrameAlloc.initSpill(result_ty, mod));
                 if (result_ty.containerLayout(mod) == .Packed) {
                     const struct_type = mod.typeToStruct(result_ty).?;
                     try self.genInlineMemset(
@@ -14542,8 +15192,7 @@ fn airAggregateInit(self: *Self, inst: Air.Inst.Index) !void {
                 break :result .{ .load_frame = .{ .index = frame_index } };
             },
             .Array => {
-                const frame_index =
-                    try self.allocFrameIndex(FrameAlloc.initType(result_ty, mod));
+                const frame_index = try self.allocFrameIndex(FrameAlloc.initSpill(result_ty, mod));
                 const elem_ty = result_ty.childType(mod);
                 const elem_size: u32 = @intCast(elem_ty.abiSize(mod));
 
@@ -14789,7 +15438,7 @@ fn airMulAdd(self: *Self, inst: Air.Inst.Index) !void {
             mir_tag,
             mop1_reg,
             mop2_reg,
-            mops[2].mem(Memory.Size.fromSize(abi_size)),
+            try mops[2].mem(self, Memory.Size.fromSize(abi_size)),
         );
         break :result mops[0];
     };
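
This is the memory-operand form of fused multiply-add; at the source level it is @mulAdd, e.g. (a sketch, not the commit's test):

    const std = @import("std");

    test "fused multiply-add" {
        var a: f64 = 2.0;
        const r = @mulAdd(f64, a, 3.0, 4.0); // 2 * 3 + 4 with a single rounding
        try std.testing.expectEqual(@as(f64, 10.0), r);
    }
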
@@ -14807,7 +15456,7 @@ fn airVaStart(self: *Self, inst: Air.Inst.Index) !void {
     )) {
         .SysV => result: {
             const info = self.va_info.sysv;
-            const dst_fi = try self.allocFrameIndex(FrameAlloc.initType(va_list_ty, mod));
+            const dst_fi = try self.allocFrameIndex(FrameAlloc.initSpill(va_list_ty, mod));
             var field_off: u31 = 0;
             // gp_offset: c_uint,
             try self.genSetMem(
@@ -15015,7 +15664,7 @@ fn airVaArg(self: *Self, inst: Air.Inst.Index) !void {
                 .{ .v_ss, .cvtsd2 },
                 dst_reg,
                 dst_reg,
-                promote_mcv.mem(.qword),
+                try promote_mcv.mem(self, .qword),
             ) else try self.asmRegisterRegisterRegister(
                 .{ .v_ss, .cvtsd2 },
                 dst_reg,
@@ -15027,7 +15676,7 @@ fn airVaArg(self: *Self, inst: Air.Inst.Index) !void {
             ) else if (promote_mcv.isMemory()) try self.asmRegisterMemory(
                 .{ ._ss, .cvtsd2 },
                 dst_reg,
-                promote_mcv.mem(.qword),
+                try promote_mcv.mem(self, .qword),
             ) else try self.asmRegisterRegister(
                 .{ ._ss, .cvtsd2 },
                 dst_reg,
@@ -15473,6 +16122,33 @@ fn memSize(self: *Self, ty: Type) Memory.Size {
     };
 }
 
+fn splitType(self: *Self, ty: Type) ![2]Type {
+    const mod = self.bin_file.options.module.?;
+    const classes = mem.sliceTo(&abi.classifySystemV(ty, mod, .other), .none);
+    var parts: [2]Type = undefined;
+    if (classes.len == 2) for (&parts, classes, 0..) |*part, class, part_i| {
+        part.* = switch (class) {
+            .integer => switch (part_i) {
+                0 => Type.u64,
+                1 => part: {
+                    const elem_size = ty.abiAlignment(mod).minStrict(.@"8").toByteUnitsOptional().?;
+                    const elem_ty = try mod.intType(.unsigned, @intCast(elem_size * 8));
+                    break :part switch (@divExact(ty.abiSize(mod) - 8, elem_size)) {
+                        1 => elem_ty,
+                        else => |len| try mod.arrayType(.{ .len = len, .child = elem_ty.toIntern() }),
+                    };
+                },
+                else => unreachable,
+            },
+            .float => Type.f32,
+            .float_combine => try mod.vectorType(.{ .len = 2, .child = .f32_type }),
+            .sse => Type.f64,
+            else => break,
+        };
+    } else if (parts[0].abiSize(mod) + parts[1].abiSize(mod) == ty.abiSize(mod)) return parts;
+    return self.fail("TODO implement splitType for {}", .{ty.fmt(mod)});
+}
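
For illustration (not from the commit): a 12-byte, 4-aligned aggregate classifies as .{ .integer, .integer }; splitType then yields part 0 = u64 and part 1 = u32, the element size being capped by the 4-byte alignment, so 8 + 4 satisfies the ABI-size check above.

    // Passed by value under SysV, this splits into a u64 and a u32 part:
    const Triple = extern struct { x: u32, y: u32, z: u32 };
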
+
 /// Truncates the value in the register in place.
 /// Clobbers any remaining bits.
 fn truncateRegister(self: *Self, ty: Type, reg: Register) !void {
src/arch/x86_64/Encoding.zig
@@ -410,6 +410,8 @@ pub const Mnemonic = enum {
     vfmadd132ps, vfmadd213ps, vfmadd231ps,
     vfmadd132sd, vfmadd213sd, vfmadd231sd,
     vfmadd132ss, vfmadd213ss, vfmadd231ss,
+    // AVX2
+    vpbroadcastb, vpbroadcastd, vpbroadcasti128, vpbroadcastq, vpbroadcastw,
     // zig fmt: on
 };
 
@@ -444,7 +446,7 @@ pub const Op = enum {
     moffs,
     sreg,
     st, mm, mm_m64,
-    xmm0, xmm, xmm_m32, xmm_m64, xmm_m128,
+    xmm0, xmm, xmm_m8, xmm_m16, xmm_m32, xmm_m64, xmm_m128,
     ymm, ymm_m256,
     // zig fmt: on
 
@@ -534,7 +536,7 @@ pub const Op = enum {
             .eax, .r32, .rm32, .r32_m16 => unreachable,
             .rax, .r64, .rm64, .r64_m16 => unreachable,
             .st, .mm, .mm_m64 => unreachable,
-            .xmm0, .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => unreachable,
+            .xmm0, .xmm, .xmm_m8, .xmm_m16, .xmm_m32, .xmm_m64, .xmm_m128 => unreachable,
             .ymm, .ymm_m256 => unreachable,
             .m8, .m16, .m32, .m64, .m80, .m128, .m256 => unreachable,
             .unity => 1,
@@ -556,7 +558,7 @@ pub const Op = enum {
             .eax, .r32, .rm32, .r32_m8, .r32_m16 => 32,
             .rax, .r64, .rm64, .r64_m16, .mm, .mm_m64 => 64,
             .st => 80,
-            .xmm0, .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => 128,
+            .xmm0, .xmm, .xmm_m8, .xmm_m16, .xmm_m32, .xmm_m64, .xmm_m128 => 128,
             .ymm, .ymm_m256 => 256,
         };
     }
@@ -568,8 +570,8 @@ pub const Op = enum {
             .rel8, .rel16, .rel32 => unreachable,
             .al, .cl, .r8, .ax, .r16, .eax, .r32, .rax, .r64 => unreachable,
             .st, .mm, .xmm0, .xmm, .ymm => unreachable,
-            .m8, .rm8, .r32_m8 => 8,
-            .m16, .rm16, .r32_m16, .r64_m16 => 16,
+            .m8, .rm8, .r32_m8, .xmm_m8 => 8,
+            .m16, .rm16, .r32_m16, .r64_m16, .xmm_m16 => 16,
             .m32, .rm32, .xmm_m32 => 32,
             .m64, .rm64, .mm_m64, .xmm_m64 => 64,
             .m80 => 80,
@@ -600,7 +602,7 @@ pub const Op = enum {
             .rm8, .rm16, .rm32, .rm64,
             .r32_m8, .r32_m16, .r64_m16,
             .st, .mm, .mm_m64,
-            .xmm0, .xmm, .xmm_m32, .xmm_m64, .xmm_m128,
+            .xmm0, .xmm, .xmm_m8, .xmm_m16, .xmm_m32, .xmm_m64, .xmm_m128,
             .ymm, .ymm_m256,
             => true,
             else => false,
@@ -629,7 +631,7 @@ pub const Op = enum {
             .m8, .m16, .m32, .m64, .m80, .m128, .m256,
             .m,
             .mm_m64,
-            .xmm_m32, .xmm_m64, .xmm_m128,
+            .xmm_m8, .xmm_m16, .xmm_m32, .xmm_m64, .xmm_m128,
             .ymm_m256,
             => true,
             else => false,
@@ -654,7 +656,7 @@ pub const Op = enum {
             .sreg => .segment,
             .st => .x87,
             .mm, .mm_m64 => .mmx,
-            .xmm0, .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => .sse,
+            .xmm0, .xmm, .xmm_m8, .xmm_m16, .xmm_m32, .xmm_m64, .xmm_m128 => .sse,
             .ymm, .ymm_m256 => .sse,
         };
     }
src/arch/x86_64/encodings.zig
@@ -1742,6 +1742,16 @@ pub const table = [_]Entry{
 
     .{ .vpandn, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_256_wig, .avx2 },
 
+    .{ .vpbroadcastb,    .rm, &.{ .xmm, .xmm_m8  }, &.{ 0x66, 0x0f, 0x38, 0x78 }, 0, .vex_128_w0, .avx2 },
+    .{ .vpbroadcastb,    .rm, &.{ .ymm, .xmm_m8  }, &.{ 0x66, 0x0f, 0x38, 0x78 }, 0, .vex_256_w0, .avx2 },
+    .{ .vpbroadcastw,    .rm, &.{ .xmm, .xmm_m16 }, &.{ 0x66, 0x0f, 0x38, 0x79 }, 0, .vex_128_w0, .avx2 },
+    .{ .vpbroadcastw,    .rm, &.{ .ymm, .xmm_m16 }, &.{ 0x66, 0x0f, 0x38, 0x79 }, 0, .vex_256_w0, .avx2 },
+    .{ .vpbroadcastd,    .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x58 }, 0, .vex_128_w0, .avx2 },
+    .{ .vpbroadcastd,    .rm, &.{ .ymm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x58 }, 0, .vex_256_w0, .avx2 },
+    .{ .vpbroadcastq,    .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x59 }, 0, .vex_128_w0, .avx2 },
+    .{ .vpbroadcastq,    .rm, &.{ .ymm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x59 }, 0, .vex_256_w0, .avx2 },
+    .{ .vpbroadcasti128, .rm, &.{ .ymm, .m128    }, &.{ 0x66, 0x0f, 0x38, 0x5a }, 0, .vex_256_w0, .avx2 },
+
     .{ .vpcmpeqb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x74 }, 0, .vex_256_wig, .avx2 },
     .{ .vpcmpeqw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x75 }, 0, .vex_256_wig, .avx2 },
     .{ .vpcmpeqd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x76 }, 0, .vex_256_wig, .avx2 },
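
With these entries, the AVX2 airSplat path above can emit a single vpbroadcastd for a dword splat; a source-level sketch that would select it on an avx2 target (not from the commit):

    const std = @import("std");

    test "splat of a runtime u32 becomes a broadcast" {
        var x: u32 = 7;
        const v: @Vector(8, u32) = @splat(x);
        try std.testing.expectEqual(@as(u32, 7), v[5]);
    }
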
src/arch/x86_64/Mir.zig
@@ -255,6 +255,8 @@ pub const Inst = struct {
         vp_q,
         /// VEX-Encoded Packed ___ Double Quadword
         vp_dq,
+        /// VEX-Encoded Packed ___ Integer Data
+        vp_i128,
         /// VEX-Encoded ___ Scalar Single-Precision Values
         v_ss,
         /// VEX-Encoded ___ Packed Single-Precision Values
src/link/Coff.zig
@@ -388,6 +388,7 @@ fn populateMissingMetadata(self: *Coff) !void {
         self.rdata_section_index = try self.allocateSection(".rdata", file_size, .{
             .CNT_INITIALIZED_DATA = 1,
             .MEM_READ = 1,
+            .MEM_WRITE = 1,
         });
     }
 
src/codegen.zig
@@ -376,7 +376,10 @@ pub fn generateSymbol(
                             .val = switch (aggregate.storage) {
                                 .bytes => unreachable,
                                 .elems => |elems| elems[@as(usize, @intCast(index))],
-                                .repeated_elem => |elem| elem,
+                                .repeated_elem => |elem| if (index < array_type.len)
+                                    elem
+                                else
+                                    array_type.sentinel,
                             }.toValue(),
                         }, code, debug_output, reloc_info)) {
                             .ok => {},
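
The fix above covers constant sentinel-terminated arrays stored as a repeated element: generateSymbol walks one index past the array length, and that last index must read the sentinel rather than the repeated value, e.g. (an illustration):

    // With .repeated_elem storage, indexes 0..2 emit 7 and index 3 emits the 0 sentinel.
    const msg: [3:0]u8 = .{ 7, 7, 7 };
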
src/Compilation.zig
@@ -1121,7 +1121,9 @@ pub fn create(gpa: Allocator, options: InitOptions) !*Compilation {
         const include_compiler_rt = options.want_compiler_rt orelse needs_c_symbols;
 
         const must_single_thread = target_util.isSingleThreaded(options.target);
-        const single_threaded = options.single_threaded orelse must_single_thread;
+        const single_threaded = options.single_threaded orelse must_single_thread or
+            // x86_64 codegen doesn't support TLV for most object formats
+            (!use_llvm and options.target.cpu.arch == .x86_64 and options.target.ofmt != .macho);
         if (must_single_thread and !single_threaded) {
             return error.TargetRequiresSingleThreaded;
         }
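
Under this default, threadlocal declarations still compile on the self-hosted x86_64 backend because single-threaded builds lower them to plain globals; a sketch (not from the commit):

    threadlocal var counter: u32 = 0;

    fn bump() u32 {
        // With builtin.single_threaded, this is an ordinary global access,
        // so no TLV relocation support is required from the object format.
        counter += 1;
        return counter;
    }
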
test/behavior/vector.zig
@@ -1260,7 +1260,6 @@ test "zero multiplicand" {
 
 test "@intCast to u0" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
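
The re-enabled behavior test exercises @intCast down to u0 on vectors; it has roughly this shape (a sketch, not the test's actual body):

    const std = @import("std");

    test "intCast a vector to u0" {
        var v: @Vector(2, u8) = .{ 0, 0 };
        const w: @Vector(2, u0) = @intCast(v);
        try std.testing.expectEqual(@as(u0, 0), w[0]);
    }
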