Commit 228c956377

Jacob Young <jacobly0@users.noreply.github.com>
2023-07-31 07:56:43
std: finish cleaning up asm
This also required implementing the necessary syntax in the x86_64 backend.
1 parent 2ba787e
lib/compiler_rt/arm.zig
@@ -9,7 +9,7 @@ pub const panic = common.panic;
 
 comptime {
     if (!builtin.is_test) {
-        if (arch.isARM() or arch.isThumb()) {
+        if (arch.isArmOrThumb()) {
             @export(__aeabi_unwind_cpp_pr0, .{ .name = "__aeabi_unwind_cpp_pr0", .linkage = common.linkage, .visibility = common.visibility });
             @export(__aeabi_unwind_cpp_pr1, .{ .name = "__aeabi_unwind_cpp_pr1", .linkage = common.linkage, .visibility = common.visibility });
             @export(__aeabi_unwind_cpp_pr2, .{ .name = "__aeabi_unwind_cpp_pr2", .linkage = common.linkage, .visibility = common.visibility });
lib/compiler_rt/clzsi2_test.zig
@@ -268,7 +268,7 @@ test "clzsi2" {
     try test__clzsi2(0xFE000000, 0);
     try test__clzsi2(0xFF000000, 0);
     // arm and thumb1 assume input a != 0
-    if (!builtin.cpu.arch.isARM() and !builtin.cpu.arch.isThumb())
+    if (!builtin.cpu.arch.isArmOrThumb())
         try test__clzsi2(0x00000000, 32);
     try test__clzsi2(0x00000001, 31);
     try test__clzsi2(0x00000002, 30);
lib/std/os/linux/arm-eabi.zig
@@ -103,44 +103,40 @@ const CloneFn = *const fn (arg: usize) callconv(.C) u8;
 /// This matches the libc clone function.
 pub extern fn clone(func: CloneFn, stack: usize, flags: u32, arg: usize, ptid: *i32, tls: usize, ctid: *i32) usize;
 
-pub fn restore() callconv(.Naked) void {
-    if (@import("builtin").zig_backend == .stage2_c) {
-        asm volatile (
+pub fn restore() callconv(.Naked) noreturn {
+    switch (@import("builtin").zig_backend) {
+        .stage2_c => asm volatile (
             \\ mov r7, %[number]
             \\ svc #0
-            \\ bx lr
             :
-            : [number] "i" (@intFromEnum(SYS.sigreturn)),
+            : [number] "I" (@intFromEnum(SYS.sigreturn)),
             : "memory"
-        );
-        unreachable;
+        ),
+        else => asm volatile (
+            \\ svc #0
+            :
+            : [number] "{r7}" (@intFromEnum(SYS.sigreturn)),
+            : "memory"
+        ),
     }
-
-    asm volatile ("svc #0"
-        :
-        : [number] "{r7}" (@intFromEnum(SYS.sigreturn)),
-        : "memory"
-    );
 }
 
-pub fn restore_rt() callconv(.Naked) void {
-    if (@import("builtin").zig_backend == .stage2_c) {
-        asm volatile (
+pub fn restore_rt() callconv(.Naked) noreturn {
+    switch (@import("builtin").zig_backend) {
+        .stage2_c => asm volatile (
             \\ mov r7, %[number]
             \\ svc #0
-            \\ bx lr
             :
-            : [number] "i" (@intFromEnum(SYS.rt_sigreturn)),
+            : [number] "I" (@intFromEnum(SYS.rt_sigreturn)),
             : "memory"
-        );
-        unreachable;
+        ),
+        else => asm volatile (
+            \\ svc #0
+            :
+            : [number] "{r7}" (@intFromEnum(SYS.rt_sigreturn)),
+            : "memory"
+        ),
     }
-
-    asm volatile ("svc #0"
-        :
-        : [number] "{r7}" (@intFromEnum(SYS.rt_sigreturn)),
-        : "memory"
-    );
 }
 
 pub const MMAP2_UNIT = 4096;
lib/std/os/linux/arm64.zig
@@ -105,25 +105,22 @@ pub extern fn clone(func: CloneFn, stack: usize, flags: u32, arg: usize, ptid: *
 
 pub const restore = restore_rt;
 
-pub fn restore_rt() callconv(.Naked) void {
-    if (@import("builtin").zig_backend == .stage2_c) {
-        asm volatile (
+pub fn restore_rt() callconv(.Naked) noreturn {
+    switch (@import("builtin").zig_backend) {
+        .stage2_c => asm volatile (
             \\ mov x8, %[number]
             \\ svc #0
-            \\ ret
             :
             : [number] "i" (@intFromEnum(SYS.rt_sigreturn)),
             : "memory", "cc"
-        );
-        unreachable;
+        ),
+        else => asm volatile (
+            \\ svc #0
+            :
+            : [number] "{x8}" (@intFromEnum(SYS.rt_sigreturn)),
+            : "memory", "cc"
+        ),
     }
-
-    asm volatile (
-        \\ svc #0
-        :
-        : [number] "{x8}" (@intFromEnum(SYS.rt_sigreturn)),
-        : "memory", "cc"
-    );
 }
 
 pub const O = struct {
lib/std/os/linux/mips.zig
@@ -195,16 +195,18 @@ const CloneFn = *const fn (arg: usize) callconv(.C) u8;
 /// This matches the libc clone function.
 pub extern fn clone(func: CloneFn, stack: usize, flags: u32, arg: usize, ptid: *i32, tls: usize, ctid: *i32) usize;
 
-pub fn restore() callconv(.Naked) void {
-    return asm volatile ("syscall"
+pub fn restore() callconv(.Naked) noreturn {
+    asm volatile (
+        \\ syscall
         :
         : [number] "{$2}" (@intFromEnum(SYS.sigreturn)),
         : "$1", "$3", "$4", "$5", "$6", "$7", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", "$25", "hi", "lo", "memory"
     );
 }
 
-pub fn restore_rt() callconv(.Naked) void {
-    return asm volatile ("syscall"
+pub fn restore_rt() callconv(.Naked) noreturn {
+    asm volatile (
+        \\ syscall
         :
         : [number] "{$2}" (@intFromEnum(SYS.rt_sigreturn)),
         : "$1", "$3", "$4", "$5", "$6", "$7", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", "$25", "hi", "lo", "memory"
lib/std/os/linux/mips64.zig
@@ -180,16 +180,18 @@ const CloneFn = *const fn (arg: usize) callconv(.C) u8;
 /// This matches the libc clone function.
 pub extern fn clone(func: CloneFn, stack: usize, flags: u32, arg: usize, ptid: *i32, tls: usize, ctid: *i32) usize;
 
-pub fn restore() callconv(.Naked) void {
-    return asm volatile ("syscall"
+pub fn restore() callconv(.Naked) noreturn {
+    asm volatile (
+        \\ syscall
         :
         : [number] "{$2}" (@intFromEnum(SYS.rt_sigreturn)),
         : "$1", "$3", "$4", "$5", "$6", "$7", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", "$25", "hi", "lo", "memory"
     );
 }
 
-pub fn restore_rt() callconv(.Naked) void {
-    return asm volatile ("syscall"
+pub fn restore_rt() callconv(.Naked) noreturn {
+    asm volatile (
+        \\ syscall
         :
         : [number] "{$2}" (@intFromEnum(SYS.rt_sigreturn)),
         : "$1", "$3", "$4", "$5", "$6", "$7", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", "$25", "hi", "lo", "memory"
lib/std/os/linux/powerpc.zig
@@ -133,8 +133,9 @@ pub extern fn clone(func: CloneFn, stack: usize, flags: usize, arg: usize, ptid:
 
 pub const restore = restore_rt;
 
-pub fn restore_rt() callconv(.Naked) void {
-    return asm volatile ("sc"
+pub fn restore_rt() callconv(.Naked) noreturn {
+    asm volatile (
+        \\ sc
         :
         : [number] "{r0}" (@intFromEnum(SYS.rt_sigreturn)),
         : "memory", "cr0", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12"
lib/std/os/linux/powerpc64.zig
@@ -133,8 +133,9 @@ pub extern fn clone(func: CloneFn, stack: usize, flags: usize, arg: usize, ptid:
 
 pub const restore = restore_rt;
 
-pub fn restore_rt() callconv(.Naked) void {
-    return asm volatile ("sc"
+pub fn restore_rt() callconv(.Naked) noreturn {
+    asm volatile (
+        \\ sc
         :
         : [number] "{r0}" (@intFromEnum(SYS.rt_sigreturn)),
         : "memory", "cr0", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12"
lib/std/os/linux/riscv64.zig
@@ -101,8 +101,9 @@ pub extern fn clone(func: CloneFn, stack: usize, flags: u32, arg: usize, ptid: *
 
 pub const restore = restore_rt;
 
-pub fn restore_rt() callconv(.Naked) void {
-    return asm volatile ("ecall"
+pub fn restore_rt() callconv(.Naked) noreturn {
+    asm volatile (
+        \\ ecall
         :
         : [number] "{x17}" (@intFromEnum(SYS.rt_sigreturn)),
         : "memory"
lib/std/os/linux/thumb.zig
@@ -141,8 +141,8 @@ pub fn syscall6(
     );
 }
 
-pub fn restore() callconv(.Naked) void {
-    return asm volatile (
+pub fn restore() callconv(.Naked) noreturn {
+    asm volatile (
         \\ mov r7, %[number]
         \\ svc #0
         :
@@ -150,8 +150,8 @@ pub fn restore() callconv(.Naked) void {
     );
 }
 
-pub fn restore_rt() callconv(.Naked) void {
-    return asm volatile (
+pub fn restore_rt() callconv(.Naked) noreturn {
+    asm volatile (
         \\ mov r7, %[number]
         \\ svc #0
         :
lib/std/os/linux/tls.zig
@@ -48,7 +48,7 @@ const TLSVariant = enum {
 };
 
 const tls_variant = switch (native_arch) {
-    .arm, .armeb, .thumb, .aarch64, .aarch64_be, .riscv32, .riscv64, .mips, .mipsel, .mips64, .mips64el, .powerpc, .powerpc64, .powerpc64le => TLSVariant.VariantI,
+    .arm, .armeb, .thumb, .aarch64, .aarch64_be, .riscv32, .riscv64, .mips, .mipsel, .mips64, .mips64el, .powerpc, .powerpcle, .powerpc64, .powerpc64le => TLSVariant.VariantI,
     .x86_64, .x86, .sparc64 => TLSVariant.VariantII,
     else => @compileError("undefined tls_variant for this architecture"),
 };
@@ -140,7 +140,7 @@ pub fn setThreadPointer(addr: usize) void {
             const rc = std.os.linux.syscall2(.arch_prctl, std.os.linux.ARCH.SET_FS, addr);
             assert(rc == 0);
         },
-        .aarch64 => {
+        .aarch64, .aarch64_be => {
             asm volatile (
                 \\ msr tpidr_el0, %[addr]
                 :
@@ -162,7 +162,7 @@ pub fn setThreadPointer(addr: usize) void {
             const rc = std.os.linux.syscall1(.set_thread_area, addr);
             assert(rc == 0);
         },
-        .powerpc => {
+        .powerpc, .powerpcle => {
             asm volatile (
                 \\ mr 2, %[addr]
                 :
lib/std/os/linux/x86.zig
@@ -123,46 +123,40 @@ const CloneFn = *const fn (arg: usize) callconv(.C) u8;
 /// This matches the libc clone function.
 pub extern fn clone(func: CloneFn, stack: usize, flags: u32, arg: usize, ptid: *i32, tls: usize, ctid: *i32) usize;
 
-pub fn restore() callconv(.Naked) void {
-    if (@import("builtin").zig_backend == .stage2_c) {
-        asm volatile (
+pub fn restore() callconv(.Naked) noreturn {
+    switch (@import("builtin").zig_backend) {
+        .stage2_c => asm volatile (
             \\ movl %[number], %%eax
             \\ int $0x80
-            \\ retl
             :
             : [number] "i" (@intFromEnum(SYS.sigreturn)),
             : "memory"
-        );
-        unreachable;
+        ),
+        else => asm volatile (
+            \\ int $0x80
+            :
+            : [number] "{eax}" (@intFromEnum(SYS.sigreturn)),
+            : "memory"
+        ),
     }
-
-    asm volatile (
-        \\ int $0x80
-        :
-        : [number] "{eax}" (@intFromEnum(SYS.sigreturn)),
-        : "memory"
-    );
 }
 
-pub fn restore_rt() callconv(.Naked) void {
-    if (@import("builtin").zig_backend == .stage2_c) {
-        asm volatile (
+pub fn restore_rt() callconv(.Naked) noreturn {
+    switch (@import("builtin").zig_backend) {
+        .stage2_c => asm volatile (
             \\ movl %[number], %%eax
             \\ int $0x80
-            \\ retl
             :
             : [number] "i" (@intFromEnum(SYS.rt_sigreturn)),
             : "memory"
-        );
-        unreachable;
+        ),
+        else => asm volatile (
+            \\ int $0x80
+            :
+            : [number] "{eax}" (@intFromEnum(SYS.rt_sigreturn)),
+            : "memory"
+        ),
     }
-
-    asm volatile (
-        \\ int $0x80
-        :
-        : [number] "{eax}" (@intFromEnum(SYS.rt_sigreturn)),
-        : "memory"
-    );
 }
 
 pub const O = struct {
@@ -401,7 +395,7 @@ noinline fn getContextReturnAddress() usize {
     return @returnAddress();
 }
 
-pub fn getContextInternal() callconv(.Naked) void {
+pub fn getContextInternal() callconv(.Naked) usize {
     asm volatile (
         \\ movl $0, %[flags_offset:c](%%edx)
         \\ movl $0, %[link_offset:c](%%edx)
@@ -439,6 +433,7 @@ pub fn getContextInternal() callconv(.Naked) void {
         \\0:
         \\ popl %%esi
         \\ popl %%ebx
+        \\ retl
         :
         : [flags_offset] "i" (@offsetOf(ucontext_t, "flags")),
           [link_offset] "i" (@offsetOf(ucontext_t, "link")),
@@ -466,7 +461,7 @@ pub fn getContextInternal() callconv(.Naked) void {
 pub inline fn getcontext(context: *ucontext_t) usize {
     // This method is used so that getContextInternal can control
     // its prologue in order to read ESP from a constant offset.
-    // The unused &getContextInternal input is required so the function is included in the binary.
+    // An aligned stack is not needed for getContextInternal.
     var clobber_edx: usize = undefined;
     return asm volatile (
         \\ calll %[getContextInternal:P]
lib/std/os/linux/x86_64.zig
@@ -107,25 +107,22 @@ pub extern fn clone(func: CloneFn, stack: usize, flags: usize, arg: usize, ptid:
 
 pub const restore = restore_rt;
 
-pub fn restore_rt() callconv(.Naked) void {
-    if (@import("builtin").zig_backend == .stage2_c) {
-        asm volatile (
+pub fn restore_rt() callconv(.Naked) noreturn {
+    switch (@import("builtin").zig_backend) {
+        .stage2_c => asm volatile (
             \\ movl %[number], %%eax
             \\ syscall
-            \\ retq
             :
             : [number] "i" (@intFromEnum(SYS.rt_sigreturn)),
             : "rcx", "r11", "memory"
-        );
-        unreachable;
+        ),
+        else => asm volatile (
+            \\ syscall
+            :
+            : [number] "{rax}" (@intFromEnum(SYS.rt_sigreturn)),
+            : "rcx", "r11", "memory"
+        ),
     }
-
-    asm volatile (
-        \\ syscall
-        :
-        : [number] "{rax}" (@intFromEnum(SYS.rt_sigreturn)),
-        : "rcx", "r11", "memory"
-    );
 }
 
 pub const mode_t = usize;
@@ -400,7 +397,7 @@ fn gpRegisterOffset(comptime reg_index: comptime_int) usize {
     return @offsetOf(ucontext_t, "mcontext") + @offsetOf(mcontext_t, "gregs") + @sizeOf(usize) * reg_index;
 }
 
-fn getContextInternal() callconv(.Naked) void {
+fn getContextInternal() callconv(.Naked) usize {
     // TODO: Read GS/FS registers?
     asm volatile (
         \\ movq $0, %[flags_offset:c](%%rdi)
@@ -444,6 +441,7 @@ fn getContextInternal() callconv(.Naked) void {
         \\ movl %[sigset_size], %%r10d
         \\ syscall
         \\0:
+        \\ retq
         :
         : [flags_offset] "i" (@offsetOf(ucontext_t, "flags")),
           [link_offset] "i" (@offsetOf(ucontext_t, "link")),
@@ -480,7 +478,7 @@ fn getContextInternal() callconv(.Naked) void {
 pub inline fn getcontext(context: *ucontext_t) usize {
     // This method is used so that getContextInternal can control
     // its prologue in order to read RSP from a constant offset
-    // The unused &getContextInternal input is required so the function is included in the binary.
+    // An aligned stack is not needed for getContextInternal.
     var clobber_rdi: usize = undefined;
     return asm volatile (
         \\ callq %[getContextInternal:P]
lib/std/os/linux.zig
@@ -35,13 +35,13 @@ const syscall_bits = switch (native_arch) {
 const arch_bits = switch (native_arch) {
     .x86 => @import("linux/x86.zig"),
     .x86_64 => @import("linux/x86_64.zig"),
-    .aarch64 => @import("linux/arm64.zig"),
+    .aarch64, .aarch64_be => @import("linux/arm64.zig"),
     .arm, .thumb => @import("linux/arm-eabi.zig"),
     .riscv64 => @import("linux/riscv64.zig"),
     .sparc64 => @import("linux/sparc64.zig"),
     .mips, .mipsel => @import("linux/mips.zig"),
     .mips64, .mips64el => @import("linux/mips64.zig"),
-    .powerpc => @import("linux/powerpc.zig"),
+    .powerpc, .powerpcle => @import("linux/powerpc.zig"),
     .powerpc64, .powerpc64le => @import("linux/powerpc64.zig"),
     else => struct {},
 };
@@ -98,13 +98,13 @@ pub const syscalls = @import("linux/syscalls.zig");
 pub const SYS = switch (@import("builtin").cpu.arch) {
     .x86 => syscalls.X86,
     .x86_64 => syscalls.X64,
-    .aarch64 => syscalls.Arm64,
+    .aarch64, .aarch64_be => syscalls.Arm64,
     .arm, .thumb => syscalls.Arm,
     .riscv64 => syscalls.RiscV64,
     .sparc64 => syscalls.Sparc64,
     .mips, .mipsel => syscalls.Mips,
     .mips64, .mips64el => syscalls.Mips64,
-    .powerpc => syscalls.PowerPC,
+    .powerpc, .powerpcle => syscalls.PowerPC,
     .powerpc64, .powerpc64le => syscalls.PowerPC64,
     else => @compileError("The Zig Standard Library is missing syscall definitions for the target CPU architecture"),
 };
@@ -1176,14 +1176,12 @@ pub fn sigaction(sig: u6, noalias act: ?*const Sigaction, noalias oact: ?*Sigact
     const mask_size = @sizeOf(@TypeOf(ksa.mask));
 
     if (act) |new| {
-        const restore_rt_ptr = &restore_rt;
-        const restore_ptr = &restore;
-        const restorer_fn = if ((new.flags & SA.SIGINFO) != 0) restore_rt_ptr else restore_ptr;
+        const restorer_fn = if ((new.flags & SA.SIGINFO) != 0) &restore_rt else &restore;
         ksa = k_sigaction{
             .handler = new.handler.handler,
             .flags = new.flags | SA.RESTORER,
             .mask = undefined,
-            .restorer = @as(k_sigaction_funcs.restorer, @ptrCast(restorer_fn)),
+            .restorer = @ptrCast(restorer_fn),
         };
         @memcpy(@as([*]u8, @ptrCast(&ksa.mask))[0..mask_size], @as([*]const u8, @ptrCast(&new.mask)));
     }
lib/std/builtin.zig
@@ -568,7 +568,7 @@ pub const VaListX86_64 = extern struct {
 /// This data structure is used by the Zig language code generation and
 /// therefore must be kept in sync with the compiler implementation.
 pub const VaList = switch (builtin.cpu.arch) {
-    .aarch64 => switch (builtin.os.tag) {
+    .aarch64, .aarch64_be => switch (builtin.os.tag) {
         .windows => *u8,
         .ios, .macos, .tvos, .watchos => *u8,
         else => @compileError("disabled due to miscompilations"), // VaListAarch64,
lib/std/start.zig
@@ -101,7 +101,7 @@ fn main2() callconv(.C) c_int {
     return 0;
 }
 
-fn _start2() callconv(.Naked) noreturn {
+fn _start2() noreturn {
     callMain2();
 }
 
@@ -254,116 +254,92 @@ fn EfiMain(handle: uefi.Handle, system_table: *uefi.tables.SystemTable) callconv
 }
 
 fn _start() callconv(.Naked) noreturn {
-    switch (builtin.zig_backend) {
-        .stage2_c => {
-            asm volatile (switch (native_arch) {
-                    .x86_64 =>
-                    \\ xorl %%ebp, %%ebp
-                    \\ movq %%rsp, %[argc_argv_ptr]
-                    \\ andq $-16, %%rsp
-                    \\ callq %[posixCallMainAndExit:P]
-                    ,
-                    .x86 =>
-                    \\ xorl %%ebp, %%ebp
-                    \\ movl %%esp, %[argc_argv_ptr]
-                    \\ andl $-16, %%esp
-                    \\ calll %[posixCallMainAndExit:P]
-                    ,
-                    .aarch64, .aarch64_be =>
-                    \\ mov fp, #0
-                    \\ mov lr, #0
-                    \\ mov x0, sp
-                    \\ str x0, %[argc_argv_ptr]
-                    \\ b %[posixCallMainAndExit]
-                    ,
-                    .arm, .armeb, .thumb =>
-                    \\ mov fp, #0
-                    \\ mov lr, #0
-                    \\ str sp, %[argc_argv_ptr]
-                    \\ and sp, #-16
-                    \\ b %[posixCallMainAndExit]
-                    ,
-                    else => @compileError("unsupported arch"),
-                }
-                : [argc_argv_ptr] "=m" (argc_argv_ptr),
-                : [posixCallMainAndExit] "X" (&posixCallMainAndExit),
-            );
-            unreachable;
-        },
-        else => switch (native_arch) {
-            .x86_64 => {
-                argc_argv_ptr = asm volatile (
-                    \\ xor %%ebp, %%ebp
-                    : [argc] "={rsp}" (-> [*]usize),
-                );
-            },
-            .x86 => {
-                argc_argv_ptr = asm volatile (
-                    \\ xor %%ebp, %%ebp
-                    : [argc] "={esp}" (-> [*]usize),
-                );
-            },
-            .aarch64, .aarch64_be, .arm, .armeb, .thumb => {
-                argc_argv_ptr = asm volatile (
-                    \\ mov fp, #0
-                    \\ mov lr, #0
-                    : [argc] "={sp}" (-> [*]usize),
-                );
-            },
-            .riscv64 => {
-                argc_argv_ptr = asm volatile (
-                    \\ li s0, 0
-                    \\ li ra, 0
-                    : [argc] "={sp}" (-> [*]usize),
-                );
-            },
-            .mips, .mipsel, .mips64, .mips64el => {
-                // The lr is already zeroed on entry, as specified by the ABI.
-                argc_argv_ptr = asm volatile (
-                    \\ move $fp, $0
-                    : [argc] "={sp}" (-> [*]usize),
-                );
-            },
-            .powerpc => {
-                // Setup the initial stack frame and clear the back chain pointer.
-                argc_argv_ptr = asm volatile (
-                    \\ mr 4, 1
-                    \\ li 0, 0
-                    \\ stwu 1,-16(1)
-                    \\ stw 0, 0(1)
-                    \\ mtlr 0
-                    : [argc] "={r4}" (-> [*]usize),
-                    :
-                    : "r0"
-                );
-            },
-            .powerpc64le => {
-                // Setup the initial stack frame and clear the back chain pointer.
-                // TODO: Support powerpc64 (big endian) on ELFv2.
-                argc_argv_ptr = asm volatile (
-                    \\ mr 4, 1
-                    \\ li 0, 0
-                    \\ stdu 0, -32(1)
-                    \\ mtlr 0
-                    : [argc] "={r4}" (-> [*]usize),
-                    :
-                    : "r0"
-                );
-            },
-            .sparc64 => {
-                // argc is stored after a register window (16 registers) plus stack bias
-                argc_argv_ptr = asm (
-                    \\ mov %%g0, %%i6
-                    \\ add %%o6, 2175, %[argc]
-                    : [argc] "=r" (-> [*]usize),
-                );
-            },
+    asm volatile (switch (native_arch) {
+            .x86_64 =>
+            \\ xorl %%ebp, %%ebp
+            \\ movq %%rsp, %[argc_argv_ptr]
+            \\ andq $-16, %%rsp
+            \\ callq %[posixCallMainAndExit:P]
+            ,
+            .x86 =>
+            \\ xorl %%ebp, %%ebp
+            \\ movl %%esp, %[argc_argv_ptr]
+            \\ andl $-16, %%esp
+            \\ calll %[posixCallMainAndExit:P]
+            ,
+            .aarch64, .aarch64_be =>
+            \\ mov fp, #0
+            \\ mov lr, #0
+            \\ mov x0, sp
+            \\ str x0, %[argc_argv_ptr]
+            \\ b %[posixCallMainAndExit]
+            ,
+            .arm, .armeb, .thumb, .thumbeb =>
+            \\ mov fp, #0
+            \\ mov lr, #0
+            \\ str sp, %[argc_argv_ptr]
+            \\ and sp, #-16
+            \\ b %[posixCallMainAndExit]
+            ,
+            .riscv64 =>
+            \\ li s0, 0
+            \\ li ra, 0
+            \\ sd sp, %[argc_argv_ptr]
+            \\ andi sp, sp, -16
+            \\ tail %[posixCallMainAndExit]@plt
+            ,
+            .mips, .mipsel =>
+            // The lr is already zeroed on entry, as specified by the ABI.
+            \\ addiu $fp, $zero, 0
+            \\ sw $sp, %[argc_argv_ptr]
+            \\ .set push
+            \\ .set noat
+            \\ addiu $1, $zero, -16
+            \\ and $sp, $sp, $1
+            \\ .set pop
+            \\ j %[posixCallMainAndExit]
+            ,
+            .mips64, .mips64el =>
+            // The lr is already zeroed on entry, as specified by the ABI.
+            \\ addiu $fp, $zero, 0
+            \\ sd $sp, %[argc_argv_ptr]
+            \\ .set push
+            \\ .set noat
+            \\ daddiu $1, $zero, -16
+            \\ and $sp, $sp, $1
+            \\ .set pop
+            \\ j %[posixCallMainAndExit]
+            ,
+            .powerpc, .powerpcle =>
+            // Setup the initial stack frame and clear the back chain pointer.
+            \\ stw 1, %[argc_argv_ptr]
+            \\ li 0, 0
+            \\ stwu 1, -16(1)
+            \\ stw 0, 0(1)
+            \\ mtlr 0
+            \\ b %[posixCallMainAndExit]
+            ,
+            .powerpc64, .powerpc64le =>
+            // Setup the initial stack frame and clear the back chain pointer.
+            // TODO: Support powerpc64 (big endian) on ELFv2.
+            \\ std 1, %[argc_argv_ptr]
+            \\ li 0, 0
+            \\ stdu 0, -32(1)
+            \\ mtlr 0
+            \\ b %[posixCallMainAndExit]
+            ,
+            .sparc64 =>
+            // argc is stored after a register window (16 registers) plus stack bias
+            \\ mov %%g0, %%i6
+            \\ add %%o6, 2175, %%l0
+            \\ stx %%l0, %[argc_argv_ptr]
+            \\ ba %[posixCallMainAndExit]
+            ,
             else => @compileError("unsupported arch"),
-        },
-    }
-    // If LLVM inlines stack variables into _start, they will overwrite
-    // the command line argument data.
-    @call(.never_inline, posixCallMainAndExit, .{});
+        }
+        : [argc_argv_ptr] "=m" (argc_argv_ptr),
+        : [posixCallMainAndExit] "X" (&posixCallMainAndExit),
+    );
 }
 
 fn WinStartup() callconv(std.os.windows.WINAPI) noreturn {
@@ -390,8 +366,6 @@ fn wWinMainCRTStartup() callconv(std.os.windows.WINAPI) noreturn {
 }
 
 fn posixCallMainAndExit() callconv(.C) noreturn {
-    @setAlignStack(16);
-
     const argc = argc_argv_ptr[0];
     const argv = @as([*][*:0]u8, @ptrCast(argc_argv_ptr + 1));
 
lib/std/target.zig
@@ -899,6 +899,10 @@ pub const Target = struct {
                 };
             }
 
+            pub fn isArmOrThumb(arch: Arch) bool {
+                return arch.isARM() or arch.isThumb();
+            }
+
             pub fn isWasm(arch: Arch) bool {
                 return switch (arch) {
                     .wasm32, .wasm64 => true,
@@ -1960,6 +1964,7 @@ pub const Target = struct {
             .thumbeb,
             => return if (target.os.tag.isDarwin() or target.os.tag == .windows) .signed else .unsigned,
             .powerpc, .powerpc64 => return if (target.os.tag.isDarwin()) .signed else .unsigned,
+            .powerpcle,
             .powerpc64le,
             .s390x,
             .xcore,
lib/c.zig
@@ -200,72 +200,67 @@ fn clone() callconv(.Naked) void {
             // syscall(SYS_clone, flags, stack, ptid, tls, ctid)
             //         eax,       ebx,   ecx,   edx,  esi, edi
             asm volatile (
-                \\  push %%ebp
-                \\  mov %%esp,%%ebp
-                \\  push %%ebx
-                \\  push %%esi
-                \\  push %%edi
+                \\  pushl %%ebp
+                \\  movl %%esp,%%ebp
+                \\  pushl %%ebx
+                \\  pushl %%esi
+                \\  pushl %%edi
                 \\  // Setup the arguments
-                \\  mov 16(%%ebp),%%ebx
-                \\  mov 12(%%ebp),%%ecx
-                \\  and $-16,%%ecx
-                \\  sub $20,%%ecx
-                \\  mov 20(%%ebp),%%eax
-                \\  mov %%eax,4(%%ecx)
-                \\  mov 8(%%ebp),%%eax
-                \\  mov %%eax,0(%%ecx)
-                \\  mov 24(%%ebp),%%edx
-                \\  mov 28(%%ebp),%%esi
-                \\  mov 32(%%ebp),%%edi
-                \\  mov $120,%%eax
+                \\  movl 16(%%ebp),%%ebx
+                \\  movl 12(%%ebp),%%ecx
+                \\  andl $-16,%%ecx
+                \\  subl $20,%%ecx
+                \\  movl 20(%%ebp),%%eax
+                \\  movl %%eax,4(%%ecx)
+                \\  movl 8(%%ebp),%%eax
+                \\  movl %%eax,0(%%ecx)
+                \\  movl 24(%%ebp),%%edx
+                \\  movl 28(%%ebp),%%esi
+                \\  movl 32(%%ebp),%%edi
+                \\  movl $120,%%eax
                 \\  int $128
-                \\  test %%eax,%%eax
+                \\  testl %%eax,%%eax
                 \\  jnz 1f
-                \\  pop %%eax
-                \\  xor %%ebp,%%ebp
-                \\  call *%%eax
-                \\  mov %%eax,%%ebx
-                \\  xor %%eax,%%eax
-                \\  inc %%eax
+                \\  popl %%eax
+                \\  xorl %%ebp,%%ebp
+                \\  calll *%%eax
+                \\  movl %%eax,%%ebx
+                \\  movl $1,%%eax
                 \\  int $128
-                \\  hlt
                 \\1:
-                \\  pop %%edi
-                \\  pop %%esi
-                \\  pop %%ebx
-                \\  pop %%ebp
-                \\  ret
+                \\  popl %%edi
+                \\  popl %%esi
+                \\  popl %%ebx
+                \\  popl %%ebp
+                \\  retl
             );
         },
         .x86_64 => {
             asm volatile (
-                \\      xor %%eax,%%eax
-                \\      mov $56,%%al // SYS_clone
-                \\      mov %%rdi,%%r11
-                \\      mov %%rdx,%%rdi
-                \\      mov %%r8,%%rdx
-                \\      mov %%r9,%%r8
-                \\      mov 8(%%rsp),%%r10
-                \\      mov %%r11,%%r9
-                \\      and $-16,%%rsi
-                \\      sub $8,%%rsi
-                \\      mov %%rcx,(%%rsi)
+                \\      movl $56,%%eax // SYS_clone
+                \\      movq %%rdi,%%r11
+                \\      movq %%rdx,%%rdi
+                \\      movq %%r8,%%rdx
+                \\      movq %%r9,%%r8
+                \\      movq 8(%%rsp),%%r10
+                \\      movq %%r11,%%r9
+                \\      andq $-16,%%rsi
+                \\      subq $8,%%rsi
+                \\      movq %%rcx,(%%rsi)
                 \\      syscall
-                \\      test %%eax,%%eax
+                \\      testq %%rax,%%rax
                 \\      jnz 1f
-                \\      xor %%ebp,%%ebp
-                \\      pop %%rdi
-                \\      call *%%r9
-                \\      mov %%eax,%%edi
-                \\      xor %%eax,%%eax
-                \\      mov $60,%%al // SYS_exit
+                \\      xorl %%ebp,%%ebp
+                \\      popq %%rdi
+                \\      callq *%%r9
+                \\      movl %%eax,%%edi
+                \\      movl $60,%%eax // SYS_exit
                 \\      syscall
-                \\      hlt
                 \\1:    ret
                 \\
             );
         },
-        .aarch64 => {
+        .aarch64, .aarch64_be => {
             // __clone(func, stack, flags, arg, ptid, tls, ctid)
             //         x0,   x1,    w2,    x3,  x4,   x5,  x6
 
@@ -400,69 +395,69 @@ fn clone() callconv(.Naked) void {
                 \\  syscall
             );
         },
-        .powerpc => {
+        .powerpc, .powerpcle => {
             // __clone(func, stack, flags, arg, ptid, tls, ctid)
             //            3,     4,     5,   6,    7,   8,    9
 
             // syscall(SYS_clone, flags, stack, ptid, tls, ctid)
             //                 0      3,     4,    5,   6,    7
             asm volatile (
-                \\# store non-volatile regs r30, r31 on stack in order to put our
-                \\# start func and its arg there
-                \\stwu 30, -16(1)
-                \\stw 31, 4(1)
+                \\ # store non-volatile regs r30, r31 on stack in order to put our
+                \\ # start func and its arg there
+                \\ stwu 30, -16(1)
+                \\ stw 31, 4(1)
                 \\
-                \\# save r3 (func) into r30, and r6(arg) into r31
-                \\mr 30, 3
-                \\mr 31, 6
+                \\ # save r3 (func) into r30, and r6(arg) into r31
+                \\ mr 30, 3
+                \\ mr 31, 6
                 \\
-                \\# create initial stack frame for new thread
-                \\clrrwi 4, 4, 4
-                \\li 0, 0
-                \\stwu 0, -16(4)
+                \\ # create initial stack frame for new thread
+                \\ clrrwi 4, 4, 4
+                \\ li 0, 0
+                \\ stwu 0, -16(4)
                 \\
-                \\#move c into first arg
-                \\mr 3, 5
-                \\#mr 4, 4
-                \\mr 5, 7
-                \\mr 6, 8
-                \\mr 7, 9
+                \\ #move c into first arg
+                \\ mr 3, 5
+                \\ #mr 4, 4
+                \\ mr 5, 7
+                \\ mr 6, 8
+                \\ mr 7, 9
                 \\
-                \\# move syscall number into r0
-                \\li 0, 120
+                \\ # move syscall number into r0
+                \\ li 0, 120
                 \\
-                \\sc
+                \\ sc
                 \\
-                \\# check for syscall error
-                \\bns+ 1f # jump to label 1 if no summary overflow.
-                \\#else
-                \\neg 3, 3 #negate the result (errno)
-                \\1:
-                \\# compare sc result with 0
-                \\cmpwi cr7, 3, 0
+                \\ # check for syscall error
+                \\ bns+ 1f # jump to label 1 if no summary overflow.
+                \\ #else
+                \\ neg 3, 3 #negate the result (errno)
+                \\ 1:
+                \\ # compare sc result with 0
+                \\ cmpwi cr7, 3, 0
                 \\
-                \\# if not 0, jump to end
-                \\bne cr7, 2f
+                \\ # if not 0, jump to end
+                \\ bne cr7, 2f
                 \\
-                \\#else: we're the child
-                \\#call funcptr: move arg (d) into r3
-                \\mr 3, 31
-                \\#move r30 (funcptr) into CTR reg
-                \\mtctr 30
-                \\# call CTR reg
-                \\bctrl
-                \\# mov SYS_exit into r0 (the exit param is already in r3)
-                \\li 0, 1
-                \\sc
+                \\ #else: we're the child
+                \\ #call funcptr: move arg (d) into r3
+                \\ mr 3, 31
+                \\ #move r30 (funcptr) into CTR reg
+                \\ mtctr 30
+                \\ # call CTR reg
+                \\ bctrl
+                \\ # mov SYS_exit into r0 (the exit param is already in r3)
+                \\ li 0, 1
+                \\ sc
                 \\
-                \\2:
+                \\ 2:
                 \\
-                \\# restore stack
-                \\lwz 30, 0(1)
-                \\lwz 31, 4(1)
-                \\addi 1, 1, 16
+                \\ # restore stack
+                \\ lwz 30, 0(1)
+                \\ lwz 31, 4(1)
+                \\ addi 1, 1, 16
                 \\
-                \\blr
+                \\ blr
             );
         },
         .powerpc64, .powerpc64le => {
src/arch/x86_64/CodeGen.zig
@@ -177,7 +177,7 @@ pub const MCValue = union(enum) {
     /// The value is a tuple { wrapped, overflow } where wrapped value is stored in the GP register.
     register_overflow: struct { reg: Register, eflags: Condition },
     /// The value is in memory at a hard-coded address.
-    /// If the type is a pointer, it means the pointer address is at this memory location.
+    /// If the type is a pointer, it means the pointer address is stored at this memory location.
     memory: u64,
     /// The value is in memory at a constant offset from the address in a register.
     indirect: RegisterOffset,
@@ -300,7 +300,7 @@ pub const MCValue = union(enum) {
             .load_tlv,
             .load_frame,
             .reserved_frame,
-            => unreachable, // not a dereferenceable
+            => unreachable, // not dereferenceable
             .immediate => |addr| .{ .memory = addr },
             .register => |reg| .{ .indirect = .{ .reg = reg } },
             .register_offset => |reg_off| .{ .indirect = reg_off },
@@ -3468,14 +3468,14 @@ fn genInlineIntDivFloor(self: *Self, ty: Type, lhs: MCValue, rhs: MCValue) !MCVa
     const mod = self.bin_file.options.module.?;
     const abi_size: u32 = @intCast(ty.abiSize(mod));
     const int_info = ty.intInfo(mod);
-    const dividend: Register = switch (lhs) {
+    const dividend = switch (lhs) {
         .register => |reg| reg,
         else => try self.copyToTmpRegister(ty, lhs),
     };
     const dividend_lock = self.register_manager.lockReg(dividend);
     defer if (dividend_lock) |lock| self.register_manager.unlockReg(lock);
 
-    const divisor: Register = switch (rhs) {
+    const divisor = switch (rhs) {
         .register => |reg| reg,
         else => try self.copyToTmpRegister(ty, rhs),
     };
@@ -9184,6 +9184,7 @@ fn airBr(self: *Self, inst: Air.Inst.Index) !void {
 }
 
 fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
+    const mod = self.bin_file.options.module.?;
     const ty_pl = self.air.instructions.items(.data)[inst].ty_pl;
     const extra = self.air.extraData(Air.Asm, ty_pl.payload);
     const clobbers_len: u31 = @truncate(extra.data.flags);
@@ -9196,23 +9197,15 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
 
     var result: MCValue = .none;
     var args = std.StringArrayHashMap(MCValue).init(self.gpa);
-    try args.ensureTotalCapacity(outputs.len + inputs.len + clobbers_len);
+    try args.ensureTotalCapacity(outputs.len + inputs.len);
     defer {
-        for (args.values()) |arg| switch (arg) {
-            .register => |reg| self.register_manager.unlockReg(.{ .register = reg }),
-            else => {},
-        };
+        for (args.values()) |arg| if (arg.getReg()) |reg|
+            self.register_manager.unlockReg(.{ .register = reg });
         args.deinit();
     }
 
-    if (outputs.len > 1) {
-        return self.fail("TODO implement codegen for asm with more than 1 output", .{});
-    }
-
+    var outputs_extra_i = extra_i;
     for (outputs) |output| {
-        if (output != .none) {
-            return self.fail("TODO implement codegen for non-expr asm", .{});
-        }
         const extra_bytes = std.mem.sliceAsBytes(self.air.extra[extra_i..]);
         const constraint = std.mem.sliceTo(std.mem.sliceAsBytes(self.air.extra[extra_i..]), 0);
         const name = std.mem.sliceTo(extra_bytes[constraint.len + 1 ..], 0);
@@ -9220,21 +9213,48 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
         // for the string, we still use the next u32 for the null terminator.
         extra_i += (constraint.len + name.len + (2 + 3)) / 4;
 
-        const mcv: MCValue = if (mem.eql(u8, constraint, "=r"))
-            .{ .register = self.register_manager.tryAllocReg(inst, gp) orelse
-                return self.fail("ran out of registers lowering inline asm", .{}) }
+        const maybe_inst = switch (output) {
+            .none => inst,
+            else => null,
+        };
+        const ty = switch (output) {
+            .none => self.typeOfIndex(inst),
+            else => self.typeOf(output).childType(mod),
+        };
+        const arg_maybe_reg: ?Register = if (mem.eql(u8, constraint, "=r"))
+            self.register_manager.tryAllocReg(maybe_inst, regClassForType(ty, mod)) orelse
+                return self.fail("ran out of registers lowering inline asm", .{})
+        else if (mem.eql(u8, constraint, "=m"))
+            if (output != .none) null else return self.fail(
+                "memory constraint unsupported for asm result",
+                .{},
+            )
+        else if (mem.eql(u8, constraint, "=g"))
+            self.register_manager.tryAllocReg(maybe_inst, regClassForType(ty, mod)) orelse
+                if (output != .none) null else return self.fail(
+                "ran out of registers lowering inline asm",
+                .{},
+            )
         else if (mem.startsWith(u8, constraint, "={") and mem.endsWith(u8, constraint, "}"))
-            .{ .register = parseRegName(constraint["={".len .. constraint.len - "}".len]) orelse
-                return self.fail("unrecognized register constraint: '{s}'", .{constraint}) }
+            parseRegName(constraint["={".len .. constraint.len - "}".len]) orelse
+                return self.fail("invalid register constraint: '{s}'", .{constraint})
         else
-            return self.fail("unrecognized constraint: '{s}'", .{constraint});
-        args.putAssumeCapacity(name, mcv);
-        switch (mcv) {
-            .register => |reg| _ = if (RegisterManager.indexOfRegIntoTracked(reg)) |_|
-                self.register_manager.lockRegAssumeUnused(reg),
-            else => {},
-        }
-        if (output == .none) result = mcv;
+            return self.fail("invalid constraint: '{s}'", .{constraint});
+        const arg_mcv: MCValue = if (arg_maybe_reg) |reg| .{ .register = reg } else arg: {
+            const ptr_mcv = try self.resolveInst(output);
+            switch (ptr_mcv) {
+                .immediate => |addr| if (math.cast(i32, @as(i64, @bitCast(addr)))) |_|
+                    break :arg ptr_mcv.deref(),
+                .register, .register_offset, .lea_frame => break :arg ptr_mcv.deref(),
+                else => {},
+            }
+            break :arg .{ .indirect = .{ .reg = try self.copyToTmpRegister(Type.usize, ptr_mcv) } };
+        };
+        if (arg_mcv.getReg()) |reg| if (RegisterManager.indexOfRegIntoTracked(reg)) |_| {
+            _ = self.register_manager.lockRegAssumeUnused(reg);
+        };
+        args.putAssumeCapacity(name, arg_mcv);
+        if (output == .none) result = arg_mcv;
     }
 
     for (inputs) |input| {
@@ -9245,16 +9265,53 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
         // for the string, we still use the next u32 for the null terminator.
         extra_i += (constraint.len + name.len + (2 + 3)) / 4;
 
-        if (constraint.len < 3 or constraint[0] != '{' or constraint[constraint.len - 1] != '}') {
-            return self.fail("unrecognized asm input constraint: '{s}'", .{constraint});
-        }
-        const reg_name = constraint[1 .. constraint.len - 1];
-        const reg = parseRegName(reg_name) orelse
-            return self.fail("unrecognized register: '{s}'", .{reg_name});
-
-        const arg_mcv = try self.resolveInst(input);
-        try self.register_manager.getReg(reg, null);
-        try self.genSetReg(reg, self.typeOf(input), arg_mcv);
+        const ty = self.typeOf(input);
+        const input_mcv = try self.resolveInst(input);
+        const arg_mcv: MCValue = if (mem.eql(u8, constraint, "r")) switch (input_mcv) {
+            .register => input_mcv,
+            else => .{ .register = try self.copyToTmpRegister(ty, input_mcv) },
+        } else if (mem.eql(u8, constraint, "m")) arg: {
+            switch (input_mcv) {
+                .memory => |addr| if (math.cast(i32, @as(i64, @bitCast(addr)))) |_|
+                    break :arg input_mcv,
+                .indirect, .load_frame => break :arg input_mcv,
+                .load_direct, .load_got, .load_tlv => {},
+                else => {
+                    const temp_mcv = try self.allocTempRegOrMem(ty, false);
+                    try self.genCopy(ty, temp_mcv, input_mcv);
+                    break :arg temp_mcv;
+                },
+            }
+            const addr_reg = self.register_manager.tryAllocReg(null, gp) orelse {
+                const temp_mcv = try self.allocTempRegOrMem(ty, false);
+                try self.genCopy(ty, temp_mcv, input_mcv);
+                break :arg temp_mcv;
+            };
+            try self.genSetReg(addr_reg, Type.usize, input_mcv.address());
+            break :arg .{ .indirect = .{ .reg = addr_reg } };
+        } else if (mem.eql(u8, constraint, "g")) arg: {
+            switch (input_mcv) {
+                .register, .indirect, .load_frame => break :arg input_mcv,
+                .memory => |addr| if (math.cast(i32, @as(i64, @bitCast(addr)))) |_|
+                    break :arg input_mcv,
+                else => {},
+            }
+            const temp_mcv = try self.allocTempRegOrMem(ty, true);
+            try self.genCopy(ty, temp_mcv, input_mcv);
+            break :arg temp_mcv;
+        } else if (mem.eql(u8, constraint, "X"))
+            input_mcv
+        else if (mem.startsWith(u8, constraint, "{") and mem.endsWith(u8, constraint, "}")) arg: {
+            const reg = parseRegName(constraint["{".len .. constraint.len - "}".len]) orelse
+                return self.fail("invalid register constraint: '{s}'", .{constraint});
+            try self.register_manager.getReg(reg, null);
+            try self.genSetReg(reg, ty, input_mcv);
+            break :arg .{ .register = reg };
+        } else return self.fail("invalid constraint: '{s}'", .{constraint});
+        if (arg_mcv.getReg()) |reg| if (RegisterManager.indexOfRegIntoTracked(reg)) |_| {
+            _ = self.register_manager.lockReg(reg);
+        };
+        args.putAssumeCapacity(name, arg_mcv);
     }
 
     {
@@ -9293,7 +9350,7 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
                 }
             }
             break :mnem std.meta.stringToEnum(Mir.Inst.Tag, mnem_str) orelse
-                return self.fail("Invalid mnemonic: '{s}'", .{mnem_str});
+                return self.fail("invalid mnemonic: '{s}'", .{mnem_str});
         } };
 
         var op_it = mem.tokenizeScalar(u8, mnem_it.rest(), ',');
@@ -9304,43 +9361,73 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
             if (mem.startsWith(u8, op_str, "%%")) {
                 const colon = mem.indexOfScalarPos(u8, op_str, "%%".len + 2, ':');
                 const reg = parseRegName(op_str["%%".len .. colon orelse op_str.len]) orelse
-                    return self.fail("Invalid register: '{s}'", .{op_str});
+                    return self.fail("invalid register: '{s}'", .{op_str});
                 if (colon) |colon_pos| {
                     const disp = std.fmt.parseInt(i32, op_str[colon_pos + 1 ..], 0) catch
-                        return self.fail("Invalid displacement: '{s}'", .{op_str});
+                        return self.fail("invalid displacement: '{s}'", .{op_str});
                     op.* = .{ .mem = Memory.sib(
-                        mnem_size orelse return self.fail("Unknown size: '{s}'", .{op_str}),
+                        mnem_size orelse return self.fail("unknown size: '{s}'", .{op_str}),
                         .{ .base = .{ .reg = reg }, .disp = disp },
                     ) };
                 } else {
                     if (mnem_size) |size| if (reg.bitSize() != size.bitSize())
-                        return self.fail("Invalid register size: '{s}'", .{op_str});
+                        return self.fail("invalid register size: '{s}'", .{op_str});
                     op.* = .{ .reg = reg };
                 }
             } else if (mem.startsWith(u8, op_str, "%[") and mem.endsWith(u8, op_str, "]")) {
-                switch (args.get(op_str["%[".len .. op_str.len - "]".len]) orelse
-                    return self.fail("No matching constraint: '{s}'", .{op_str})) {
-                    .register => |reg| op.* = .{ .reg = reg },
-                    else => return self.fail("Invalid constraint: '{s}'", .{op_str}),
-                }
+                const colon = mem.indexOfScalarPos(u8, op_str, "%[".len, ':');
+                const modifier = if (colon) |colon_pos|
+                    op_str[colon_pos + 1 .. op_str.len - "]".len]
+                else
+                    "";
+                op.* = switch (args.get(op_str["%[".len .. colon orelse op_str.len - "]".len]) orelse
+                    return self.fail("no matching constraint: '{s}'", .{op_str})) {
+                    .register => |reg| if (std.mem.eql(u8, modifier, ""))
+                        .{ .reg = reg }
+                    else
+                        return self.fail("invalid modifier: '{s}'", .{modifier}),
+                    .memory => |addr| if (std.mem.eql(u8, modifier, "") or
+                        std.mem.eql(u8, modifier, "P"))
+                        .{ .mem = Memory.sib(
+                            mnem_size orelse return self.fail("unknown size: '{s}'", .{op_str}),
+                            .{ .base = .{ .reg = .ds }, .disp = @intCast(@as(i64, @bitCast(addr))) },
+                        ) }
+                    else
+                        return self.fail("invalid modifier: '{s}'", .{modifier}),
+                    .indirect => |reg_off| if (std.mem.eql(u8, modifier, ""))
+                        .{ .mem = Memory.sib(
+                            mnem_size orelse return self.fail("unknown size: '{s}'", .{op_str}),
+                            .{ .base = .{ .reg = reg_off.reg }, .disp = reg_off.off },
+                        ) }
+                    else
+                        return self.fail("invalid modifier: '{s}'", .{modifier}),
+                    .load_frame => |frame_addr| if (std.mem.eql(u8, modifier, ""))
+                        .{ .mem = Memory.sib(
+                            mnem_size orelse return self.fail("unknown size: '{s}'", .{op_str}),
+                            .{ .base = .{ .frame = frame_addr.index }, .disp = frame_addr.off },
+                        ) }
+                    else
+                        return self.fail("invalid modifier: '{s}'", .{modifier}),
+                    else => return self.fail("invalid constraint: '{s}'", .{op_str}),
+                };
             } else if (mem.startsWith(u8, op_str, "$")) {
                 if (std.fmt.parseInt(i32, op_str["$".len..], 0)) |s| {
                     if (mnem_size) |size| {
                         const max = @as(u64, math.maxInt(u64)) >> @intCast(64 - (size.bitSize() - 1));
                         if ((if (s < 0) ~s else s) > max)
-                            return self.fail("Invalid immediate size: '{s}'", .{op_str});
+                            return self.fail("invalid immediate size: '{s}'", .{op_str});
                     }
                     op.* = .{ .imm = Immediate.s(s) };
                 } else |_| if (std.fmt.parseInt(u64, op_str["$".len..], 0)) |u| {
                     if (mnem_size) |size| {
                         const max = @as(u64, math.maxInt(u64)) >> @intCast(64 - size.bitSize());
                         if (u > max)
-                            return self.fail("Invalid immediate size: '{s}'", .{op_str});
+                            return self.fail("invalid immediate size: '{s}'", .{op_str});
                     }
                     op.* = .{ .imm = Immediate.u(u) };
-                } else |_| return self.fail("Invalid immediate: '{s}'", .{op_str});
-            } else return self.fail("Invalid operand: '{s}'", .{op_str});
-        } else if (op_it.next()) |op_str| return self.fail("Extra operand: '{s}'", .{op_str});
+                } else |_| return self.fail("invalid immediate: '{s}'", .{op_str});
+            } else return self.fail("invalid operand: '{s}'", .{op_str});
+        } else if (op_it.next()) |op_str| return self.fail("extra operand: '{s}'", .{op_str});
 
         (switch (ops[0]) {
             .none => self.asmOpOnly(mnem_tag),
@@ -9407,6 +9494,20 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
         };
     }
 
+    for (outputs, args.values()[0..outputs.len]) |output, mcv| {
+        const extra_bytes = std.mem.sliceAsBytes(self.air.extra[outputs_extra_i..]);
+        const constraint =
+            std.mem.sliceTo(std.mem.sliceAsBytes(self.air.extra[outputs_extra_i..]), 0);
+        const name = std.mem.sliceTo(extra_bytes[constraint.len + 1 ..], 0);
+        // This equation accounts for the fact that even if we have exactly 4 bytes
+        // for the string, we still use the next u32 for the null terminator.
+        outputs_extra_i += (constraint.len + name.len + (2 + 3)) / 4;
+
+        if (output == .none) continue;
+        if (mcv != .register) continue;
+        try self.store(self.typeOf(output), try self.resolveInst(output), mcv);
+    }
+
     simple: {
         var buf = [1]Air.Inst.Ref{.none} ** (Liveness.bpi - 1);
         var buf_index: usize = 0;
src/codegen/c.zig
@@ -4770,9 +4770,11 @@ fn airSwitchBr(f: *Function, inst: Air.Inst.Index) !CValue {
 }
 
 fn asmInputNeedsLocal(f: *Function, constraint: []const u8, value: CValue) bool {
+    const target = f.object.dg.module.getTarget();
     return switch (constraint[0]) {
         '{' => true,
         'i', 'r' => false,
+        'I' => !target.cpu.arch.isArmOrThumb(),
         else => switch (value) {
             .constant => |val| switch (f.object.dg.module.intern_pool.indexToKey(val)) {
                 .ptr => |ptr| switch (ptr.addr) {
src/codegen/llvm.zig
@@ -417,7 +417,7 @@ const DataLayoutBuilder = struct {
                 if (idx != size) try writer.print(":{d}", .{idx});
             }
         }
-        if (self.target.cpu.arch.isARM() or self.target.cpu.arch.isThumb())
+        if (self.target.cpu.arch.isArmOrThumb())
             try writer.writeAll("-Fi8"); // for thumb interwork
         if (self.target.cpu.arch != .hexagon) {
             if (self.target.cpu.arch == .s390x) try self.typeAlignment(.integer, 1, 8, 8, false, writer);
@@ -620,7 +620,7 @@ const DataLayoutBuilder = struct {
                     else => {},
                 }
             },
-            .vector => if (self.target.cpu.arch.isARM() or self.target.cpu.arch.isThumb()) {
+            .vector => if (self.target.cpu.arch.isArmOrThumb()) {
                 switch (size) {
                     128 => abi = 64,
                     else => {},
@@ -670,7 +670,7 @@ const DataLayoutBuilder = struct {
                 else => {},
             },
             .aggregate => if (self.target.os.tag == .uefi or self.target.os.tag == .windows or
-                self.target.cpu.arch.isARM() or self.target.cpu.arch.isThumb())
+                self.target.cpu.arch.isArmOrThumb())
             {
                 pref = @min(pref, self.target.ptrBitWidth());
             } else if (self.target.cpu.arch == .hexagon) {
@@ -6809,8 +6809,6 @@ pub const FuncGen = struct {
             }
             llvm_constraints.appendAssumeCapacity('=');
 
-            // Pass any non-return outputs indirectly, if the constraint accepts a memory location
-            is_indirect.* = (output != .none) and constraintAllowsMemory(constraint);
             if (output != .none) {
                 const output_inst = try self.resolveInst(output);
                 const output_ty = self.typeOf(output);
@@ -6825,6 +6823,8 @@ pub const FuncGen = struct {
                     }),
                 }
 
+                // Pass any non-return outputs indirectly, if the constraint accepts a memory location
+                is_indirect.* = constraintAllowsMemory(constraint);
                 if (is_indirect.*) {
                     // Pass the result by reference as an indirect output (e.g. "=*m")
                     llvm_constraints.appendAssumeCapacity('*');
@@ -6841,11 +6841,13 @@ pub const FuncGen = struct {
             } else {
                 switch (constraint[0]) {
                     '=' => {},
-                    else => return self.todo("unsupported output constraint on result type '{c}'", .{
-                        constraint[0],
+                    else => return self.todo("unsupported output constraint on result type '{s}'", .{
+                        constraint,
                     }),
                 }
 
+                is_indirect.* = false;
+
                 const ret_ty = self.typeOfIndex(inst);
                 llvm_ret_types[llvm_ret_i] = try o.lowerType(ret_ty);
                 llvm_ret_i += 1;
src/link/Elf.zig
@@ -1654,7 +1654,7 @@ fn linkWithLLD(self: *Elf, comp: *Compilation, prog_node: *std.Progress.Node) !v
         }
 
         if (self.base.options.link_mode == .Static) {
-            if (target.cpu.arch.isARM() or target.cpu.arch.isThumb()) {
+            if (target.cpu.arch.isArmOrThumb()) {
                 try argv.append("-Bstatic");
             } else {
                 try argv.append("-static");
test/behavior/bitcast.zig
@@ -421,7 +421,7 @@ test "bitcast nan float does modify signaling bit" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
     // TODO: https://github.com/ziglang/zig/issues/14366
-    if (builtin.cpu.arch == .arm and builtin.zig_backend == .stage2_llvm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_llvm and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     // 16 bit
     const snan_f16_const = math.nan_f16;
test/behavior/call_tail.zig
@@ -32,7 +32,16 @@ noinline fn insertionSort(data: []u64) void {
 
 test "arguments pointed to on stack into tailcall" {
     switch (builtin.cpu.arch) {
-        .wasm32, .mips, .mipsel, .powerpc, .powerpcle, .powerpc64le => return error.SkipZigTest,
+        .wasm32,
+        .mips,
+        .mipsel,
+        .mips64,
+        .mips64el,
+        .powerpc,
+        .powerpcle,
+        .powerpc64,
+        .powerpc64le,
+        => return error.SkipZigTest,
         else => {},
     }
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
test/behavior/cast.zig
@@ -123,7 +123,7 @@ test "@floatFromInt(f80)" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     const S = struct {
         fn doTheTest(comptime Int: type) !void {
@@ -1370,7 +1370,7 @@ test "cast f16 to wider types" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     const S = struct {
         fn doTheTest() !void {
test/behavior/eval.zig
@@ -533,7 +533,7 @@ test "runtime 128 bit integer division" {
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     var a: u128 = 152313999999999991610955792383;
     var b: u128 = 10000000000000000000;
test/behavior/floatop.zig
@@ -554,7 +554,7 @@ test "another, possibly redundant, @fabs test" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     try testFabsLegacy(f128, 12.0);
     try comptime testFabsLegacy(f128, 12.0);
@@ -577,7 +577,7 @@ test "@fabs f80" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     try testFabsLegacy(f80, 12.0);
     try comptime testFabsLegacy(f80, 12.0);
@@ -595,7 +595,7 @@ test "a third @fabs test, surely there should not be three fabs tests" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     inline for ([_]type{ f16, f32, f64, f80, f128, c_longdouble }) |T| {
         // normals
@@ -687,7 +687,7 @@ test "@floor f80" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     if (builtin.zig_backend == .stage2_llvm and builtin.os.tag == .windows) {
         // https://github.com/ziglang/zig/issues/12602
@@ -704,7 +704,7 @@ test "@floor f128" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     try testFloorLegacy(f128, 12.0);
     try comptime testFloorLegacy(f128, 12.0);
@@ -785,7 +785,7 @@ test "@ceil f80" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     if (builtin.zig_backend == .stage2_llvm and builtin.os.tag == .windows) {
         // https://github.com/ziglang/zig/issues/12602
@@ -802,7 +802,7 @@ test "@ceil f128" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     try testCeilLegacy(f128, 12.0);
     try comptime testCeilLegacy(f128, 12.0);
@@ -882,7 +882,7 @@ test "@trunc f80" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     if (builtin.zig_backend == .stage2_llvm and builtin.os.tag == .windows) {
         // https://github.com/ziglang/zig/issues/12602
@@ -905,7 +905,7 @@ test "@trunc f128" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     try testTruncLegacy(f128, 12.0);
     try comptime testTruncLegacy(f128, 12.0);
test/behavior/math.zig
@@ -664,7 +664,7 @@ test "128-bit multiplication" {
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     {
         var a: i128 = 3;
@@ -1312,7 +1312,7 @@ test "remainder division" {
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     if (builtin.zig_backend == .stage2_llvm and builtin.os.tag == .windows) {
         // https://github.com/ziglang/zig/issues/12602
@@ -1457,7 +1457,7 @@ test "@round f80" {
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     try testRound(f80, 12.0);
     try comptime testRound(f80, 12.0);
@@ -1470,7 +1470,7 @@ test "@round f128" {
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     try testRound(f128, 12.0);
     try comptime testRound(f128, 12.0);
test/behavior/maximum_minimum.zig
@@ -110,7 +110,7 @@ test "@min/max for floats" {
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     const S = struct {
         fn doTheTest(comptime T: type) !void {
test/behavior/muladd.zig
@@ -56,7 +56,7 @@ test "@mulAdd f80" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     try comptime testMulAdd80();
     try testMulAdd80();
@@ -76,7 +76,7 @@ test "@mulAdd f128" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     try comptime testMulAdd128();
     try testMulAdd128();
@@ -179,7 +179,7 @@ test "vector f80" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     try comptime vector80();
     try vector80();
@@ -204,7 +204,7 @@ test "vector f128" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     try comptime vector128();
     try vector128();
test/behavior/saturating_arithmetic.zig
@@ -157,7 +157,7 @@ test "saturating multiplication" {
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     if (builtin.zig_backend == .stage2_llvm and builtin.cpu.arch == .wasm32) {
         // https://github.com/ziglang/zig/issues/9660
test/behavior/struct.zig
@@ -428,7 +428,7 @@ test "packed struct 24bits" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.cpu.arch == .wasm32) return error.SkipZigTest; // TODO
-    if (builtin.cpu.arch == .arm) return error.SkipZigTest; // TODO
+    if (comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
 
@@ -824,7 +824,7 @@ test "non-packed struct with u128 entry in union" {
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     const U = union(enum) {
         Num: u128,
@@ -945,7 +945,7 @@ test "comptime struct field" {
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
 
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
-    if (builtin.cpu.arch == .arm) return error.SkipZigTest; // TODO
+    if (comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest; // TODO
 
     const T = struct {
         a: i32,
test/behavior/var_args.zig
@@ -101,7 +101,7 @@ test "simple variadic function" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.cpu.arch == .aarch64 and builtin.os.tag != .macos) {
+    if (builtin.os.tag != .macos and comptime builtin.cpu.arch.isAARCH64()) {
         // https://github.com/ziglang/zig/issues/14096
         return error.SkipZigTest;
     }
@@ -151,7 +151,7 @@ test "variadic functions" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.cpu.arch == .aarch64 and builtin.os.tag != .macos) {
+    if (builtin.os.tag != .macos and comptime builtin.cpu.arch.isAARCH64()) {
         // https://github.com/ziglang/zig/issues/14096
         return error.SkipZigTest;
     }
@@ -195,7 +195,7 @@ test "copy VaList" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.cpu.arch == .aarch64 and builtin.os.tag != .macos) {
+    if (builtin.os.tag != .macos and comptime builtin.cpu.arch.isAARCH64()) {
         // https://github.com/ziglang/zig/issues/14096
         return error.SkipZigTest;
     }
@@ -228,7 +228,7 @@ test "unused VaList arg" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.cpu.arch == .aarch64 and builtin.os.tag != .macos) {
+    if (builtin.os.tag != .macos and comptime builtin.cpu.arch.isAARCH64()) {
         // https://github.com/ziglang/zig/issues/14096
         return error.SkipZigTest;
     }
test/behavior/vector.zig
@@ -102,7 +102,7 @@ test "vector float operators" {
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     inline for ([_]type{ f16, f32, f64, f80, f128 }) |T| {
         const S = struct {
@@ -495,6 +495,7 @@ test "vector division operators" {
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_llvm and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
 
@@ -706,7 +707,7 @@ test "vector reduce operation" {
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c and builtin.cpu.arch == .arm) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_c and comptime builtin.cpu.arch.isArmOrThumb()) return error.SkipZigTest;
 
     const S = struct {
         fn testReduce(comptime op: std.builtin.ReduceOp, x: anytype, expected: anytype) !void {
test/src/Cases.zig
@@ -1524,7 +1524,7 @@ fn runOneCase(
                         }
                     } else switch (host.getExternalExecutor(target_info, .{ .link_libc = case.link_libc })) {
                         .native => {
-                            if (case.backend == .stage2 and case.target.getCpuArch() == .arm) {
+                            if (case.backend == .stage2 and case.target.getCpuArch().isArmOrThumb()) {
                                 // https://github.com/ziglang/zig/issues/13623
                                 continue :update; // Pass test.
                             }