Commit 9eb400ef19

mlugg <mlugg@mlugg.co.uk>
2025-05-29 06:38:55
compiler: rework backend pipeline to separate codegen and link
The idea here is that instead of the linker calling into codegen, codegen should run before we touch the linker: once MIR is produced, it is sent to the linker. Aside from simplifying the call graph (by preventing N linkers from each calling into M codegen backends!), this has the huge benefit that it is possible to parallelize codegen separately from linking. The threading model can look like this:

* 1 semantic analysis thread, which generates AIR
* N codegen threads, which process AIR into MIR
* 1 linker thread, which emits MIR to the binary

The codegen threads are also responsible for `Air.Legalize` and `Air.Liveness`; it's more efficient to do this work on those threads instead of blocking the main thread for this trivially parallel task.

I have repurposed the `Zcu.Feature.separate_thread` backend feature to indicate support for this 1:N:1 threading pattern. This commit makes the C backend support this feature, since it was relatively easy to divorce from `link.C`: it just required eliminating some shared buffers. Other backends don't currently support this feature. In fact, they don't even compile -- the next few commits will fix them back up.
1 parent 66d15d9
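
To illustrate the 1:N:1 pattern described in the message above, here is a minimal, self-contained Zig sketch (editor's illustration, not code from this commit): N worker threads each publish a result into a shared slot guarded by an atomic status flag, and a single consumer drains the slots in order before "linking" them. The `SharedMir` type here, the `u64` payload, and the spin-wait are simplifications; the real pipeline uses `link.Queue` with a mutex, `ZcuTask.LinkFunc.SharedMir`, and a `mirReady` callback that restarts the link thread instead of polling.

const std = @import("std");

/// Simplified stand-in for the commit's `ZcuTask.LinkFunc.SharedMir`: a slot that
/// one codegen thread fills and the single link thread later consumes.
const SharedMir = struct {
    status: std.atomic.Value(Status) = .init(.pending),
    /// Stands in for the per-backend MIR payload.
    value: u64 = undefined,

    const Status = enum(u8) { pending, ready, failed };
};

/// "Codegen": derive a result from the input (think AIR -> MIR), then publish it.
fn codegenWorker(slot: *SharedMir, input: u64) void {
    slot.value = input * 2;
    slot.status.store(.ready, .release);
}

pub fn main() !void {
    var slots: [4]SharedMir = .{ .{}, .{}, .{}, .{} };
    var threads: [4]std.Thread = undefined;

    // N codegen threads run in parallel, one per function being compiled.
    for (&slots, &threads, 0..) |*slot, *thread, i| {
        thread.* = try std.Thread.spawn(.{}, codegenWorker, .{ slot, @as(u64, i) });
    }

    // The single "linker" consumes MIR strictly in queue order, so the output
    // stays reproducible no matter which worker finishes first.
    for (&slots, 0..) |*slot, i| {
        while (slot.status.load(.acquire) == .pending) std.Thread.yield() catch {};
        std.debug.print("linking function {d}: mir value {d}\n", .{ i, slot.value });
    }

    for (threads) |thread| thread.join();
}

Consuming results in queue order, as `link.Queue` does, is what keeps builds reproducible even though codegen threads may finish out of order.
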
src/codegen/c.zig
@@ -3,6 +3,7 @@ const builtin = @import("builtin");
 const assert = std.debug.assert;
 const mem = std.mem;
 const log = std.log.scoped(.c);
+const Allocator = mem.Allocator;
 
 const dev = @import("../dev.zig");
 const link = @import("../link.zig");
@@ -30,6 +31,35 @@ pub fn legalizeFeatures(_: *const std.Target) ?*const Air.Legalize.Features {
     }) else null; // we don't currently ask zig1 to use safe optimization modes
 }
 
+/// For most backends, MIR is basically a sequence of machine code instructions, perhaps with some
+/// "pseudo instructions" thrown in. For the C backend, it is instead the generated C code for a
+/// single function. We also need to track some information to get merged into the global `link.C`
+/// state, including:
+/// * The UAVs used, so declarations can be emitted in `flush`
+/// * The types used, so declarations can be emitted in `flush`
+/// * The lazy functions used, so definitions can be emitted in `flush`
+pub const Mir = struct {
+    /// This map contains all the UAVs we saw generating this function.
+    /// `link.C` will merge them into its `uavs`/`aligned_uavs` fields.
+    /// Key is the value of the UAV; value is the UAV's alignment, or
+    /// `.none` for natural alignment. The specified alignment is never
+    /// less than the natural alignment.
+    uavs: std.AutoArrayHashMapUnmanaged(InternPool.Index, Alignment),
+    // These remaining fields are essentially just an owned version of `link.C.AvBlock`.
+    code: []u8,
+    fwd_decl: []u8,
+    ctype_pool: CType.Pool,
+    lazy_fns: LazyFnMap,
+
+    pub fn deinit(mir: *Mir, gpa: Allocator) void {
+        mir.uavs.deinit(gpa);
+        gpa.free(mir.code);
+        gpa.free(mir.fwd_decl);
+        mir.ctype_pool.deinit(gpa);
+        mir.lazy_fns.deinit(gpa);
+    }
+};
+
 pub const CType = @import("c/Type.zig");
 
 pub const CValue = union(enum) {
@@ -671,7 +701,7 @@ pub const Object = struct {
 
 /// This data is available both when outputting .c code and when outputting an .h file.
 pub const DeclGen = struct {
-    gpa: mem.Allocator,
+    gpa: Allocator,
     pt: Zcu.PerThread,
     mod: *Module,
     pass: Pass,
@@ -682,10 +712,12 @@ pub const DeclGen = struct {
     error_msg: ?*Zcu.ErrorMsg,
     ctype_pool: CType.Pool,
     scratch: std.ArrayListUnmanaged(u32),
-    /// Keeps track of anonymous decls that need to be rendered before this
-    /// (named) Decl in the output C code.
-    uav_deps: std.AutoArrayHashMapUnmanaged(InternPool.Index, C.AvBlock),
-    aligned_uavs: std.AutoArrayHashMapUnmanaged(InternPool.Index, Alignment),
+    /// This map contains all the UAVs we saw generating this function.
+    /// `link.C` will merge them into its `uavs`/`aligned_uavs` fields.
+    /// Key is the value of the UAV; value is the UAV's alignment, or
+    /// `.none` for natural alignment. The specified alignment is never
+    /// less than the natural alignment.
+    uavs: std.AutoArrayHashMapUnmanaged(InternPool.Index, Alignment),
 
     pub const Pass = union(enum) {
         nav: InternPool.Nav.Index,
@@ -753,21 +785,17 @@ pub const DeclGen = struct {
         // Indicate that the anon decl should be rendered to the output so that
         // our reference above is not undefined.
         const ptr_type = ip.indexToKey(uav.orig_ty).ptr_type;
-        const gop = try dg.uav_deps.getOrPut(dg.gpa, uav.val);
-        if (!gop.found_existing) gop.value_ptr.* = .{};
-
-        // Only insert an alignment entry if the alignment is greater than ABI
-        // alignment. If there is already an entry, keep the greater alignment.
-        const explicit_alignment = ptr_type.flags.alignment;
-        if (explicit_alignment != .none) {
-            const abi_alignment = Type.fromInterned(ptr_type.child).abiAlignment(zcu);
-            if (explicit_alignment.order(abi_alignment).compare(.gt)) {
-                const aligned_gop = try dg.aligned_uavs.getOrPut(dg.gpa, uav.val);
-                aligned_gop.value_ptr.* = if (aligned_gop.found_existing)
-                    aligned_gop.value_ptr.maxStrict(explicit_alignment)
-                else
-                    explicit_alignment;
-            }
+        const gop = try dg.uavs.getOrPut(dg.gpa, uav.val);
+        if (!gop.found_existing) gop.value_ptr.* = .none;
+        // If there is an explicit alignment, greater than the current one, use it.
+        // Note that we intentionally start at `.none`, so `gop.value_ptr.*` is never
+        // underaligned, so we don't need to worry about the `.none` case here.
+        if (ptr_type.flags.alignment != .none) {
+            // Resolve the current alignment so we can choose the bigger one.
+            const cur_alignment: Alignment = if (gop.value_ptr.* == .none) abi: {
+                break :abi Type.fromInterned(ptr_type.child).abiAlignment(zcu);
+            } else gop.value_ptr.*;
+            gop.value_ptr.* = cur_alignment.maxStrict(ptr_type.flags.alignment);
         }
     }
 
@@ -2895,7 +2923,79 @@ pub fn genLazyFn(o: *Object, lazy_ctype_pool: *const CType.Pool, lazy_fn: LazyFn
     }
 }
 
-pub fn genFunc(f: *Function) !void {
+pub fn generate(
+    lf: *link.File,
+    pt: Zcu.PerThread,
+    src_loc: Zcu.LazySrcLoc,
+    func_index: InternPool.Index,
+    air: *const Air,
+    liveness: *const Air.Liveness,
+) @import("../codegen.zig").CodeGenError!Mir {
+    const zcu = pt.zcu;
+    const gpa = zcu.gpa;
+
+    _ = src_loc;
+    assert(lf.tag == .c);
+
+    const func = zcu.funcInfo(func_index);
+
+    var function: Function = .{
+        .value_map = .init(gpa),
+        .air = air.*,
+        .liveness = liveness.*,
+        .func_index = func_index,
+        .object = .{
+            .dg = .{
+                .gpa = gpa,
+                .pt = pt,
+                .mod = zcu.navFileScope(func.owner_nav).mod.?,
+                .error_msg = null,
+                .pass = .{ .nav = func.owner_nav },
+                .is_naked_fn = Type.fromInterned(func.ty).fnCallingConvention(zcu) == .naked,
+                .expected_block = null,
+                .fwd_decl = .init(gpa),
+                .ctype_pool = .empty,
+                .scratch = .empty,
+                .uavs = .empty,
+            },
+            .code = .init(gpa),
+            .indent_writer = undefined, // set later so we can get a pointer to object.code
+        },
+        .lazy_fns = .empty,
+    };
+    defer {
+        function.object.code.deinit();
+        function.object.dg.fwd_decl.deinit();
+        function.object.dg.ctype_pool.deinit(gpa);
+        function.object.dg.scratch.deinit(gpa);
+        function.object.dg.uavs.deinit(gpa);
+        function.deinit();
+    }
+    try function.object.dg.ctype_pool.init(gpa);
+    function.object.indent_writer = .{ .underlying_writer = function.object.code.writer() };
+
+    genFunc(&function) catch |err| switch (err) {
+        error.AnalysisFail => return zcu.codegenFailMsg(func.owner_nav, function.object.dg.error_msg.?),
+        error.OutOfMemory => |e| return e,
+    };
+
+    var mir: Mir = .{
+        .uavs = .empty,
+        .code = &.{},
+        .fwd_decl = &.{},
+        .ctype_pool = .empty,
+        .lazy_fns = .empty,
+    };
+    errdefer mir.deinit(gpa);
+    mir.uavs = function.object.dg.uavs.move();
+    mir.code = try function.object.code.toOwnedSlice();
+    mir.fwd_decl = try function.object.dg.fwd_decl.toOwnedSlice();
+    mir.ctype_pool = function.object.dg.ctype_pool.move();
+    mir.lazy_fns = function.lazy_fns.move();
+    return mir;
+}
+
+fn genFunc(f: *Function) !void {
     const tracy = trace(@src());
     defer tracy.end();
 
@@ -8482,7 +8582,7 @@ fn iterateBigTomb(f: *Function, inst: Air.Inst.Index) BigTomb {
 
 /// A naive clone of this map would create copies of the ArrayList which is
 /// stored in the values. This function additionally clones the values.
-fn cloneFreeLocalsMap(gpa: mem.Allocator, map: *LocalsMap) !LocalsMap {
+fn cloneFreeLocalsMap(gpa: Allocator, map: *LocalsMap) !LocalsMap {
     var cloned = try map.clone(gpa);
     const values = cloned.values();
     var i: usize = 0;
@@ -8499,7 +8599,7 @@ fn cloneFreeLocalsMap(gpa: mem.Allocator, map: *LocalsMap) !LocalsMap {
     return cloned;
 }
 
-fn deinitFreeLocalsMap(gpa: mem.Allocator, map: *LocalsMap) void {
+fn deinitFreeLocalsMap(gpa: Allocator, map: *LocalsMap) void {
     for (map.values()) |*value| {
         value.deinit(gpa);
     }
src/codegen/llvm.zig
@@ -1121,8 +1121,8 @@ pub const Object = struct {
         o: *Object,
         pt: Zcu.PerThread,
         func_index: InternPool.Index,
-        air: Air,
-        liveness: Air.Liveness,
+        air: *const Air,
+        liveness: *const Air.Liveness,
     ) !void {
         assert(std.meta.eql(pt, o.pt));
         const zcu = pt.zcu;
@@ -1479,8 +1479,8 @@ pub const Object = struct {
 
         var fg: FuncGen = .{
             .gpa = gpa,
-            .air = air,
-            .liveness = liveness,
+            .air = air.*,
+            .liveness = liveness.*,
             .ng = &ng,
             .wip = wip,
             .is_naked = fn_info.cc == .naked,
@@ -1506,10 +1506,9 @@ pub const Object = struct {
         deinit_wip = false;
 
         fg.genBody(air.getMainBody(), .poi) catch |err| switch (err) {
-            error.CodegenFail => {
-                try zcu.failed_codegen.put(gpa, func.owner_nav, ng.err_msg.?);
-                ng.err_msg = null;
-                return;
+            error.CodegenFail => switch (zcu.codegenFailMsg(func.owner_nav, ng.err_msg.?)) {
+                error.CodegenFail => return,
+                error.OutOfMemory => |e| return e,
             },
             else => |e| return e,
         };
@@ -1561,10 +1560,9 @@ pub const Object = struct {
             .err_msg = null,
         };
         ng.genDecl() catch |err| switch (err) {
-            error.CodegenFail => {
-                try pt.zcu.failed_codegen.put(pt.zcu.gpa, nav_index, ng.err_msg.?);
-                ng.err_msg = null;
-                return;
+            error.CodegenFail => switch (pt.zcu.codegenFailMsg(nav_index, ng.err_msg.?)) {
+                error.CodegenFail => return,
+                error.OutOfMemory => |e| return e,
             },
             else => |e| return e,
         };
src/codegen/spirv.zig
@@ -230,8 +230,9 @@ pub const Object = struct {
         defer nav_gen.deinit();
 
         nav_gen.genNav(do_codegen) catch |err| switch (err) {
-            error.CodegenFail => {
-                try zcu.failed_codegen.put(gpa, nav_index, nav_gen.error_msg.?);
+            error.CodegenFail => switch (zcu.codegenFailMsg(nav_index, nav_gen.error_msg.?)) {
+                error.CodegenFail => {},
+                error.OutOfMemory => |e| return e,
             },
             else => |other| {
                 // There might be an error that happened *after* self.error_msg
src/libs/freebsd.zig
@@ -1004,7 +1004,7 @@ fn queueSharedObjects(comp: *Compilation, so_files: BuiltSharedObjects) void {
         }
     }
 
-    comp.queueLinkTasks(task_buffer[0..task_buffer_i]);
+    comp.queuePrelinkTasks(task_buffer[0..task_buffer_i]);
 }
 
 fn buildSharedLib(
src/libs/glibc.zig
@@ -1170,7 +1170,7 @@ fn queueSharedObjects(comp: *Compilation, so_files: BuiltSharedObjects) void {
         }
     }
 
-    comp.queueLinkTasks(task_buffer[0..task_buffer_i]);
+    comp.queuePrelinkTasks(task_buffer[0..task_buffer_i]);
 }
 
 fn buildSharedLib(
src/libs/libcxx.zig
@@ -308,7 +308,7 @@ pub fn buildLibCxx(comp: *Compilation, prog_node: std.Progress.Node) BuildError!
     assert(comp.libcxx_static_lib == null);
     const crt_file = try sub_compilation.toCrtFile();
     comp.libcxx_static_lib = crt_file;
-    comp.queueLinkTaskMode(crt_file.full_object_path, &config);
+    comp.queuePrelinkTaskMode(crt_file.full_object_path, &config);
 }
 
 pub fn buildLibCxxAbi(comp: *Compilation, prog_node: std.Progress.Node) BuildError!void {
@@ -504,7 +504,7 @@ pub fn buildLibCxxAbi(comp: *Compilation, prog_node: std.Progress.Node) BuildErr
     assert(comp.libcxxabi_static_lib == null);
     const crt_file = try sub_compilation.toCrtFile();
     comp.libcxxabi_static_lib = crt_file;
-    comp.queueLinkTaskMode(crt_file.full_object_path, &config);
+    comp.queuePrelinkTaskMode(crt_file.full_object_path, &config);
 }
 
 pub fn addCxxArgs(
src/libs/libtsan.zig
@@ -325,7 +325,7 @@ pub fn buildTsan(comp: *Compilation, prog_node: std.Progress.Node) BuildError!vo
     };
 
     const crt_file = try sub_compilation.toCrtFile();
-    comp.queueLinkTaskMode(crt_file.full_object_path, &config);
+    comp.queuePrelinkTaskMode(crt_file.full_object_path, &config);
     assert(comp.tsan_lib == null);
     comp.tsan_lib = crt_file;
 }
src/libs/libunwind.zig
@@ -195,7 +195,7 @@ pub fn buildStaticLib(comp: *Compilation, prog_node: std.Progress.Node) BuildErr
     };
 
     const crt_file = try sub_compilation.toCrtFile();
-    comp.queueLinkTaskMode(crt_file.full_object_path, &config);
+    comp.queuePrelinkTaskMode(crt_file.full_object_path, &config);
     assert(comp.libunwind_static_lib == null);
     comp.libunwind_static_lib = crt_file;
 }
src/libs/musl.zig
@@ -278,7 +278,7 @@ pub fn buildCrtFile(comp: *Compilation, in_crt_file: CrtFile, prog_node: std.Pro
             errdefer comp.gpa.free(basename);
 
             const crt_file = try sub_compilation.toCrtFile();
-            comp.queueLinkTaskMode(crt_file.full_object_path, &config);
+            comp.queuePrelinkTaskMode(crt_file.full_object_path, &config);
             {
                 comp.mutex.lock();
                 defer comp.mutex.unlock();
src/libs/netbsd.zig
@@ -669,7 +669,7 @@ fn queueSharedObjects(comp: *Compilation, so_files: BuiltSharedObjects) void {
         }
     }
 
-    comp.queueLinkTasks(task_buffer[0..task_buffer_i]);
+    comp.queuePrelinkTasks(task_buffer[0..task_buffer_i]);
 }
 
 fn buildSharedLib(
src/link/Elf/ZigObject.zig
@@ -1416,8 +1416,10 @@ pub fn updateFunc(
     elf_file: *Elf,
     pt: Zcu.PerThread,
     func_index: InternPool.Index,
-    air: Air,
-    liveness: Air.Liveness,
+    mir: *const codegen.AnyMir,
+    /// This may be `undefined`; only pass it to `emitFunction`.
+    /// This parameter will eventually be removed.
+    maybe_undef_air: *const Air,
 ) link.File.UpdateNavError!void {
     const tracy = trace(@src());
     defer tracy.end();
@@ -1438,15 +1440,15 @@ pub fn updateFunc(
     var debug_wip_nav = if (self.dwarf) |*dwarf| try dwarf.initWipNav(pt, func.owner_nav, sym_index) else null;
     defer if (debug_wip_nav) |*wip_nav| wip_nav.deinit();
 
-    try codegen.generateFunction(
+    try codegen.emitFunction(
         &elf_file.base,
         pt,
         zcu.navSrcLoc(func.owner_nav),
         func_index,
-        air,
-        liveness,
+        mir,
         &code_buffer,
         if (debug_wip_nav) |*dn| .{ .dwarf = dn } else .none,
+        maybe_undef_air,
     );
     const code = code_buffer.items;
 
src/link/C.zig
@@ -18,6 +18,7 @@ const trace = @import("../tracy.zig").trace;
 const Type = @import("../Type.zig");
 const Value = @import("../Value.zig");
 const Air = @import("../Air.zig");
+const AnyMir = @import("../codegen.zig").AnyMir;
 
 pub const zig_h = "#include \"zig.h\"\n";
 
@@ -166,6 +167,9 @@ pub fn deinit(self: *C) void {
     self.uavs.deinit(gpa);
     self.aligned_uavs.deinit(gpa);
 
+    self.exported_navs.deinit(gpa);
+    self.exported_uavs.deinit(gpa);
+
     self.string_bytes.deinit(gpa);
     self.fwd_decl_buf.deinit(gpa);
     self.code_buf.deinit(gpa);
@@ -177,73 +181,28 @@ pub fn updateFunc(
     self: *C,
     pt: Zcu.PerThread,
     func_index: InternPool.Index,
-    air: Air,
-    liveness: Air.Liveness,
+    mir: *AnyMir,
+    /// This may be `undefined`; only pass it to `emitFunction`.
+    /// This parameter will eventually be removed.
+    maybe_undef_air: *const Air,
 ) link.File.UpdateNavError!void {
+    _ = maybe_undef_air; // It would be a bug to use this argument.
+
     const zcu = pt.zcu;
     const gpa = zcu.gpa;
     const func = zcu.funcInfo(func_index);
-    const gop = try self.navs.getOrPut(gpa, func.owner_nav);
-    if (!gop.found_existing) gop.value_ptr.* = .{};
-    const ctype_pool = &gop.value_ptr.ctype_pool;
-    const lazy_fns = &gop.value_ptr.lazy_fns;
-    const fwd_decl = &self.fwd_decl_buf;
-    const code = &self.code_buf;
-    try ctype_pool.init(gpa);
-    ctype_pool.clearRetainingCapacity();
-    lazy_fns.clearRetainingCapacity();
-    fwd_decl.clearRetainingCapacity();
-    code.clearRetainingCapacity();
 
-    var function: codegen.Function = .{
-        .value_map = codegen.CValueMap.init(gpa),
-        .air = air,
-        .liveness = liveness,
-        .func_index = func_index,
-        .object = .{
-            .dg = .{
-                .gpa = gpa,
-                .pt = pt,
-                .mod = zcu.navFileScope(func.owner_nav).mod.?,
-                .error_msg = null,
-                .pass = .{ .nav = func.owner_nav },
-                .is_naked_fn = Type.fromInterned(func.ty).fnCallingConvention(zcu) == .naked,
-                .expected_block = null,
-                .fwd_decl = fwd_decl.toManaged(gpa),
-                .ctype_pool = ctype_pool.*,
-                .scratch = .{},
-                .uav_deps = self.uavs,
-                .aligned_uavs = self.aligned_uavs,
-            },
-            .code = code.toManaged(gpa),
-            .indent_writer = undefined, // set later so we can get a pointer to object.code
-        },
-        .lazy_fns = lazy_fns.*,
-    };
-    function.object.indent_writer = .{ .underlying_writer = function.object.code.writer() };
-    defer {
-        self.uavs = function.object.dg.uav_deps;
-        self.aligned_uavs = function.object.dg.aligned_uavs;
-        fwd_decl.* = function.object.dg.fwd_decl.moveToUnmanaged();
-        ctype_pool.* = function.object.dg.ctype_pool.move();
-        ctype_pool.freeUnusedCapacity(gpa);
-        function.object.dg.scratch.deinit(gpa);
-        lazy_fns.* = function.lazy_fns.move();
-        lazy_fns.shrinkAndFree(gpa, lazy_fns.count());
-        code.* = function.object.code.moveToUnmanaged();
-        function.deinit();
-    }
-
-    try zcu.failed_codegen.ensureUnusedCapacity(gpa, 1);
-    codegen.genFunc(&function) catch |err| switch (err) {
-        error.AnalysisFail => {
-            zcu.failed_codegen.putAssumeCapacityNoClobber(func.owner_nav, function.object.dg.error_msg.?);
-            return;
-        },
-        else => |e| return e,
+    const gop = try self.navs.getOrPut(gpa, func.owner_nav);
+    if (gop.found_existing) gop.value_ptr.deinit(gpa);
+    gop.value_ptr.* = .{
+        .code = .empty,
+        .fwd_decl = .empty,
+        .ctype_pool = mir.c.ctype_pool.move(),
+        .lazy_fns = mir.c.lazy_fns.move(),
     };
-    gop.value_ptr.fwd_decl = try self.addString(function.object.dg.fwd_decl.items);
-    gop.value_ptr.code = try self.addString(function.object.code.items);
+    gop.value_ptr.code = try self.addString(mir.c.code);
+    gop.value_ptr.fwd_decl = try self.addString(mir.c.fwd_decl);
+    try self.addUavsFromCodegen(&mir.c.uavs);
 }
 
 fn updateUav(self: *C, pt: Zcu.PerThread, i: usize) !void {
@@ -267,16 +226,14 @@ fn updateUav(self: *C, pt: Zcu.PerThread, i: usize) !void {
             .fwd_decl = fwd_decl.toManaged(gpa),
             .ctype_pool = codegen.CType.Pool.empty,
             .scratch = .{},
-            .uav_deps = self.uavs,
-            .aligned_uavs = self.aligned_uavs,
+            .uavs = .empty,
         },
         .code = code.toManaged(gpa),
         .indent_writer = undefined, // set later so we can get a pointer to object.code
     };
     object.indent_writer = .{ .underlying_writer = object.code.writer() };
     defer {
-        self.uavs = object.dg.uav_deps;
-        self.aligned_uavs = object.dg.aligned_uavs;
+        object.dg.uavs.deinit(gpa);
         fwd_decl.* = object.dg.fwd_decl.moveToUnmanaged();
         object.dg.ctype_pool.deinit(object.dg.gpa);
         object.dg.scratch.deinit(gpa);
@@ -295,8 +252,10 @@ fn updateUav(self: *C, pt: Zcu.PerThread, i: usize) !void {
         else => |e| return e,
     };
 
+    try self.addUavsFromCodegen(&object.dg.uavs);
+
     object.dg.ctype_pool.freeUnusedCapacity(gpa);
-    object.dg.uav_deps.values()[i] = .{
+    self.uavs.values()[i] = .{
         .code = try self.addString(object.code.items),
         .fwd_decl = try self.addString(object.dg.fwd_decl.items),
         .ctype_pool = object.dg.ctype_pool.move(),
@@ -343,16 +302,14 @@ pub fn updateNav(self: *C, pt: Zcu.PerThread, nav_index: InternPool.Nav.Index) l
             .fwd_decl = fwd_decl.toManaged(gpa),
             .ctype_pool = ctype_pool.*,
             .scratch = .{},
-            .uav_deps = self.uavs,
-            .aligned_uavs = self.aligned_uavs,
+            .uavs = .empty,
         },
         .code = code.toManaged(gpa),
         .indent_writer = undefined, // set later so we can get a pointer to object.code
     };
     object.indent_writer = .{ .underlying_writer = object.code.writer() };
     defer {
-        self.uavs = object.dg.uav_deps;
-        self.aligned_uavs = object.dg.aligned_uavs;
+        object.dg.uavs.deinit(gpa);
         fwd_decl.* = object.dg.fwd_decl.moveToUnmanaged();
         ctype_pool.* = object.dg.ctype_pool.move();
         ctype_pool.freeUnusedCapacity(gpa);
@@ -360,16 +317,16 @@ pub fn updateNav(self: *C, pt: Zcu.PerThread, nav_index: InternPool.Nav.Index) l
         code.* = object.code.moveToUnmanaged();
     }
 
-    try zcu.failed_codegen.ensureUnusedCapacity(gpa, 1);
     codegen.genDecl(&object) catch |err| switch (err) {
-        error.AnalysisFail => {
-            zcu.failed_codegen.putAssumeCapacityNoClobber(nav_index, object.dg.error_msg.?);
-            return;
+        error.AnalysisFail => switch (zcu.codegenFailMsg(nav_index, object.dg.error_msg.?)) {
+            error.CodegenFail => return,
+            error.OutOfMemory => |e| return e,
         },
         else => |e| return e,
     };
     gop.value_ptr.code = try self.addString(object.code.items);
     gop.value_ptr.fwd_decl = try self.addString(object.dg.fwd_decl.items);
+    try self.addUavsFromCodegen(&object.dg.uavs);
 }
 
 pub fn updateLineNumber(self: *C, pt: Zcu.PerThread, ti_id: InternPool.TrackedInst.Index) !void {
@@ -671,16 +628,14 @@ fn flushErrDecls(self: *C, pt: Zcu.PerThread, ctype_pool: *codegen.CType.Pool) F
             .fwd_decl = fwd_decl.toManaged(gpa),
             .ctype_pool = ctype_pool.*,
             .scratch = .{},
-            .uav_deps = self.uavs,
-            .aligned_uavs = self.aligned_uavs,
+            .uavs = .empty,
         },
         .code = code.toManaged(gpa),
         .indent_writer = undefined, // set later so we can get a pointer to object.code
     };
     object.indent_writer = .{ .underlying_writer = object.code.writer() };
     defer {
-        self.uavs = object.dg.uav_deps;
-        self.aligned_uavs = object.dg.aligned_uavs;
+        object.dg.uavs.deinit(gpa);
         fwd_decl.* = object.dg.fwd_decl.moveToUnmanaged();
         ctype_pool.* = object.dg.ctype_pool.move();
         ctype_pool.freeUnusedCapacity(gpa);
@@ -692,6 +647,8 @@ fn flushErrDecls(self: *C, pt: Zcu.PerThread, ctype_pool: *codegen.CType.Pool) F
         error.AnalysisFail => unreachable,
         else => |e| return e,
     };
+
+    try self.addUavsFromCodegen(&object.dg.uavs);
 }
 
 fn flushLazyFn(
@@ -719,8 +676,7 @@ fn flushLazyFn(
             .fwd_decl = fwd_decl.toManaged(gpa),
             .ctype_pool = ctype_pool.*,
             .scratch = .{},
-            .uav_deps = .{},
-            .aligned_uavs = .{},
+            .uavs = .empty,
         },
         .code = code.toManaged(gpa),
         .indent_writer = undefined, // set later so we can get a pointer to object.code
@@ -729,8 +685,7 @@ fn flushLazyFn(
     defer {
         // If this assert trips just handle the anon_decl_deps the same as
         // `updateFunc()` does.
-        assert(object.dg.uav_deps.count() == 0);
-        assert(object.dg.aligned_uavs.count() == 0);
+        assert(object.dg.uavs.count() == 0);
         fwd_decl.* = object.dg.fwd_decl.moveToUnmanaged();
         ctype_pool.* = object.dg.ctype_pool.move();
         ctype_pool.freeUnusedCapacity(gpa);
@@ -866,12 +821,10 @@ pub fn updateExports(
         .fwd_decl = fwd_decl.toManaged(gpa),
         .ctype_pool = decl_block.ctype_pool,
         .scratch = .{},
-        .uav_deps = .{},
-        .aligned_uavs = .{},
+        .uavs = .empty,
     };
     defer {
-        assert(dg.uav_deps.count() == 0);
-        assert(dg.aligned_uavs.count() == 0);
+        assert(dg.uavs.count() == 0);
         fwd_decl.* = dg.fwd_decl.moveToUnmanaged();
         ctype_pool.* = dg.ctype_pool.move();
         ctype_pool.freeUnusedCapacity(gpa);
@@ -891,3 +844,21 @@ pub fn deleteExport(
         .uav => |uav| _ = self.exported_uavs.swapRemove(uav),
     }
 }
+
+fn addUavsFromCodegen(c: *C, uavs: *const std.AutoArrayHashMapUnmanaged(InternPool.Index, Alignment)) Allocator.Error!void {
+    const gpa = c.base.comp.gpa;
+    try c.uavs.ensureUnusedCapacity(gpa, uavs.count());
+    try c.aligned_uavs.ensureUnusedCapacity(gpa, uavs.count());
+    for (uavs.keys(), uavs.values()) |uav_val, uav_align| {
+        {
+            const gop = c.uavs.getOrPutAssumeCapacity(uav_val);
+            if (!gop.found_existing) gop.value_ptr.* = .{};
+        }
+        if (uav_align != .none) {
+            const gop = c.aligned_uavs.getOrPutAssumeCapacity(uav_val);
+            gop.value_ptr.* = if (gop.found_existing) max: {
+                break :max gop.value_ptr.*.maxStrict(uav_align);
+            } else uav_align;
+        }
+    }
+}
src/link/Coff.zig
@@ -1079,7 +1079,7 @@ pub fn updateFunc(
     var code_buffer: std.ArrayListUnmanaged(u8) = .empty;
     defer code_buffer.deinit(gpa);
 
-    codegen.generateFunction(
+    try codegen.generateFunction(
         &coff.base,
         pt,
         zcu.navSrcLoc(nav_index),
@@ -1088,20 +1088,7 @@ pub fn updateFunc(
         liveness,
         &code_buffer,
         .none,
-    ) catch |err| switch (err) {
-        error.CodegenFail => return error.CodegenFail,
-        error.OutOfMemory => return error.OutOfMemory,
-        error.Overflow, error.RelocationNotByteAligned => |e| {
-            try zcu.failed_codegen.putNoClobber(gpa, nav_index, try Zcu.ErrorMsg.create(
-                gpa,
-                zcu.navSrcLoc(nav_index),
-                "unable to codegen: {s}",
-                .{@errorName(e)},
-            ));
-            try zcu.retryable_failures.append(zcu.gpa, AnalUnit.wrap(.{ .func = func_index }));
-            return error.CodegenFail;
-        },
-    };
+    );
 
     try coff.updateNavCode(pt, nav_index, code_buffer.items, .FUNCTION);
 
src/link/Elf.zig
@@ -1691,13 +1691,13 @@ pub fn updateFunc(
     self: *Elf,
     pt: Zcu.PerThread,
     func_index: InternPool.Index,
-    air: Air,
-    liveness: Air.Liveness,
+    mir: *const codegen.AnyMir,
+    maybe_undef_air: *const Air,
 ) link.File.UpdateNavError!void {
     if (build_options.skip_non_native and builtin.object_format != .elf) {
         @panic("Attempted to compile for object format that was disabled by build configuration");
     }
-    return self.zigObjectPtr().?.updateFunc(self, pt, func_index, air, liveness);
+    return self.zigObjectPtr().?.updateFunc(self, pt, func_index, mir, maybe_undef_air);
 }
 
 pub fn updateNav(
src/link/Queue.zig
@@ -0,0 +1,234 @@
+//! Stores and manages the queue of link tasks. Each task is either a `PrelinkTask` or a `ZcuTask`.
+//!
+//! There must be at most one link thread (the thread processing these tasks) active at a time. If
+//! `!comp.separateCodegenThreadOk()`, then ZCU tasks will be run on the main thread, bypassing this
+//! queue entirely.
+//!
+//! All prelink tasks must be processed before any ZCU tasks are processed. After all prelink tasks
+//! are run, but before any ZCU tasks are run, `prelink` must be called on the `link.File`.
+//!
+//! There will sometimes be a `ZcuTask` in the queue which is not yet ready because it depends on
+//! MIR which has not yet been generated by any codegen thread. In this case, we must pause
+//! processing of linker tasks until the MIR is ready. It would be incorrect to run any other link
+//! tasks first, since this would make builds unreproducible.
+
+mutex: std.Thread.Mutex,
+/// Validates that only one `flushTaskQueue` thread is running at a time.
+flush_safety: std.debug.SafetyLock,
+
+/// This is the number of prelink tasks which are expected but have not yet been enqueued.
+/// Guarded by `mutex`.
+pending_prelink_tasks: u32,
+
+/// Prelink tasks which have been enqueued and are not yet owned by the worker thread.
+/// Allocated into `gpa`, guarded by `mutex`.
+queued_prelink: std.ArrayListUnmanaged(PrelinkTask),
+/// The worker thread moves items from `queued_prelink` into this array in order to process them.
+/// Allocated into `gpa`, accessed only by the worker thread.
+wip_prelink: std.ArrayListUnmanaged(PrelinkTask),
+
+/// Like `queued_prelink`, but for ZCU tasks.
+/// Allocated into `gpa`, guarded by `mutex`.
+queued_zcu: std.ArrayListUnmanaged(ZcuTask),
+/// Like `wip_prelink`, but for ZCU tasks.
+/// Allocated into `gpa`, accessed only by the worker thread.
+wip_zcu: std.ArrayListUnmanaged(ZcuTask),
+
+/// When processing ZCU link tasks, we might have to block due to unpopulated MIR. When this
+/// happens, some tasks in `wip_zcu` have been run, and some are still pending. This is the
+/// index into `wip_zcu` which we have reached.
+wip_zcu_idx: usize,
+
+/// Guarded by `mutex`.
+state: union(enum) {
+    /// The link thread is currently running or queued to run.
+    running,
+    /// The link thread is not running or queued, because it has exhausted all immediately available
+    /// tasks. It should be spawned when more tasks are enqueued. If `pending_prelink_tasks` is not
+    /// zero, we are specifically waiting for prelink tasks.
+    finished,
+    /// The link thread is not running or queued, because it is waiting for this MIR to be populated.
+    /// Once codegen completes, it must call `mirReady` which will restart the link thread.
+    wait_for_mir: *ZcuTask.LinkFunc.SharedMir,
+},
+
+/// The initial `Queue` state, containing no tasks, expecting no prelink tasks, and with no running worker thread.
+/// The `pending_prelink_tasks` and `queued_prelink` fields may be modified as needed before calling `start`.
+pub const empty: Queue = .{
+    .mutex = .{},
+    .flush_safety = .{},
+    .pending_prelink_tasks = 0,
+    .queued_prelink = .empty,
+    .wip_prelink = .empty,
+    .queued_zcu = .empty,
+    .wip_zcu = .empty,
+    .wip_zcu_idx = 0,
+    .state = .finished,
+};
+/// `lf` is needed to correctly deinit any pending `ZcuTask`s.
+pub fn deinit(q: *Queue, comp: *Compilation) void {
+    const gpa = comp.gpa;
+    for (q.queued_zcu.items) |t| t.deinit(comp.zcu.?);
+    for (q.wip_zcu.items[q.wip_zcu_idx..]) |t| t.deinit(comp.zcu.?);
+    q.queued_prelink.deinit(gpa);
+    q.wip_prelink.deinit(gpa);
+    q.queued_zcu.deinit(gpa);
+    q.wip_zcu.deinit(gpa);
+}
+
+/// This is expected to be called exactly once, after which the caller must not directly access
+/// `queued_prelink` or `pending_prelink_tasks` any longer. This will spawn the link thread if
+/// necessary.
+pub fn start(q: *Queue, comp: *Compilation) void {
+    assert(q.state == .finished);
+    assert(q.queued_zcu.items.len == 0);
+    if (q.queued_prelink.items.len != 0) {
+        q.state = .running;
+        comp.thread_pool.spawnWgId(&comp.link_task_wait_group, flushTaskQueue, .{ q, comp });
+    }
+}
+
+/// Called by codegen workers after they have populated a `ZcuTask.LinkFunc.SharedMir`. If the link
+/// thread was waiting for this MIR, it can resume.
+pub fn mirReady(q: *Queue, comp: *Compilation, mir: *ZcuTask.LinkFunc.SharedMir) void {
+    // We would like to assert that `mir` is not pending, but that would race with a worker thread
+    // potentially freeing it.
+    {
+        q.mutex.lock();
+        defer q.mutex.unlock();
+        switch (q.state) {
+            .finished => unreachable, // there's definitely a task queued
+            .running => return,
+            .wait_for_mir => |wait_for| if (wait_for != mir) return,
+        }
+        // We were waiting for `mir`, so we will restart the linker thread.
+        q.state = .running;
+    }
+    assert(mir.status.load(.monotonic) != .pending);
+    comp.thread_pool.spawnWgId(&comp.link_task_wait_group, flushTaskQueue, .{ q, comp });
+}
+
+/// Enqueues all prelink tasks in `tasks`. Asserts that they were expected, i.e. that `tasks.len` is
+/// less than or equal to `q.pending_prelink_tasks`. Also asserts that `tasks.len` is not 0.
+pub fn enqueuePrelink(q: *Queue, comp: *Compilation, tasks: []const PrelinkTask) Allocator.Error!void {
+    {
+        q.mutex.lock();
+        defer q.mutex.unlock();
+        try q.queued_prelink.appendSlice(comp.gpa, tasks);
+        q.pending_prelink_tasks -= @intCast(tasks.len);
+        switch (q.state) {
+            .wait_for_mir => unreachable, // we've not started zcu tasks yet
+            .running => return,
+            .finished => {},
+        }
+        // Restart the linker thread, because it was waiting for a task
+        q.state = .running;
+    }
+    comp.thread_pool.spawnWgId(&comp.link_task_wait_group, flushTaskQueue, .{ q, comp });
+}
+
+pub fn enqueueZcu(q: *Queue, comp: *Compilation, task: ZcuTask) Allocator.Error!void {
+    assert(comp.separateCodegenThreadOk());
+    {
+        q.mutex.lock();
+        defer q.mutex.unlock();
+        try q.queued_zcu.append(comp.gpa, task);
+        switch (q.state) {
+            .running, .wait_for_mir => return,
+            .finished => if (q.pending_prelink_tasks != 0) return,
+        }
+        // Restart the linker thread, unless it would immediately be blocked
+        if (task == .link_func and task.link_func.mir.status.load(.monotonic) == .pending) {
+            q.state = .{ .wait_for_mir = task.link_func.mir };
+            return;
+        }
+        q.state = .running;
+    }
+    comp.thread_pool.spawnWgId(&comp.link_task_wait_group, flushTaskQueue, .{ q, comp });
+}
+
+fn flushTaskQueue(tid: usize, q: *Queue, comp: *Compilation) void {
+    q.flush_safety.lock();
+    defer q.flush_safety.unlock();
+
+    if (std.debug.runtime_safety) {
+        q.mutex.lock();
+        defer q.mutex.unlock();
+        assert(q.state == .running);
+    }
+    prelink: while (true) {
+        assert(q.wip_prelink.items.len == 0);
+        {
+            q.mutex.lock();
+            defer q.mutex.unlock();
+            std.mem.swap(std.ArrayListUnmanaged(PrelinkTask), &q.queued_prelink, &q.wip_prelink);
+            if (q.wip_prelink.items.len == 0) {
+                if (q.pending_prelink_tasks == 0) {
+                    break :prelink; // prelink is done
+                } else {
+                    // We're expecting more prelink tasks so can't move on to ZCU tasks.
+                    q.state = .finished;
+                    return;
+                }
+            }
+        }
+        for (q.wip_prelink.items) |task| {
+            link.doPrelinkTask(comp, task);
+        }
+        q.wip_prelink.clearRetainingCapacity();
+    }
+
+    // We've finished the prelink tasks, so run prelink if necessary.
+    if (comp.bin_file) |lf| {
+        if (!lf.post_prelink) {
+            if (lf.prelink(comp.work_queue_progress_node)) |_| {
+                lf.post_prelink = true;
+            } else |err| switch (err) {
+                error.OutOfMemory => comp.link_diags.setAllocFailure(),
+                error.LinkFailure => {},
+            }
+        }
+    }
+
+    // Now we can run ZCU tasks.
+    while (true) {
+        if (q.wip_zcu.items.len == q.wip_zcu_idx) {
+            q.wip_zcu.clearRetainingCapacity();
+            q.wip_zcu_idx = 0;
+            q.mutex.lock();
+            defer q.mutex.unlock();
+            std.mem.swap(std.ArrayListUnmanaged(ZcuTask), &q.queued_zcu, &q.wip_zcu);
+            if (q.wip_zcu.items.len == 0) {
+                // We've exhausted all available tasks.
+                q.state = .finished;
+                return;
+            }
+        }
+        const task = q.wip_zcu.items[q.wip_zcu_idx];
+        // If the task is a `link_func`, we might have to stop until its MIR is populated.
+        pending: {
+            if (task != .link_func) break :pending;
+            const status_ptr = &task.link_func.mir.status;
+            // First check without the mutex to optimize for the common case where MIR is ready.
+            if (status_ptr.load(.monotonic) != .pending) break :pending;
+            q.mutex.lock();
+            defer q.mutex.unlock();
+            if (status_ptr.load(.monotonic) != .pending) break :pending;
+            // We will stop for now, and get restarted once this MIR is ready.
+            q.state = .{ .wait_for_mir = task.link_func.mir };
+            return;
+        }
+        link.doZcuTask(comp, tid, task);
+        task.deinit(comp.zcu.?);
+        q.wip_zcu_idx += 1;
+    }
+}
+
+const std = @import("std");
+const assert = std.debug.assert;
+const Allocator = std.mem.Allocator;
+const Compilation = @import("../Compilation.zig");
+const link = @import("../link.zig");
+const PrelinkTask = link.PrelinkTask;
+const ZcuTask = link.ZcuTask;
+const Queue = @This();
src/Zcu/PerThread.zig
@@ -27,6 +27,7 @@ const Type = @import("../Type.zig");
 const Value = @import("../Value.zig");
 const Zcu = @import("../Zcu.zig");
 const Compilation = @import("../Compilation.zig");
+const codegen = @import("../codegen.zig");
 const Zir = std.zig.Zir;
 const Zoir = std.zig.Zoir;
 const ZonGen = std.zig.ZonGen;
@@ -1716,7 +1717,7 @@ fn analyzeFuncBody(
     }
 
     // This job depends on any resolve_type_fully jobs queued up before it.
-    try comp.queueJob(.{ .link_func = .{
+    try comp.queueJob(.{ .codegen_func = .{
         .func = func_index,
         .air = air,
     } });
@@ -1724,79 +1725,6 @@ fn analyzeFuncBody(
     return .{ .ies_outdated = ies_outdated };
 }
 
-/// Takes ownership of `air`, even on error.
-/// If any types referenced by `air` are unresolved, marks the codegen as failed.
-pub fn linkerUpdateFunc(pt: Zcu.PerThread, func_index: InternPool.Index, air: *Air) Allocator.Error!void {
-    const zcu = pt.zcu;
-    const gpa = zcu.gpa;
-    const ip = &zcu.intern_pool;
-    const comp = zcu.comp;
-
-    const func = zcu.funcInfo(func_index);
-    const nav_index = func.owner_nav;
-    const nav = ip.getNav(nav_index);
-
-    const codegen_prog_node = zcu.codegen_prog_node.start(nav.fqn.toSlice(ip), 0);
-    defer codegen_prog_node.end();
-
-    legalize: {
-        try air.legalize(pt, @import("../codegen.zig").legalizeFeatures(pt, nav_index) orelse break :legalize);
-    }
-
-    var liveness = try Air.Liveness.analyze(zcu, air.*, ip);
-    defer liveness.deinit(gpa);
-
-    if (build_options.enable_debug_extensions and comp.verbose_air) {
-        std.debug.print("# Begin Function AIR: {}:\n", .{nav.fqn.fmt(ip)});
-        air.dump(pt, liveness);
-        std.debug.print("# End Function AIR: {}\n\n", .{nav.fqn.fmt(ip)});
-    }
-
-    if (std.debug.runtime_safety) {
-        var verify: Air.Liveness.Verify = .{
-            .gpa = gpa,
-            .zcu = zcu,
-            .air = air.*,
-            .liveness = liveness,
-            .intern_pool = ip,
-        };
-        defer verify.deinit();
-
-        verify.verify() catch |err| switch (err) {
-            error.OutOfMemory => return error.OutOfMemory,
-            else => {
-                try zcu.failed_codegen.putNoClobber(gpa, nav_index, try Zcu.ErrorMsg.create(
-                    gpa,
-                    zcu.navSrcLoc(nav_index),
-                    "invalid liveness: {s}",
-                    .{@errorName(err)},
-                ));
-                return;
-            },
-        };
-    }
-
-    if (zcu.llvm_object) |llvm_object| {
-        llvm_object.updateFunc(pt, func_index, air.*, liveness) catch |err| switch (err) {
-            error.OutOfMemory => return error.OutOfMemory,
-        };
-    } else if (comp.bin_file) |lf| {
-        lf.updateFunc(pt, func_index, air, liveness) catch |err| switch (err) {
-            error.OutOfMemory => return error.OutOfMemory,
-            error.CodegenFail => assert(zcu.failed_codegen.contains(nav_index)),
-            error.Overflow, error.RelocationNotByteAligned => {
-                try zcu.failed_codegen.putNoClobber(gpa, nav_index, try Zcu.ErrorMsg.create(
-                    gpa,
-                    zcu.navSrcLoc(nav_index),
-                    "unable to codegen: {s}",
-                    .{@errorName(err)},
-                ));
-                // Not a retryable failure.
-            },
-        };
-    }
-}
-
 pub fn semaMod(pt: Zcu.PerThread, mod: *Module) !void {
     dev.check(.sema);
     const file_index = pt.zcu.module_roots.get(mod).?.unwrap().?;
@@ -3449,7 +3377,7 @@ pub fn populateTestFunctions(
         }
 
         // The linker thread is not running, so we actually need to dispatch this task directly.
-        @import("../link.zig").doTask(zcu.comp, @intFromEnum(pt.tid), .{ .link_nav = nav_index });
+        @import("../link.zig").doZcuTask(zcu.comp, @intFromEnum(pt.tid), .{ .link_nav = nav_index });
     }
 }
 
@@ -4442,3 +4370,87 @@ pub fn addDependency(pt: Zcu.PerThread, unit: AnalUnit, dependee: InternPool.Dep
         try info.deps.append(gpa, dependee);
     }
 }
+
+/// Performs code generation, which comes after `Sema` but before `link` in the pipeline.
+/// This part of the pipeline is self-contained/"pure", so can be run in parallel with most
+/// other code. This function is currently run either on the main thread, or on a separate
+/// codegen thread, depending on whether the backend supports `Zcu.Feature.separate_thread`.
+pub fn runCodegen(pt: Zcu.PerThread, func_index: InternPool.Index, air: *Air, out: *@import("../link.zig").ZcuTask.LinkFunc.SharedMir) void {
+    if (runCodegenInner(pt, func_index, air)) |mir| {
+        out.value = mir;
+        out.status.store(.ready, .release);
+    } else |err| switch (err) {
+        error.OutOfMemory => {
+            pt.zcu.comp.setAllocFailure();
+            out.status.store(.failed, .monotonic);
+        },
+        error.CodegenFail => {
+            pt.zcu.assertCodegenFailed(pt.zcu.funcInfo(func_index).owner_nav);
+            out.status.store(.failed, .monotonic);
+        },
+        error.NoLinkFile => {
+            assert(pt.zcu.comp.bin_file == null);
+            out.status.store(.failed, .monotonic);
+        },
+    }
+    pt.zcu.comp.link_task_queue.mirReady(pt.zcu.comp, out);
+}
+fn runCodegenInner(pt: Zcu.PerThread, func_index: InternPool.Index, air: *Air) error{ OutOfMemory, CodegenFail, NoLinkFile }!codegen.AnyMir {
+    const zcu = pt.zcu;
+    const gpa = zcu.gpa;
+    const ip = &zcu.intern_pool;
+    const comp = zcu.comp;
+
+    const nav = zcu.funcInfo(func_index).owner_nav;
+    const fqn = ip.getNav(nav).fqn;
+
+    const codegen_prog_node = zcu.codegen_prog_node.start(fqn.toSlice(ip), 0);
+    defer codegen_prog_node.end();
+
+    if (codegen.legalizeFeatures(pt, nav)) |features| {
+        try air.legalize(pt, features);
+    }
+
+    var liveness: Air.Liveness = try .analyze(zcu, air.*, ip);
+    defer liveness.deinit(gpa);
+
+    // TODO: surely writing to stderr from n threads simultaneously will work flawlessly
+    if (build_options.enable_debug_extensions and comp.verbose_air) {
+        std.debug.print("# Begin Function AIR: {}:\n", .{fqn.fmt(ip)});
+        air.dump(pt, liveness);
+        std.debug.print("# End Function AIR: {}\n\n", .{fqn.fmt(ip)});
+    }
+
+    if (std.debug.runtime_safety) {
+        var verify: Air.Liveness.Verify = .{
+            .gpa = gpa,
+            .zcu = zcu,
+            .air = air.*,
+            .liveness = liveness,
+            .intern_pool = ip,
+        };
+        defer verify.deinit();
+
+        verify.verify() catch |err| switch (err) {
+            error.OutOfMemory => return error.OutOfMemory,
+            else => return zcu.codegenFail(nav, "invalid liveness: {s}", .{@errorName(err)}),
+        };
+    }
+
+    // The LLVM backend is special, because we only need to do codegen. There is no equivalent to the
+    // "emit" step because LLVM does not support incremental linking. Our linker (LLD or self-hosted)
+    // will just see the ZCU object file which LLVM ultimately emits.
+    if (zcu.llvm_object) |llvm_object| {
+        return llvm_object.updateFunc(pt, func_index, air, &liveness);
+    }
+
+    const lf = comp.bin_file orelse return error.NoLinkFile;
+    return codegen.generateFunction(lf, pt, zcu.navSrcLoc(nav), func_index, air, &liveness) catch |err| switch (err) {
+        error.OutOfMemory,
+        error.CodegenFail,
+        => |e| return e,
+        error.Overflow,
+        error.RelocationNotByteAligned,
+        => return zcu.codegenFail(nav, "unable to codegen: {s}", .{@errorName(err)}),
+    };
+}
src/codegen.zig
@@ -85,16 +85,104 @@ pub fn legalizeFeatures(pt: Zcu.PerThread, nav_index: InternPool.Nav.Index) ?*co
     }
 }
 
+/// Every code generation backend has a different MIR representation. However, we want to pass
+/// MIR from codegen to the linker *regardless* of which backend is in use. So, we use this: a
+/// union of all MIR types. The active tag is known from the backend in use; see `AnyMir.tag`.
+pub const AnyMir = union {
+    aarch64: @import("arch/aarch64/Mir.zig"),
+    arm: @import("arch/arm/Mir.zig"),
+    powerpc: noreturn, //@import("arch/powerpc/Mir.zig"),
+    riscv64: @import("arch/riscv64/Mir.zig"),
+    sparc64: @import("arch/sparc64/Mir.zig"),
+    x86_64: @import("arch/x86_64/Mir.zig"),
+    wasm: @import("arch/wasm/Mir.zig"),
+    c: @import("codegen/c.zig").Mir,
+
+    pub inline fn tag(comptime backend: std.builtin.CompilerBackend) []const u8 {
+        return switch (backend) {
+            .stage2_aarch64 => "aarch64",
+            .stage2_arm => "arm",
+            .stage2_powerpc => "powerpc",
+            .stage2_riscv64 => "riscv64",
+            .stage2_sparc64 => "sparc64",
+            .stage2_x86_64 => "x86_64",
+            .stage2_wasm => "wasm",
+            .stage2_c => "c",
+            else => unreachable,
+        };
+    }
+
+    pub fn deinit(mir: *AnyMir, zcu: *const Zcu) void {
+        const gpa = zcu.gpa;
+        const backend = target_util.zigBackend(zcu.root_mod.resolved_target.result, zcu.comp.config.use_llvm);
+        switch (backend) {
+            else => unreachable,
+            inline .stage2_aarch64,
+            .stage2_arm,
+            .stage2_powerpc,
+            .stage2_riscv64,
+            .stage2_sparc64,
+            .stage2_x86_64,
+            .stage2_c,
+            => |backend_ct| @field(mir, tag(backend_ct)).deinit(gpa),
+        }
+    }
+};
+
+/// Runs code generation for a function. This process converts the `Air` emitted by `Sema`,
+/// alongside annotated `Liveness` data, to machine code in the form of MIR (see `AnyMir`).
+///
+/// This is supposed to be a "pure" process, but some backends are currently buggy; see
+/// `Zcu.Feature.separate_thread` for details.
 pub fn generateFunction(
     lf: *link.File,
     pt: Zcu.PerThread,
     src_loc: Zcu.LazySrcLoc,
     func_index: InternPool.Index,
-    air: Air,
-    liveness: Air.Liveness,
+    air: *const Air,
+    liveness: *const Air.Liveness,
+) CodeGenError!AnyMir {
+    const zcu = pt.zcu;
+    const func = zcu.funcInfo(func_index);
+    const target = zcu.navFileScope(func.owner_nav).mod.?.resolved_target.result;
+    switch (target_util.zigBackend(target, false)) {
+        else => unreachable,
+        inline .stage2_aarch64,
+        .stage2_arm,
+        .stage2_powerpc,
+        .stage2_riscv64,
+        .stage2_sparc64,
+        .stage2_x86_64,
+        .stage2_c,
+        => |backend| {
+            dev.check(devFeatureForBackend(backend));
+            const CodeGen = importBackend(backend);
+            const mir = try CodeGen.generate(lf, pt, src_loc, func_index, air, liveness);
+            return @unionInit(AnyMir, AnyMir.tag(backend), mir);
+        },
+    }
+}
+
+/// Converts the MIR returned by `generateFunction` to finalized machine code to be placed in
+/// the output binary. This is called from linker implementations, and may query linker state.
+///
+/// This function is not called for the C backend, as `link.C` directly understands its MIR.
+///
+/// The `air` parameter is not supposed to exist, but some backends are currently buggy; see
+/// `Zcu.Feature.separate_thread` for details.
+pub fn emitFunction(
+    lf: *link.File,
+    pt: Zcu.PerThread,
+    src_loc: Zcu.LazySrcLoc,
+    func_index: InternPool.Index,
+    any_mir: *const AnyMir,
     code: *std.ArrayListUnmanaged(u8),
     debug_output: link.File.DebugInfoOutput,
-) CodeGenError!void {
+    /// TODO: this parameter needs to be removed. We should not still hold AIR this late
+    /// in the pipeline. Any information needed to call emit must be stored in MIR.
+    /// This is `undefined` if the backend supports the `separate_thread` feature.
+    air: *const Air,
+) Allocator.Error!void {
     const zcu = pt.zcu;
     const func = zcu.funcInfo(func_index);
     const target = zcu.navFileScope(func.owner_nav).mod.?.resolved_target.result;
@@ -108,7 +196,8 @@ pub fn generateFunction(
         .stage2_x86_64,
         => |backend| {
             dev.check(devFeatureForBackend(backend));
-            return importBackend(backend).generate(lf, pt, src_loc, func_index, air, liveness, code, debug_output);
+            const mir = &@field(any_mir, AnyMir.tag(backend));
+            return mir.emit(lf, pt, src_loc, func_index, code, debug_output, air);
         },
     }
 }
src/Compilation.zig
@@ -43,7 +43,6 @@ const Air = @import("Air.zig");
 const Builtin = @import("Builtin.zig");
 const LlvmObject = @import("codegen/llvm.zig").Object;
 const dev = @import("dev.zig");
-const ThreadSafeQueue = @import("ThreadSafeQueue.zig").ThreadSafeQueue;
 
 pub const Config = @import("Compilation/Config.zig");
 
@@ -113,17 +112,7 @@ win32_resource_table: if (dev.env.supports(.win32_resource)) std.AutoArrayHashMa
 } = .{},
 
 link_diags: link.Diags,
-link_task_queue: ThreadSafeQueue(link.Task) = .empty,
-/// Ensure only 1 simultaneous call to `flushTaskQueue`.
-link_task_queue_safety: std.debug.SafetyLock = .{},
-/// If any tasks are queued up that depend on prelink being finished, they are moved
-/// here until prelink finishes.
-link_task_queue_postponed: std.ArrayListUnmanaged(link.Task) = .empty,
-/// Initialized with how many link input tasks are expected. After this reaches zero
-/// the linker will begin the prelink phase.
-/// Initialized in the Compilation main thread before the pipeline; modified only in
-/// the linker task thread.
-remaining_prelink_tasks: u32,
+link_task_queue: link.Queue = .empty,
 
 /// Set of work that can be represented by only flags to determine whether the
 /// work is queued or not.
@@ -846,15 +835,24 @@ pub const RcIncludes = enum {
 };
 
 const Job = union(enum) {
-    /// Corresponds to the task in `link.Task`.
-    /// Only needed for backends that haven't yet been updated to not race against Sema.
+    /// Given the generated AIR for a function, put it onto the code generation queue.
+    /// This `Job` exists (instead of the `link.ZcuTask` being directly queued) to ensure that
+    /// all types are resolved before the linker task is queued.
+    /// If the backend does not support `Zcu.Feature.separate_thread`, codegen and linking happen immediately.
+    codegen_func: struct {
+        func: InternPool.Index,
+        /// The AIR emitted from analyzing `func`; owned by this `Job` in `gpa`.
+        air: Air,
+    },
+    /// Queue a `link.ZcuTask` to emit this non-function `Nav` into the output binary.
+    /// This `Job` exists (instead of the `link.ZcuTask` being directly queued) to ensure that
+    /// all types are resolved before the linker task is queued.
+    /// If the backend does not support `Zcu.Feature.separate_thread`, the task is run immediately.
     link_nav: InternPool.Nav.Index,
-    /// Corresponds to the task in `link.Task`.
-    /// TODO: this is currently also responsible for performing codegen.
-    /// Only needed for backends that haven't yet been updated to not race against Sema.
-    link_func: link.Task.CodegenFunc,
-    /// Corresponds to the task in `link.Task`.
-    /// Only needed for backends that haven't yet been updated to not race against Sema.
+    /// Queue a `link.ZcuTask` to emit debug information for this container type.
+    /// This `Job` exists (instead of the `link.ZcuTask` being directly queued) to ensure that
+    /// all types are resolved before the linker task is queued.
+    /// If the backend does not support `Zcu.Feature.separate_thread`, the task is run immediately.
     link_type: InternPool.Index,
     update_line_number: InternPool.TrackedInst.Index,
     /// The `AnalUnit`, which is *not* a `func`, must be semantically analyzed.
@@ -880,13 +878,13 @@ const Job = union(enum) {
         return switch (tag) {
             // Prioritize functions so that codegen can get to work on them on a
             // separate thread, while Sema goes back to its own work.
-            .resolve_type_fully, .analyze_func, .link_func => 0,
+            .resolve_type_fully, .analyze_func, .codegen_func => 0,
             else => 1,
         };
     }
     comptime {
         // Job dependencies
-        assert(stage(.resolve_type_fully) <= stage(.link_func));
+        assert(stage(.resolve_type_fully) <= stage(.codegen_func));
     }
 };
 
@@ -2004,7 +2002,6 @@ pub fn create(gpa: Allocator, arena: Allocator, options: CreateOptions) !*Compil
             .file_system_inputs = options.file_system_inputs,
             .parent_whole_cache = options.parent_whole_cache,
             .link_diags = .init(gpa),
-            .remaining_prelink_tasks = 0,
         };
 
         // Prevent some footguns by making the "any" fields of config reflect
@@ -2213,7 +2210,7 @@ pub fn create(gpa: Allocator, arena: Allocator, options: CreateOptions) !*Compil
         };
         comp.c_object_table.putAssumeCapacityNoClobber(c_object, {});
     }
-    comp.remaining_prelink_tasks += @intCast(comp.c_object_table.count());
+    comp.link_task_queue.pending_prelink_tasks += @intCast(comp.c_object_table.count());
 
     // Add a `Win32Resource` for each `rc_source_files` and one for `manifest_file`.
     const win32_resource_count =
@@ -2224,7 +2221,7 @@ pub fn create(gpa: Allocator, arena: Allocator, options: CreateOptions) !*Compil
         // Add this after adding logic to updateWin32Resource to pass the
         // result into link.loadInput. loadInput integration is not implemented
         // for Windows linking logic yet.
-        //comp.remaining_prelink_tasks += @intCast(win32_resource_count);
+        //comp.link_task_queue.pending_prelink_tasks += @intCast(win32_resource_count);
         for (options.rc_source_files) |rc_source_file| {
             const win32_resource = try gpa.create(Win32Resource);
             errdefer gpa.destroy(win32_resource);
@@ -2275,78 +2272,76 @@ pub fn create(gpa: Allocator, arena: Allocator, options: CreateOptions) !*Compil
                     const paths = try lci.resolveCrtPaths(arena, basenames, target);
 
                     const fields = @typeInfo(@TypeOf(paths)).@"struct".fields;
-                    try comp.link_task_queue.shared.ensureUnusedCapacity(gpa, fields.len + 1);
+                    try comp.link_task_queue.queued_prelink.ensureUnusedCapacity(gpa, fields.len + 1);
                     inline for (fields) |field| {
                         if (@field(paths, field.name)) |path| {
-                            comp.link_task_queue.shared.appendAssumeCapacity(.{ .load_object = path });
-                            comp.remaining_prelink_tasks += 1;
+                            comp.link_task_queue.queued_prelink.appendAssumeCapacity(.{ .load_object = path });
                         }
                     }
                     // Loads the libraries provided by `target_util.libcFullLinkFlags(target)`.
-                    comp.link_task_queue.shared.appendAssumeCapacity(.load_host_libc);
-                    comp.remaining_prelink_tasks += 1;
+                    comp.link_task_queue.queued_prelink.appendAssumeCapacity(.load_host_libc);
                 } else if (target.isMuslLibC()) {
                     if (!std.zig.target.canBuildLibC(target)) return error.LibCUnavailable;
 
                     if (musl.needsCrt0(comp.config.output_mode, comp.config.link_mode, comp.config.pie)) |f| {
                         comp.queued_jobs.musl_crt_file[@intFromEnum(f)] = true;
-                        comp.remaining_prelink_tasks += 1;
+                        comp.link_task_queue.pending_prelink_tasks += 1;
                     }
                     switch (comp.config.link_mode) {
                         .static => comp.queued_jobs.musl_crt_file[@intFromEnum(musl.CrtFile.libc_a)] = true,
                         .dynamic => comp.queued_jobs.musl_crt_file[@intFromEnum(musl.CrtFile.libc_so)] = true,
                     }
-                    comp.remaining_prelink_tasks += 1;
+                    comp.link_task_queue.pending_prelink_tasks += 1;
                 } else if (target.isGnuLibC()) {
                     if (!std.zig.target.canBuildLibC(target)) return error.LibCUnavailable;
 
                     if (glibc.needsCrt0(comp.config.output_mode)) |f| {
                         comp.queued_jobs.glibc_crt_file[@intFromEnum(f)] = true;
-                        comp.remaining_prelink_tasks += 1;
+                        comp.link_task_queue.pending_prelink_tasks += 1;
                     }
                     comp.queued_jobs.glibc_shared_objects = true;
-                    comp.remaining_prelink_tasks += glibc.sharedObjectsCount(&target);
+                    comp.link_task_queue.pending_prelink_tasks += glibc.sharedObjectsCount(&target);
 
                     comp.queued_jobs.glibc_crt_file[@intFromEnum(glibc.CrtFile.libc_nonshared_a)] = true;
-                    comp.remaining_prelink_tasks += 1;
+                    comp.link_task_queue.pending_prelink_tasks += 1;
                 } else if (target.isFreeBSDLibC()) {
                     if (!std.zig.target.canBuildLibC(target)) return error.LibCUnavailable;
 
                     if (freebsd.needsCrt0(comp.config.output_mode)) |f| {
                         comp.queued_jobs.freebsd_crt_file[@intFromEnum(f)] = true;
-                        comp.remaining_prelink_tasks += 1;
+                        comp.link_task_queue.pending_prelink_tasks += 1;
                     }
 
                     comp.queued_jobs.freebsd_shared_objects = true;
-                    comp.remaining_prelink_tasks += freebsd.sharedObjectsCount();
+                    comp.link_task_queue.pending_prelink_tasks += freebsd.sharedObjectsCount();
                 } else if (target.isNetBSDLibC()) {
                     if (!std.zig.target.canBuildLibC(target)) return error.LibCUnavailable;
 
                     if (netbsd.needsCrt0(comp.config.output_mode)) |f| {
                         comp.queued_jobs.netbsd_crt_file[@intFromEnum(f)] = true;
-                        comp.remaining_prelink_tasks += 1;
+                        comp.link_task_queue.pending_prelink_tasks += 1;
                     }
 
                     comp.queued_jobs.netbsd_shared_objects = true;
-                    comp.remaining_prelink_tasks += netbsd.sharedObjectsCount();
+                    comp.link_task_queue.pending_prelink_tasks += netbsd.sharedObjectsCount();
                 } else if (target.isWasiLibC()) {
                     if (!std.zig.target.canBuildLibC(target)) return error.LibCUnavailable;
 
                     for (comp.wasi_emulated_libs) |crt_file| {
                         comp.queued_jobs.wasi_libc_crt_file[@intFromEnum(crt_file)] = true;
                     }
-                    comp.remaining_prelink_tasks += @intCast(comp.wasi_emulated_libs.len);
+                    comp.link_task_queue.pending_prelink_tasks += @intCast(comp.wasi_emulated_libs.len);
 
                     comp.queued_jobs.wasi_libc_crt_file[@intFromEnum(wasi_libc.execModelCrtFile(comp.config.wasi_exec_model))] = true;
                     comp.queued_jobs.wasi_libc_crt_file[@intFromEnum(wasi_libc.CrtFile.libc_a)] = true;
-                    comp.remaining_prelink_tasks += 2;
+                    comp.link_task_queue.pending_prelink_tasks += 2;
                 } else if (target.isMinGW()) {
                     if (!std.zig.target.canBuildLibC(target)) return error.LibCUnavailable;
 
                     const main_crt_file: mingw.CrtFile = if (is_dyn_lib) .dllcrt2_o else .crt2_o;
                     comp.queued_jobs.mingw_crt_file[@intFromEnum(main_crt_file)] = true;
                     comp.queued_jobs.mingw_crt_file[@intFromEnum(mingw.CrtFile.libmingw32_lib)] = true;
-                    comp.remaining_prelink_tasks += 2;
+                    comp.link_task_queue.pending_prelink_tasks += 2;
 
                     // When linking mingw-w64 there are some import libs we always need.
                     try comp.windows_libs.ensureUnusedCapacity(gpa, mingw.always_link_libs.len);
@@ -2360,7 +2355,7 @@ pub fn create(gpa: Allocator, arena: Allocator, options: CreateOptions) !*Compil
                     target.isMinGW())
                 {
                     comp.queued_jobs.zigc_lib = true;
-                    comp.remaining_prelink_tasks += 1;
+                    comp.link_task_queue.pending_prelink_tasks += 1;
                 }
             }
 
@@ -2377,53 +2372,53 @@ pub fn create(gpa: Allocator, arena: Allocator, options: CreateOptions) !*Compil
             }
             if (comp.wantBuildLibUnwindFromSource()) {
                 comp.queued_jobs.libunwind = true;
-                comp.remaining_prelink_tasks += 1;
+                comp.link_task_queue.pending_prelink_tasks += 1;
             }
             if (build_options.have_llvm and is_exe_or_dyn_lib and comp.config.link_libcpp) {
                 comp.queued_jobs.libcxx = true;
                 comp.queued_jobs.libcxxabi = true;
-                comp.remaining_prelink_tasks += 2;
+                comp.link_task_queue.pending_prelink_tasks += 2;
             }
             if (build_options.have_llvm and is_exe_or_dyn_lib and comp.config.any_sanitize_thread) {
                 comp.queued_jobs.libtsan = true;
-                comp.remaining_prelink_tasks += 1;
+                comp.link_task_queue.pending_prelink_tasks += 1;
             }
 
             if (can_build_compiler_rt) {
                 if (comp.compiler_rt_strat == .lib) {
                     log.debug("queuing a job to build compiler_rt_lib", .{});
                     comp.queued_jobs.compiler_rt_lib = true;
-                    comp.remaining_prelink_tasks += 1;
+                    comp.link_task_queue.pending_prelink_tasks += 1;
                 } else if (comp.compiler_rt_strat == .obj) {
                     log.debug("queuing a job to build compiler_rt_obj", .{});
                     // In this case we are making a static library, so we ask
                     // for a compiler-rt object to put in it.
                     comp.queued_jobs.compiler_rt_obj = true;
-                    comp.remaining_prelink_tasks += 1;
+                    comp.link_task_queue.pending_prelink_tasks += 1;
                 }
 
                 if (comp.ubsan_rt_strat == .lib) {
                     log.debug("queuing a job to build ubsan_rt_lib", .{});
                     comp.queued_jobs.ubsan_rt_lib = true;
-                    comp.remaining_prelink_tasks += 1;
+                    comp.link_task_queue.pending_prelink_tasks += 1;
                 } else if (comp.ubsan_rt_strat == .obj) {
                     log.debug("queuing a job to build ubsan_rt_obj", .{});
                     comp.queued_jobs.ubsan_rt_obj = true;
-                    comp.remaining_prelink_tasks += 1;
+                    comp.link_task_queue.pending_prelink_tasks += 1;
                 }
 
                 if (is_exe_or_dyn_lib and comp.config.any_fuzz) {
                     log.debug("queuing a job to build libfuzzer", .{});
                     comp.queued_jobs.fuzzer_lib = true;
-                    comp.remaining_prelink_tasks += 1;
+                    comp.link_task_queue.pending_prelink_tasks += 1;
                 }
             }
         }
 
-        try comp.link_task_queue.shared.append(gpa, .load_explicitly_provided);
-        comp.remaining_prelink_tasks += 1;
+        try comp.link_task_queue.queued_prelink.append(gpa, .load_explicitly_provided);
     }
-    log.debug("total prelink tasks: {d}", .{comp.remaining_prelink_tasks});
+    log.debug("queued prelink tasks: {d}", .{comp.link_task_queue.queued_prelink.items.len});
+    log.debug("pending prelink tasks: {d}", .{comp.link_task_queue.pending_prelink_tasks});
 
     return comp;
 }
@@ -2431,6 +2426,10 @@ pub fn create(gpa: Allocator, arena: Allocator, options: CreateOptions) !*Compil
 pub fn destroy(comp: *Compilation) void {
     const gpa = comp.gpa;
 
+    // This needs to be destroyed first, because it might contain MIR, and which kind of
+    // MIR it is can only be determined from `comp.bin_file`.
+    comp.link_task_queue.deinit(comp);
+
     if (comp.bin_file) |lf| lf.destroy();
     if (comp.zcu) |zcu| zcu.deinit();
     comp.cache_use.deinit();
@@ -2512,8 +2511,6 @@ pub fn destroy(comp: *Compilation) void {
     comp.failed_win32_resources.deinit(gpa);
 
     comp.link_diags.deinit();
-    comp.link_task_queue.deinit(gpa);
-    comp.link_task_queue_postponed.deinit(gpa);
 
     comp.clearMiscFailures();
 
@@ -4180,9 +4177,7 @@ fn performAllTheWorkInner(
     comp.link_task_wait_group.reset();
     defer comp.link_task_wait_group.wait();
 
-    if (comp.link_task_queue.start()) {
-        comp.thread_pool.spawnWgId(&comp.link_task_wait_group, link.flushTaskQueue, .{comp});
-    }
+    comp.link_task_queue.start(comp);
 
     if (comp.docs_emit != null) {
         dev.check(.docs_emit);
@@ -4498,7 +4493,7 @@ fn performAllTheWorkInner(
         comp.link_task_wait_group.wait();
         comp.link_task_wait_group.reset();
         std.log.scoped(.link).debug("finished waiting for link_task_wait_group", .{});
-        if (comp.remaining_prelink_tasks > 0) {
+        if (comp.link_task_queue.pending_prelink_tasks > 0) {
             // Indicates an error occurred preventing prelink phase from completing.
             return;
         }
@@ -4543,6 +4538,45 @@ pub fn queueJobs(comp: *Compilation, jobs: []const Job) !void {
 
 fn processOneJob(tid: usize, comp: *Compilation, job: Job) JobError!void {
     switch (job) {
+        .codegen_func => |func| {
+            const zcu = comp.zcu.?;
+            const gpa = zcu.gpa;
+            var air = func.air;
+            errdefer air.deinit(gpa);
+            if (!air.typesFullyResolved(zcu)) {
+                // Type resolution failed in a way which affects this function. This is a transitive
+                // failure, but it doesn't need recording, because this function semantically depends
+                // on the failed type, so when it is changed the function is updated.
+                air.deinit(gpa);
+                return;
+            }
+            const pt: Zcu.PerThread = .activate(comp.zcu.?, @enumFromInt(tid));
+            defer pt.deactivate();
+            const shared_mir = try gpa.create(link.ZcuTask.LinkFunc.SharedMir);
+            shared_mir.* = .{
+                .status = .init(.pending),
+                .value = undefined,
+            };
+            if (comp.separateCodegenThreadOk()) {
+                // `workerZcuCodegen` takes ownership of `air`.
+                comp.thread_pool.spawnWgId(&comp.link_task_wait_group, workerZcuCodegen, .{ comp, func.func, air, shared_mir });
+                comp.dispatchZcuLinkTask(tid, .{ .link_func = .{
+                    .func = func.func,
+                    .mir = shared_mir,
+                    .air = undefined,
+                } });
+            } else {
+                const emit_needs_air = !zcu.backendSupportsFeature(.separate_thread);
+                pt.runCodegen(func.func, &air, shared_mir);
+                assert(shared_mir.status.load(.monotonic) != .pending);
+                comp.dispatchZcuLinkTask(tid, .{ .link_func = .{
+                    .func = func.func,
+                    .mir = shared_mir,
+                    .air = if (emit_needs_air) &air else undefined,
+                } });
+                air.deinit(gpa);
+            }
+        },
         .link_nav => |nav_index| {
             const zcu = comp.zcu.?;
             const nav = zcu.intern_pool.getNav(nav_index);
@@ -4559,17 +4593,7 @@ fn processOneJob(tid: usize, comp: *Compilation, job: Job) JobError!void {
                 // on the failed type, so when it is changed the `Nav` will be updated.
                 return;
             }
-            comp.dispatchLinkTask(tid, .{ .link_nav = nav_index });
-        },
-        .link_func => |func| {
-            const zcu = comp.zcu.?;
-            if (!func.air.typesFullyResolved(zcu)) {
-                // Type resolution failed in a way which affects this function. This is a transitive
-                // failure, but it doesn't need recording, because this function semantically depends
-                // on the failed type, so when it is changed the function is updated.
-                return;
-            }
-            comp.dispatchLinkTask(tid, .{ .link_func = func });
+            comp.dispatchZcuLinkTask(tid, .{ .link_nav = nav_index });
         },
         .link_type => |ty| {
             const zcu = comp.zcu.?;
@@ -4580,10 +4604,10 @@ fn processOneJob(tid: usize, comp: *Compilation, job: Job) JobError!void {
                 // on the failed type, so when that is changed, this type will be updated.
                 return;
             }
-            comp.dispatchLinkTask(tid, .{ .link_type = ty });
+            comp.dispatchZcuLinkTask(tid, .{ .link_type = ty });
         },
         .update_line_number => |ti| {
-            comp.dispatchLinkTask(tid, .{ .update_line_number = ti });
+            comp.dispatchZcuLinkTask(tid, .{ .update_line_number = ti });
         },
         .analyze_func => |func| {
             const named_frame = tracy.namedFrame("analyze_func");
@@ -4675,18 +4699,7 @@ fn processOneJob(tid: usize, comp: *Compilation, job: Job) JobError!void {
     }
 }
 
-/// The reason for the double-queue here is that the first queue ensures any
-/// resolve_type_fully tasks are complete before this dispatch function is called.
-fn dispatchLinkTask(comp: *Compilation, tid: usize, link_task: link.Task) void {
-    if (comp.separateCodegenThreadOk()) {
-        comp.queueLinkTasks(&.{link_task});
-    } else {
-        assert(comp.remaining_prelink_tasks == 0);
-        link.doTask(comp, tid, link_task);
-    }
-}
-
-fn separateCodegenThreadOk(comp: *const Compilation) bool {
+pub fn separateCodegenThreadOk(comp: *const Compilation) bool {
     if (InternPool.single_threaded) return false;
     const zcu = comp.zcu orelse return true;
     return zcu.backendSupportsFeature(.separate_thread);
@@ -5273,6 +5286,21 @@ pub const RtOptions = struct {
     allow_lto: bool = true,
 };
 
+fn workerZcuCodegen(
+    tid: usize,
+    comp: *Compilation,
+    func_index: InternPool.Index,
+    orig_air: Air,
+    out: *link.ZcuTask.LinkFunc.SharedMir,
+) void {
+    var air = orig_air;
+    // We own `air` now, so we are responsible for freeing it.
+    defer air.deinit(comp.gpa);
+    const pt: Zcu.PerThread = .activate(comp.zcu.?, @enumFromInt(tid));
+    defer pt.deactivate();
+    pt.runCodegen(func_index, &air, out);
+}
+
 fn buildRt(
     comp: *Compilation,
     root_source_name: []const u8,
@@ -5804,7 +5832,7 @@ fn updateCObject(comp: *Compilation, c_object: *CObject, c_obj_prog_node: std.Pr
         },
     };
 
-    comp.queueLinkTasks(&.{.{ .load_object = c_object.status.success.object_path }});
+    comp.queuePrelinkTasks(&.{.{ .load_object = c_object.status.success.object_path }});
 }
 
 fn updateWin32Resource(comp: *Compilation, win32_resource: *Win32Resource, win32_resource_prog_node: std.Progress.Node) !void {
@@ -7237,7 +7265,7 @@ fn buildOutputFromZig(
     assert(out.* == null);
     out.* = crt_file;
 
-    comp.queueLinkTaskMode(crt_file.full_object_path, &config);
+    comp.queuePrelinkTaskMode(crt_file.full_object_path, &config);
 }
 
 pub const CrtFileOptions = struct {
@@ -7361,7 +7389,7 @@ pub fn build_crt_file(
     try comp.updateSubCompilation(sub_compilation, misc_task_tag, prog_node);
 
     const crt_file = try sub_compilation.toCrtFile();
-    comp.queueLinkTaskMode(crt_file.full_object_path, &config);
+    comp.queuePrelinkTaskMode(crt_file.full_object_path, &config);
 
     {
         comp.mutex.lock();
@@ -7371,8 +7399,8 @@ pub fn build_crt_file(
     }
 }
 
-pub fn queueLinkTaskMode(comp: *Compilation, path: Cache.Path, config: *const Compilation.Config) void {
-    comp.queueLinkTasks(switch (config.output_mode) {
+pub fn queuePrelinkTaskMode(comp: *Compilation, path: Cache.Path, config: *const Compilation.Config) void {
+    comp.queuePrelinkTasks(switch (config.output_mode) {
         .Exe => unreachable,
         .Obj => &.{.{ .load_object = path }},
         .Lib => &.{switch (config.link_mode) {
@@ -7384,12 +7412,30 @@ pub fn queueLinkTaskMode(comp: *Compilation, path: Cache.Path, config: *const Co
 
 /// Only valid to call during `update`. Automatically handles queuing up a
 /// linker worker task if there is not already one.
-pub fn queueLinkTasks(comp: *Compilation, tasks: []const link.Task) void {
-    if (comp.link_task_queue.enqueue(comp.gpa, tasks) catch |err| switch (err) {
+pub fn queuePrelinkTasks(comp: *Compilation, tasks: []const link.PrelinkTask) void {
+    comp.link_task_queue.enqueuePrelink(comp, tasks) catch |err| switch (err) {
         error.OutOfMemory => return comp.setAllocFailure(),
-    }) {
-        comp.thread_pool.spawnWgId(&comp.link_task_wait_group, link.flushTaskQueue, .{comp});
+    };
+}
+
+/// The reason for the double-queue here (a `Job`, then a `link.ZcuTask`) is that the first queue
+/// ensures any `resolve_type_fully` jobs are complete before this dispatch function is called.
+fn dispatchZcuLinkTask(comp: *Compilation, tid: usize, task: link.ZcuTask) void {
+    if (!comp.separateCodegenThreadOk()) {
+        assert(tid == 0);
+        if (task == .link_func) {
+            assert(task.link_func.mir.status.load(.monotonic) != .pending);
+        }
+        link.doZcuTask(comp, tid, task);
+        task.deinit(comp.zcu.?);
+        return;
     }
+    comp.link_task_queue.enqueueZcu(comp, task) catch |err| switch (err) {
+        error.OutOfMemory => {
+            task.deinit(comp.zcu.?);
+            comp.setAllocFailure();
+        },
+    };
 }
 
 pub fn toCrtFile(comp: *Compilation) Allocator.Error!CrtFile {
src/dev.zig
@@ -25,6 +25,9 @@ pub const Env = enum {
     /// - `zig build-* -fno-emit-bin`
     sema,
 
+    /// - `zig build-* -ofmt=c`
+    cbe,
+
     /// - sema
     /// - `zig build-* -fincremental -fno-llvm -fno-lld -target x86_64-linux --listen=-`
     @"x86_64-linux",
@@ -144,6 +147,12 @@ pub const Env = enum {
                 => true,
                 else => Env.ast_gen.supports(feature),
             },
+            .cbe => switch (feature) {
+                .c_backend,
+                .c_linker,
+                => true,
+                else => Env.sema.supports(feature),
+            },
             .@"x86_64-linux" => switch (feature) {
                 .build_command,
                 .stdio_listen,
src/link.zig
@@ -21,11 +21,11 @@ const Type = @import("Type.zig");
 const Value = @import("Value.zig");
 const Package = @import("Package.zig");
 const dev = @import("dev.zig");
-const ThreadSafeQueue = @import("ThreadSafeQueue.zig").ThreadSafeQueue;
 const target_util = @import("target.zig");
 const codegen = @import("codegen.zig");
 
 pub const LdScript = @import("link/LdScript.zig");
+pub const Queue = @import("link/Queue.zig");
 
 pub const Diags = struct {
     /// Stored here so that function definitions can distinguish between
@@ -741,21 +741,26 @@ pub const File = struct {
     }
 
     /// May be called before or after updateExports for any given Decl.
-    /// TODO: currently `pub` because `Zcu.PerThread` is calling this.
+    /// The active tag of `mir` is determined by the backend used for the module this function is in.
     /// Never called when LLVM is codegenning the ZCU.
-    pub fn updateFunc(
+    fn updateFunc(
         base: *File,
         pt: Zcu.PerThread,
         func_index: InternPool.Index,
-        air: Air,
-        liveness: Air.Liveness,
+        /// This is owned by the caller, but the callee is permitted to mutate it provided
+        /// that `mir.deinit` remains legal for the caller. For instance, the callee can
+        /// take ownership of an embedded slice and replace it with `&.{}` in `mir`.
+        mir: *codegen.AnyMir,
+        /// This may be `undefined`; only pass it to `emitFunction`.
+        /// This parameter will eventually be removed.
+        maybe_undef_air: *const Air,
     ) UpdateNavError!void {
         assert(base.comp.zcu.?.llvm_object == null);
         switch (base.tag) {
             .lld => unreachable,
             inline else => |tag| {
                 dev.check(tag.devFeature());
-                return @as(*tag.Type(), @fieldParentPtr("base", base)).updateFunc(pt, func_index, air, liveness);
+                return @as(*tag.Type(), @fieldParentPtr("base", base)).updateFunc(pt, func_index, mir, maybe_undef_air);
             },
         }
     }
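// Illustrative sketch, not part of this change: the ownership rule on the `mir` parameter above,
// shown with a stand-in type rather than `codegen.AnyMir`. The callee may take an embedded slice
// out of `mir` as long as the caller's `deinit` remains legal, e.g. by swapping in an empty slice.
const std = @import("std");
const ExampleMir = struct {
    code: []u8,
    fn deinit(mir: *ExampleMir, gpa: std.mem.Allocator) void {
        gpa.free(mir.code); // freeing a zero-length slice is a no-op, so this stays legal
    }
};
fn exampleTakeCode(mir: *ExampleMir) []u8 {
    const code = mir.code;
    mir.code = &.{}; // caller keeps a valid (now empty) `mir`
    return code; // callee now owns `code` and must eventually free it
}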
@@ -1213,40 +1218,7 @@ pub const File = struct {
     pub const Dwarf = @import("link/Dwarf.zig");
 };
 
-/// Does all the tasks in the queue. Runs in exactly one separate thread
-/// from the rest of compilation. All tasks performed here are
-/// single-threaded with respect to one another.
-pub fn flushTaskQueue(tid: usize, comp: *Compilation) void {
-    const diags = &comp.link_diags;
-    // As soon as check() is called, another `flushTaskQueue` call could occur,
-    // so the safety lock must go after the check.
-    while (comp.link_task_queue.check()) |tasks| {
-        comp.link_task_queue_safety.lock();
-        defer comp.link_task_queue_safety.unlock();
-
-        if (comp.remaining_prelink_tasks > 0) {
-            comp.link_task_queue_postponed.ensureUnusedCapacity(comp.gpa, tasks.len) catch |err| switch (err) {
-                error.OutOfMemory => return diags.setAllocFailure(),
-            };
-        }
-
-        for (tasks) |task| doTask(comp, tid, task);
-
-        if (comp.remaining_prelink_tasks == 0) {
-            if (comp.bin_file) |base| if (!base.post_prelink) {
-                base.prelink(comp.work_queue_progress_node) catch |err| switch (err) {
-                    error.OutOfMemory => diags.setAllocFailure(),
-                    error.LinkFailure => continue,
-                };
-                base.post_prelink = true;
-                for (comp.link_task_queue_postponed.items) |task| doTask(comp, tid, task);
-                comp.link_task_queue_postponed.clearRetainingCapacity();
-            };
-        }
-    }
-}
-
-pub const Task = union(enum) {
+pub const PrelinkTask = union(enum) {
     /// Loads the objects, shared objects, and archives that are already
     /// known from the command line.
     load_explicitly_provided,
@@ -1264,31 +1236,70 @@ pub const Task = union(enum) {
     /// Tells the linker to load an input which could be an object file,
     /// archive, or shared library.
     load_input: Input,
-
+};
+pub const ZcuTask = union(enum) {
     /// Write the constant value for a Decl to the output file.
     link_nav: InternPool.Nav.Index,
     /// Write the machine code for a function to the output file.
-    link_func: CodegenFunc,
+    link_func: LinkFunc,
     link_type: InternPool.Index,
-
     update_line_number: InternPool.TrackedInst.Index,
-
-    pub const CodegenFunc = struct {
+    pub fn deinit(task: ZcuTask, zcu: *const Zcu) void {
+        switch (task) {
+            .link_nav,
+            .link_type,
+            .update_line_number,
+            => {},
+            .link_func => |link_func| {
+                switch (link_func.mir.status.load(.monotonic)) {
+                    .pending => unreachable, // cannot deinit until MIR done
+                    .failed => {}, // MIR not populated so doesn't need freeing
+                    .ready => link_func.mir.value.deinit(zcu),
+                }
+                zcu.gpa.destroy(link_func.mir);
+            },
+        }
+    }
+    pub const LinkFunc = struct {
         /// This will either be a non-generic `func_decl` or a `func_instance`.
         func: InternPool.Index,
-        /// This `Air` is owned by the `Job` and allocated with `gpa`.
-        /// It must be deinited when the job is processed.
-        air: Air,
+        /// This pointer is allocated into `gpa` and must be freed when the `ZcuTask` is processed.
+        /// The pointer is shared with the codegen worker, which will populate the MIR inside once
+        /// it has been generated. It's important that the `link_func` is queued at the same time as
+        /// the codegen job to ensure that the linker receives functions in a deterministic order,
+        /// allowing reproducible builds.
+        mir: *SharedMir,
+        /// This field exists only due to deficiencies in some codegen implementations; it should
+        /// be removed when the corresponding parameter of `CodeGen.emitFunction` can be removed.
+        /// This is `undefined` if `Zcu.Feature.separate_thread` is supported.
+        /// If this is defined, its memory is owned externally; do not `deinit` this `air`.
+        air: *const Air,
+
+        pub const SharedMir = struct {
+            /// This is initially `.pending`. When `value` is populated, the codegen thread will set
+            /// this to `.ready`, and alert the queue if needed. It could also end up `.failed`.
+            /// The action of storing a value (other than `.pending`) to this atomic transfers
+            /// ownership of memory associated with `value` to this `ZcuTask`.
+            status: std.atomic.Value(enum(u8) {
+                /// We are waiting on codegen to generate MIR (or die trying).
+                pending,
+                /// `value` is not populated and will not be populated. Just drop the task from the queue and move on.
+                failed,
+                /// `value` is populated with the MIR from the backend in use (never LLVM,
+                /// since the LLVM backend does not produce MIR).
+                ready,
+            }),
+            /// This is `undefined` until `status` is set to `.ready`. Once populated, this MIR belongs
+            /// to the `ZcuTask`, and must be `deinit`ed when it is processed. Allocated into `gpa`.
+            value: codegen.AnyMir,
+        };
     };
 };
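// Illustrative sketch, not part of this change: the `SharedMir` handshake described above, with
// stand-in types. The producer populates the payload first and then publishes a non-`pending`
// status, transferring ownership of the payload to whoever drains the task. (The real code also
// synchronizes through the link queue, so it can use `.monotonic` ordering.)
const std = @import("std");
const ExampleStatus = enum(u8) { pending, failed, ready };
const ExampleShared = struct {
    status: std.atomic.Value(ExampleStatus) = .init(.pending),
    value: u32 = undefined,
};
fn exampleCodegenThread(shared: *ExampleShared) void {
    shared.value = 42; // populate the payload first...
    shared.status.store(.ready, .release); // ...then publish it
}
fn exampleLinkerThread(shared: *ExampleShared) ?u32 {
    return switch (shared.status.load(.acquire)) {
        .pending => null, // codegen not finished yet; keep the task queued
        .failed => null, // nothing was produced; drop the task
        .ready => shared.value, // ownership of the payload transfers here
    };
}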
 
-pub fn doTask(comp: *Compilation, tid: usize, task: Task) void {
+pub fn doPrelinkTask(comp: *Compilation, task: PrelinkTask) void {
     const diags = &comp.link_diags;
+    const base = comp.bin_file orelse return;
     switch (task) {
         .load_explicitly_provided => {
-            comp.remaining_prelink_tasks -= 1;
-            const base = comp.bin_file orelse return;
-
             const prog_node = comp.work_queue_progress_node.start("Parse Linker Inputs", comp.link_inputs.len);
             defer prog_node.end();
             for (comp.link_inputs) |input| {
@@ -1306,9 +1317,6 @@ pub fn doTask(comp: *Compilation, tid: usize, task: Task) void {
             }
         },
         .load_host_libc => {
-            comp.remaining_prelink_tasks -= 1;
-            const base = comp.bin_file orelse return;
-
             const prog_node = comp.work_queue_progress_node.start("Linker Parse Host libc", 0);
             defer prog_node.end();
 
@@ -1368,8 +1376,6 @@ pub fn doTask(comp: *Compilation, tid: usize, task: Task) void {
             }
         },
         .load_object => |path| {
-            comp.remaining_prelink_tasks -= 1;
-            const base = comp.bin_file orelse return;
             const prog_node = comp.work_queue_progress_node.start("Linker Parse Object", 0);
             defer prog_node.end();
             base.openLoadObject(path) catch |err| switch (err) {
@@ -1378,8 +1384,6 @@ pub fn doTask(comp: *Compilation, tid: usize, task: Task) void {
             };
         },
         .load_archive => |path| {
-            comp.remaining_prelink_tasks -= 1;
-            const base = comp.bin_file orelse return;
             const prog_node = comp.work_queue_progress_node.start("Linker Parse Archive", 0);
             defer prog_node.end();
             base.openLoadArchive(path, null) catch |err| switch (err) {
@@ -1388,8 +1392,6 @@ pub fn doTask(comp: *Compilation, tid: usize, task: Task) void {
             };
         },
         .load_dso => |path| {
-            comp.remaining_prelink_tasks -= 1;
-            const base = comp.bin_file orelse return;
             const prog_node = comp.work_queue_progress_node.start("Linker Parse Shared Library", 0);
             defer prog_node.end();
             base.openLoadDso(path, .{
@@ -1401,8 +1403,6 @@ pub fn doTask(comp: *Compilation, tid: usize, task: Task) void {
             };
         },
         .load_input => |input| {
-            comp.remaining_prelink_tasks -= 1;
-            const base = comp.bin_file orelse return;
             const prog_node = comp.work_queue_progress_node.start("Linker Parse Input", 0);
             defer prog_node.end();
             base.loadInput(input) catch |err| switch (err) {
@@ -1416,11 +1416,12 @@ pub fn doTask(comp: *Compilation, tid: usize, task: Task) void {
                 },
             };
         },
+    }
+}
+pub fn doZcuTask(comp: *Compilation, tid: usize, task: ZcuTask) void {
+    const diags = &comp.link_diags;
+    switch (task) {
         .link_nav => |nav_index| {
-            if (comp.remaining_prelink_tasks != 0) {
-                comp.link_task_queue_postponed.appendAssumeCapacity(task);
-                return;
-            }
             const zcu = comp.zcu.?;
             const pt: Zcu.PerThread = .activate(zcu, @enumFromInt(tid));
             defer pt.deactivate();
@@ -1431,39 +1432,43 @@ pub fn doTask(comp: *Compilation, tid: usize, task: Task) void {
             } else if (comp.bin_file) |lf| {
                 lf.updateNav(pt, nav_index) catch |err| switch (err) {
                     error.OutOfMemory => diags.setAllocFailure(),
-                    error.CodegenFail => assert(zcu.failed_codegen.contains(nav_index)),
+                    error.CodegenFail => zcu.assertCodegenFailed(nav_index),
                     error.Overflow, error.RelocationNotByteAligned => {
-                        zcu.failed_codegen.ensureUnusedCapacity(zcu.gpa, 1) catch return diags.setAllocFailure();
-                        const msg = Zcu.ErrorMsg.create(
-                            zcu.gpa,
-                            zcu.navSrcLoc(nav_index),
-                            "unable to codegen: {s}",
-                            .{@errorName(err)},
-                        ) catch return diags.setAllocFailure();
-                        zcu.failed_codegen.putAssumeCapacityNoClobber(nav_index, msg);
+                        switch (zcu.codegenFail(nav_index, "unable to codegen: {s}", .{@errorName(err)})) {
+                            error.CodegenFail => return,
+                            error.OutOfMemory => return diags.setAllocFailure(),
+                        }
                         // Not a retryable failure.
                     },
                 };
             }
         },
         .link_func => |func| {
-            if (comp.remaining_prelink_tasks != 0) {
-                comp.link_task_queue_postponed.appendAssumeCapacity(task);
-                return;
-            }
-            const pt: Zcu.PerThread = .activate(comp.zcu.?, @enumFromInt(tid));
+            const zcu = comp.zcu.?;
+            const nav = zcu.funcInfo(func.func).owner_nav;
+            const pt: Zcu.PerThread = .activate(zcu, @enumFromInt(tid));
             defer pt.deactivate();
-            var air = func.air;
-            defer air.deinit(comp.gpa);
-            pt.linkerUpdateFunc(func.func, &air) catch |err| switch (err) {
-                error.OutOfMemory => diags.setAllocFailure(),
-            };
+            assert(zcu.llvm_object == null); // LLVM codegen doesn't produce MIR
+            switch (func.mir.status.load(.monotonic)) {
+                .pending => unreachable,
+                .ready => {},
+                .failed => return,
+            }
+            const mir = &func.mir.value;
+            if (comp.bin_file) |lf| {
+                lf.updateFunc(pt, func.func, mir, func.air) catch |err| switch (err) {
+                    error.OutOfMemory => return diags.setAllocFailure(),
+                    error.CodegenFail => return zcu.assertCodegenFailed(nav),
+                    error.Overflow, error.RelocationNotByteAligned => {
+                        switch (zcu.codegenFail(nav, "unable to codegen: {s}", .{@errorName(err)})) {
+                            error.OutOfMemory => return diags.setAllocFailure(),
+                            error.CodegenFail => return,
+                        }
+                    },
+                };
+            }
         },
         .link_type => |ty| {
-            if (comp.remaining_prelink_tasks != 0) {
-                comp.link_task_queue_postponed.appendAssumeCapacity(task);
-                return;
-            }
             const zcu = comp.zcu.?;
             const pt: Zcu.PerThread = .activate(zcu, @enumFromInt(tid));
             defer pt.deactivate();
@@ -1477,10 +1482,6 @@ pub fn doTask(comp: *Compilation, tid: usize, task: Task) void {
             }
         },
         .update_line_number => |ti| {
-            if (comp.remaining_prelink_tasks != 0) {
-                comp.link_task_queue_postponed.appendAssumeCapacity(task);
-                return;
-            }
             const pt: Zcu.PerThread = .activate(comp.zcu.?, @enumFromInt(tid));
             defer pt.deactivate();
             if (pt.zcu.llvm_object == null) {
src/target.zig
@@ -850,7 +850,9 @@ pub inline fn backendSupportsFeature(backend: std.builtin.CompilerBackend, compt
         },
         .separate_thread => switch (backend) {
             .stage2_llvm => false,
-            else => true,
+            // TODO: enable for other self-hosted backends once they support the separate codegen/link pipeline.
+            .stage2_c => true,
+            else => false,
         },
     };
 }
src/ThreadSafeQueue.zig
@@ -1,72 +0,0 @@
-const std = @import("std");
-const assert = std.debug.assert;
-const Allocator = std.mem.Allocator;
-
-pub fn ThreadSafeQueue(comptime T: type) type {
-    return struct {
-        worker_owned: std.ArrayListUnmanaged(T),
-        /// Protected by `mutex`.
-        shared: std.ArrayListUnmanaged(T),
-        mutex: std.Thread.Mutex,
-        state: State,
-
-        const Self = @This();
-
-        pub const State = enum { wait, run };
-
-        pub const empty: Self = .{
-            .worker_owned = .empty,
-            .shared = .empty,
-            .mutex = .{},
-            .state = .wait,
-        };
-
-        pub fn deinit(self: *Self, gpa: Allocator) void {
-            self.worker_owned.deinit(gpa);
-            self.shared.deinit(gpa);
-            self.* = undefined;
-        }
-
-        /// Must be called from the worker thread.
-        pub fn check(self: *Self) ?[]T {
-            assert(self.worker_owned.items.len == 0);
-            {
-                self.mutex.lock();
-                defer self.mutex.unlock();
-                assert(self.state == .run);
-                if (self.shared.items.len == 0) {
-                    self.state = .wait;
-                    return null;
-                }
-                std.mem.swap(std.ArrayListUnmanaged(T), &self.worker_owned, &self.shared);
-            }
-            const result = self.worker_owned.items;
-            self.worker_owned.clearRetainingCapacity();
-            return result;
-        }
-
-        /// Adds items to the queue, returning true if and only if the worker
-        /// thread is waiting. Thread-safe.
-        /// Not safe to call from the worker thread.
-        pub fn enqueue(self: *Self, gpa: Allocator, items: []const T) error{OutOfMemory}!bool {
-            self.mutex.lock();
-            defer self.mutex.unlock();
-            try self.shared.appendSlice(gpa, items);
-            return switch (self.state) {
-                .run => false,
-                .wait => {
-                    self.state = .run;
-                    return true;
-                },
-            };
-        }
-
-        /// Safe only to call exactly once when initially starting the worker.
-        pub fn start(self: *Self) bool {
-            assert(self.state == .wait);
-            if (self.shared.items.len == 0) return false;
-            self.state = .run;
-            return true;
-        }
-    };
-}
src/Zcu.zig
@@ -171,6 +171,8 @@ transitive_failed_analysis: std.AutoArrayHashMapUnmanaged(AnalUnit, void) = .emp
 /// This `Nav` succeeded analysis, but failed codegen.
 /// This may be a simple "value" `Nav`, or it may be a function.
 /// The ErrorMsg memory is owned by the `AnalUnit`, using Module's general purpose allocator.
+/// While multiple threads are active (most of the time!), this is guarded by `zcu.comp.mutex`, as
+/// codegen and linking run on separate threads.
 failed_codegen: std.AutoArrayHashMapUnmanaged(InternPool.Nav.Index, *ErrorMsg) = .empty,
 failed_types: std.AutoArrayHashMapUnmanaged(InternPool.Index, *ErrorMsg) = .empty,
 /// Keep track of `@compileLog`s per `AnalUnit`.
@@ -3817,7 +3819,36 @@ pub const Feature = enum {
     is_named_enum_value,
     error_set_has_value,
     field_reordering,
-    /// If the backend supports running from another thread.
+    /// In theory, backends are supposed to work like this:
+    ///
+    /// * The AIR emitted by `Sema` is converted into MIR by `codegen.generateFunction`. This pass
+    ///   is "pure", in that it does not depend on or modify any external mutable state.
+    ///
+    /// * That MIR is sent to the linker, which calls `codegen.emitFunction` to convert the MIR to
+    ///   finalized machine code. This process is permitted to query and modify linker state.
+    ///
+    /// * The linker stores the resulting machine code in the binary as needed.
+    ///
+/// The first stage described above can run in parallel with the rest of the compiler, and even with
+/// other code generation work; because this pass is pure, we can run as many codegen threads as we
+/// want in parallel. Emit and link must be single-threaded, but they are generally very fast, so
+/// that isn't a problem.
+    ///
+    /// Unfortunately, some code generation implementations currently query and/or mutate linker state
+    /// or even (in the case of the LLVM backend) semantic analysis state. Such backends cannot be run
+    /// in parallel with each other, with linking, or (potentially) with semantic analysis.
+    ///
+    /// Additionally, some backends continue to need the AIR in the "emit" stage, despite this pass
+    /// operating on MIR. This complicates memory management under the threading model above.
+    ///
+    /// These are both **bugs** in backend implementations, left over from legacy code. However, they
+    /// are difficult to fix. So, this `Feature` currently guards correct threading of code generation:
+    ///
+    /// * With this feature enabled, the backend is threaded as described above. The "emit" stage does
+    ///   not have access to AIR (it will be `undefined`; see `codegen.emitFunction`).
+    ///
+    /// * With this feature disabled, semantic analysis, code generation, and linking all occur on the
+    ///   same thread, and the "emit" stage has access to AIR.
     separate_thread,
 };
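// Illustrative outline, not part of this change: the threading model this feature enables, phrased
// in terms of the functions named above.
//
//   Sema thread:    produce AIR for a function, then queue a `codegen_func` job
//   codegen thread: MIR = codegen.generateFunction(AIR)  -- pure, so many such threads may run
//   linker thread:  codegen.emitFunction(MIR), then write the finalized machine code to the binary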
 
@@ -4566,22 +4597,29 @@ pub fn codegenFail(
     comptime format: []const u8,
     args: anytype,
 ) CodegenFailError {
-    const gpa = zcu.gpa;
-    try zcu.failed_codegen.ensureUnusedCapacity(gpa, 1);
-    const msg = try Zcu.ErrorMsg.create(gpa, zcu.navSrcLoc(nav_index), format, args);
-    zcu.failed_codegen.putAssumeCapacityNoClobber(nav_index, msg);
-    return error.CodegenFail;
+    const msg = try Zcu.ErrorMsg.create(zcu.gpa, zcu.navSrcLoc(nav_index), format, args);
+    return zcu.codegenFailMsg(nav_index, msg);
 }
 
+/// Takes ownership of `msg`, even on OOM.
 pub fn codegenFailMsg(zcu: *Zcu, nav_index: InternPool.Nav.Index, msg: *ErrorMsg) CodegenFailError {
     const gpa = zcu.gpa;
     {
+        zcu.comp.mutex.lock();
+        defer zcu.comp.mutex.unlock();
         errdefer msg.deinit(gpa);
         try zcu.failed_codegen.putNoClobber(gpa, nav_index, msg);
     }
     return error.CodegenFail;
 }
 
+/// Asserts that `zcu.failed_codegen` contains the key `nav`, with the necessary lock held.
+pub fn assertCodegenFailed(zcu: *Zcu, nav: InternPool.Nav.Index) void {
+    zcu.comp.mutex.lock();
+    defer zcu.comp.mutex.unlock();
+    assert(zcu.failed_codegen.contains(nav));
+}
+
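// Illustrative sketch, not part of this change: a hypothetical helper showing the same locking
// rule from the reader's side -- threads other than the owner take `comp.mutex` around any access
// to `failed_codegen`.
fn exampleHasCodegenFailed(zcu: *Zcu, nav: InternPool.Nav.Index) bool {
    zcu.comp.mutex.lock();
    defer zcu.comp.mutex.unlock();
    return zcu.failed_codegen.contains(nav);
}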
 pub fn codegenFailType(
     zcu: *Zcu,
     ty_index: InternPool.Index,