Commit 1059b57898

mlugg <mlugg@mlugg.co.uk>
2023-04-06 06:26:20
Liveness: defer deaths of externally-scoped instructions in loop bodies
1 parent 13aa787
src/arch/aarch64/CodeGen.zig
@@ -5000,9 +5000,17 @@ fn airLoop(self: *Self, inst: Air.Inst.Index) !void {
     const ty_pl = self.air.instructions.items(.data)[inst].ty_pl;
     const loop = self.air.extraData(Air.Block, ty_pl.payload);
     const body = self.air.extra[loop.end..][0..loop.data.body_len];
+    const liveness_loop = self.liveness.getLoop(inst);
     const start_index = @intCast(u32, self.mir_instructions.len);
+
     try self.genBody(body);
     try self.jump(start_index);
+
+    try self.ensureProcessDeathCapacity(liveness_loop.deaths.len);
+    for (liveness_loop.deaths) |operand| {
+        self.processDeath(operand);
+    }
+
     return self.finishAirBookkeeping();
 }
 
src/arch/arm/CodeGen.zig
@@ -4923,9 +4923,17 @@ fn airLoop(self: *Self, inst: Air.Inst.Index) !void {
     const ty_pl = self.air.instructions.items(.data)[inst].ty_pl;
     const loop = self.air.extraData(Air.Block, ty_pl.payload);
     const body = self.air.extra[loop.end..][0..loop.data.body_len];
+    const liveness_loop = self.liveness.getLoop(inst);
     const start_index = @intCast(Mir.Inst.Index, self.mir_instructions.len);
+
     try self.genBody(body);
     try self.jump(start_index);
+
+    try self.ensureProcessDeathCapacity(liveness_loop.deaths.len);
+    for (liveness_loop.deaths) |operand| {
+        self.processDeath(operand);
+    }
+
     return self.finishAirBookkeeping();
 }
 
src/arch/sparc64/CodeGen.zig
@@ -1750,9 +1750,17 @@ fn airLoop(self: *Self, inst: Air.Inst.Index) !void {
     const ty_pl = self.air.instructions.items(.data)[inst].ty_pl;
     const loop = self.air.extraData(Air.Block, ty_pl.payload);
     const body = self.air.extra[loop.end .. loop.end + loop.data.body_len];
+    const liveness_loop = self.liveness.getLoop(inst);
     const start = @intCast(u32, self.mir_instructions.len);
+
     try self.genBody(body);
     try self.jump(start);
+
+    try self.ensureProcessDeathCapacity(liveness_loop.deaths.len);
+    for (liveness_loop.deaths) |operand| {
+        self.processDeath(operand);
+    }
+
     return self.finishAirBookkeeping();
 }
 
src/arch/wasm/CodeGen.zig
@@ -3041,6 +3041,7 @@ fn airLoop(func: *CodeGen, inst: Air.Inst.Index) InnerError!void {
     const ty_pl = func.air.instructions.items(.data)[inst].ty_pl;
     const loop = func.air.extraData(Air.Block, ty_pl.payload);
     const body = func.air.extra[loop.end..][0..loop.data.body_len];
+    const liveness_loop = func.liveness.getLoop(inst);
 
     // result type of loop is always 'noreturn', meaning we can always
     // emit the wasm type 'block_empty'.
@@ -3051,6 +3052,11 @@ fn airLoop(func: *CodeGen, inst: Air.Inst.Index) InnerError!void {
     try func.addLabel(.br, 0);
     try func.endBlock();
 
+    try func.currentBranch().values.ensureUnusedCapacity(func.gpa, @intCast(u32, liveness_loop.deaths.len));
+    for (liveness_loop.deaths) |death| {
+        func.processDeath(Air.indexToRef(death));
+    }
+
     func.finishAir(inst, .none, &.{});
 }
 
src/arch/x86_64/CodeGen.zig
@@ -6185,6 +6185,7 @@ fn airLoop(self: *Self, inst: Air.Inst.Index) !void {
     const loop = self.air.extraData(Air.Block, ty_pl.payload);
     const body = self.air.extra[loop.end..][0..loop.data.body_len];
     const jmp_target = @intCast(u32, self.mir_instructions.len);
+    const liveness_loop = self.liveness.getLoop(inst);
 
     {
         try self.branch_stack.append(.{});
@@ -6208,6 +6209,12 @@ fn airLoop(self: *Self, inst: Air.Inst.Index) !void {
     try self.canonicaliseBranches(true, &dummy_branch, &branch, true, false);
 
     _ = try self.asmJmpReloc(jmp_target);
+
+    try self.ensureProcessDeathCapacity(liveness_loop.deaths.len);
+    for (liveness_loop.deaths) |operand| {
+        self.processDeath(operand);
+    }
+
     return self.finishAirBookkeeping();
 }
 
src/codegen/c.zig
@@ -77,9 +77,8 @@ pub const LazyFnMap = std.AutoArrayHashMapUnmanaged(LazyFnKey, LazyFnValue);
 const LoopDepth = u16;
 const Local = struct {
     cty_idx: CType.Index,
-    /// How many loops the last definition was nested in.
-    loop_depth: LoopDepth,
     alignas: CType.AlignAs,
+    is_in_clone: bool,
 
     pub fn getType(local: Local) LocalType {
         return .{ .cty_idx = local.cty_idx, .alignas = local.alignas };
@@ -90,7 +89,6 @@ const LocalIndex = u16;
 const LocalType = struct { cty_idx: CType.Index, alignas: CType.AlignAs };
 const LocalsList = std.AutoArrayHashMapUnmanaged(LocalIndex, void);
 const LocalsMap = std.AutoArrayHashMapUnmanaged(LocalType, LocalsList);
-const LocalsStack = std.ArrayListUnmanaged(LocalsMap);
 
 const ValueRenderLocation = enum {
     FunctionArgument,
@@ -279,16 +277,16 @@ pub const Function = struct {
     /// Which locals are available for reuse, based on Type.
     /// Only locals in the last stack entry are available for reuse,
     /// other entries will become available on loop exit.
-    free_locals_stack: LocalsStack = .{},
-    free_locals_clone_depth: LoopDepth = 0,
+    free_locals_map: LocalsMap = .{},
+    is_in_clone: bool = false,
     /// Locals which will not be freed by Liveness. This is used after a
-    /// Function body is lowered in order to make `free_locals_stack` have
+    /// Function body is lowered in order to make `free_locals_map` have
     /// 100% of the locals within so that it can be used to render the block
     /// of variable declarations at the top of a function, sorted descending
     /// by type alignment.
     /// The value is whether the alloc is static or not.
     allocs: std.AutoArrayHashMapUnmanaged(LocalIndex, bool) = .{},
-    /// Needed for memory used by the keys of free_locals_stack entries.
+    /// Needed for memory used by the keys of free_locals_map entries.
     arena: std.heap.ArenaAllocator,
 
     fn resolveInst(f: *Function, inst: Air.Inst.Ref) !CValue {
@@ -323,18 +321,14 @@ pub const Function = struct {
         };
     }
 
-    fn getFreeLocals(f: *Function) *LocalsMap {
-        return &f.free_locals_stack.items[f.free_locals_stack.items.len - 1];
-    }
-
     /// Skips the reuse logic.
     fn allocLocalValue(f: *Function, ty: Type, alignment: u32) !CValue {
         const gpa = f.object.dg.gpa;
         const target = f.object.dg.module.getTarget();
         try f.locals.append(gpa, .{
             .cty_idx = try f.typeToIndex(ty, .complete),
-            .loop_depth = @intCast(LoopDepth, f.free_locals_stack.items.len - 1),
             .alignas = CType.AlignAs.init(alignment, ty.abiAlignment(target)),
+            .is_in_clone = f.is_in_clone,
         });
         return .{ .new_local = @intCast(LocalIndex, f.locals.items.len - 1) };
     }
@@ -348,13 +342,11 @@ pub const Function = struct {
     /// Only allocates the local; does not print anything.
     fn allocAlignedLocal(f: *Function, ty: Type, _: CQualifiers, alignment: u32) !CValue {
         const target = f.object.dg.module.getTarget();
-        if (f.getFreeLocals().getPtr(.{
+        if (f.free_locals_map.getPtr(.{
             .cty_idx = try f.typeToIndex(ty, .complete),
             .alignas = CType.AlignAs.init(alignment, ty.abiAlignment(target)),
         })) |locals_list| {
             if (locals_list.popOrNull()) |local_entry| {
-                const local = &f.locals.items[local_entry.key];
-                local.loop_depth = @intCast(LoopDepth, f.free_locals_stack.items.len - 1);
                 return .{ .new_local = local_entry.key };
             }
         }
@@ -485,10 +477,7 @@ pub const Function = struct {
         const gpa = f.object.dg.gpa;
         f.allocs.deinit(gpa);
         f.locals.deinit(gpa);
-        for (f.free_locals_stack.items) |*free_locals| {
-            deinitFreeLocalsMap(gpa, free_locals);
-        }
-        f.free_locals_stack.deinit(gpa);
+        deinitFreeLocalsMap(gpa, &f.free_locals_map);
         f.blocks.deinit(gpa);
         f.value_map.deinit();
         f.lazy_fns.deinit(gpa);
@@ -2592,8 +2581,7 @@ pub fn genFunc(f: *Function) !void {
     o.code_header.appendSliceAssumeCapacity("{\n ");
     const empty_header_len = o.code_header.items.len;
 
-    f.free_locals_stack.clearRetainingCapacity();
-    try f.free_locals_stack.append(gpa, .{});
+    f.free_locals_map.clearRetainingCapacity();
 
     const main_body = f.air.getMainBody();
     try genBody(f, main_body);
@@ -2605,7 +2593,7 @@ pub fn genFunc(f: *Function) !void {
     // Liveness analysis, however, locals from alloc instructions will be
     // missing. These are added now to complete the map. Then we can sort by
     // alignment, descending.
-    const free_locals = f.getFreeLocals();
+    const free_locals = &f.free_locals_map;
     for (f.allocs.keys(), f.allocs.values()) |local_index, value| {
         if (value) continue; // static
         const local = f.locals.items[local_index];
@@ -4630,30 +4618,16 @@ fn airLoop(f: *Function, inst: Air.Inst.Index) !CValue {
     const ty_pl = f.air.instructions.items(.data)[inst].ty_pl;
     const loop = f.air.extraData(Air.Block, ty_pl.payload);
     const body = f.air.extra[loop.end..][0..loop.data.body_len];
+    const liveness_loop = f.liveness.getLoop(inst);
     const writer = f.object.writer();
 
-    const gpa = f.object.dg.gpa;
-    try f.free_locals_stack.insert(gpa, f.free_locals_stack.items.len - 1, .{});
-
     try writer.writeAll("for (;;) ");
     try genBody(f, body);
     try writer.writeByte('\n');
 
-    var old_free_locals = f.free_locals_stack.pop();
-    defer deinitFreeLocalsMap(gpa, &old_free_locals);
-    const new_free_locals = f.getFreeLocals();
-    var it = new_free_locals.iterator();
-    while (it.next()) |entry| {
-        const gop = try old_free_locals.getOrPut(gpa, entry.key_ptr.*);
-        if (gop.found_existing) {
-            try gop.value_ptr.ensureUnusedCapacity(gpa, entry.value_ptr.count());
-            for (entry.value_ptr.keys()) |local_index| {
-                gop.value_ptr.putAssumeCapacityNoClobber(local_index, {});
-            }
-        } else gop.value_ptr.* = entry.value_ptr.move();
+    for (liveness_loop.deaths) |operand| {
+        try die(f, inst, Air.indexToRef(operand));
     }
-    deinitFreeLocalsMap(gpa, new_free_locals);
-    new_free_locals.* = old_free_locals.move();
 
     return .none;
 }
@@ -4673,7 +4647,7 @@ fn airCondBr(f: *Function, inst: Air.Inst.Index) !CValue {
     const gpa = f.object.dg.gpa;
     var cloned_map = try f.value_map.clone();
     defer cloned_map.deinit();
-    var cloned_frees = try cloneFreeLocalsMap(gpa, f.getFreeLocals());
+    var cloned_frees = try cloneFreeLocalsMap(gpa, &f.free_locals_map);
     defer deinitFreeLocalsMap(gpa, &cloned_frees);
 
     // Remember how many locals there were before entering the then branch so
@@ -4684,8 +4658,8 @@ fn airCondBr(f: *Function, inst: Air.Inst.Index) !CValue {
     // that we can notice and make sure not to use them in the else branch.
     // Any new allocs must be removed from the free list.
     const pre_allocs_len = @intCast(LocalIndex, f.allocs.count());
-    const pre_clone_depth = f.free_locals_clone_depth;
-    f.free_locals_clone_depth = @intCast(LoopDepth, f.free_locals_stack.items.len);
+    const was_in_clone = f.is_in_clone;
+    f.is_in_clone = true;
 
     for (liveness_condbr.then_deaths) |operand| {
         try die(f, inst, Air.indexToRef(operand));
@@ -4706,10 +4680,10 @@ fn airCondBr(f: *Function, inst: Air.Inst.Index) !CValue {
 
     f.value_map.deinit();
     f.value_map = cloned_map.move();
-    const free_locals = f.getFreeLocals();
+    const free_locals = &f.free_locals_map;
     deinitFreeLocalsMap(gpa, free_locals);
     free_locals.* = cloned_frees.move();
-    f.free_locals_clone_depth = pre_clone_depth;
+    f.is_in_clone = was_in_clone;
     for (liveness_condbr.else_deaths) |operand| {
         try die(f, inst, Air.indexToRef(operand));
     }
@@ -4781,7 +4755,7 @@ fn airSwitchBr(f: *Function, inst: Air.Inst.Index) !CValue {
         if (case_i != last_case_i) {
             const old_value_map = f.value_map;
             f.value_map = try old_value_map.clone();
-            var free_locals = f.getFreeLocals();
+            var free_locals = &f.free_locals_map;
             const old_free_locals = free_locals.*;
             free_locals.* = try cloneFreeLocalsMap(gpa, free_locals);
 
@@ -4793,14 +4767,13 @@ fn airSwitchBr(f: *Function, inst: Air.Inst.Index) !CValue {
             // we can notice and make sure not to use them in subsequent branches.
             // Any new allocs must be removed from the free list.
             const pre_allocs_len = @intCast(LocalIndex, f.allocs.count());
-            const pre_clone_depth = f.free_locals_clone_depth;
-            f.free_locals_clone_depth = @intCast(LoopDepth, f.free_locals_stack.items.len);
+            const was_in_clone = f.is_in_clone;
+            f.is_in_clone = true;
 
             {
                 defer {
-                    f.free_locals_clone_depth = pre_clone_depth;
+                    f.is_in_clone = was_in_clone;
                     f.value_map.deinit();
-                    free_locals = f.getFreeLocals();
                     deinitFreeLocalsMap(gpa, free_locals);
                     f.value_map = old_value_map;
                     free_locals.* = old_free_locals;
@@ -7870,8 +7843,8 @@ fn freeLocal(f: *Function, inst: Air.Inst.Index, local_index: LocalIndex, ref_in
     const gpa = f.object.dg.gpa;
     const local = &f.locals.items[local_index];
     log.debug("%{d}: freeing t{d} (operand %{d})", .{ inst, local_index, ref_inst });
-    if (local.loop_depth < f.free_locals_clone_depth) return;
-    const gop = try f.free_locals_stack.items[local.loop_depth].getOrPut(gpa, local.getType());
+    if (f.is_in_clone != local.is_in_clone) return;
+    const gop = try f.free_locals_map.getOrPut(gpa, local.getType());
     if (!gop.found_existing) gop.value_ptr.* = .{};
     if (std.debug.runtime_safety) {
         // If this trips, an unfreeable allocation was attempted to be freed.
@@ -7935,23 +7908,21 @@ fn noticeBranchFrees(
     pre_allocs_len: LocalIndex,
     inst: Air.Inst.Index,
 ) !void {
-    const free_locals = f.getFreeLocals();
-
     for (f.locals.items[pre_locals_len..], pre_locals_len..) |*local, local_i| {
         const local_index = @intCast(LocalIndex, local_i);
         if (f.allocs.contains(local_index)) {
             if (std.debug.runtime_safety) {
                 // new allocs are no longer freeable, so make sure they aren't in the free list
-                if (free_locals.getPtr(local.getType())) |locals_list| {
+                if (f.free_locals_map.getPtr(local.getType())) |locals_list| {
                     assert(!locals_list.contains(local_index));
                 }
             }
             continue;
         }
 
-        // free more deeply nested locals from other branches at current depth
-        assert(local.loop_depth >= f.free_locals_stack.items.len - 1);
-        local.loop_depth = @intCast(LoopDepth, f.free_locals_stack.items.len - 1);
+        // free locals that were cloned in other branches, aligning them with the current clone state
+        std.debug.assert(local.is_in_clone or !f.is_in_clone);
+        local.is_in_clone = f.is_in_clone;
         try freeLocal(f, inst, local_index, 0);
     }
 
@@ -7959,6 +7930,6 @@ fn noticeBranchFrees(
         const local_index = @intCast(LocalIndex, local_i);
         const local = &f.locals.items[local_index];
         // new allocs are no longer freeable, so remove them from the free list
-        if (free_locals.getPtr(local.getType())) |locals_list| _ = locals_list.swapRemove(local_index);
+        if (f.free_locals_map.getPtr(local.getType())) |locals_list| _ = locals_list.swapRemove(local_index);
     }
 }
src/Liveness.zig
@@ -25,6 +25,7 @@ tomb_bits: []usize,
 /// array. The meaning of the data depends on the AIR tag.
 ///  * `cond_br` - points to a `CondBr` in `extra` at this index.
 ///  * `switch_br` - points to a `SwitchBr` in `extra` at this index.
+///  * `loop` - points to a `Loop` in `extra` at this index.
 ///  * `asm`, `call`, `aggregate_init` - the value is a set of bits which are the extra tomb
 ///    bits of operands.
 ///    The main tomb bits are still used and the extra ones are starting with the lsb of the
@@ -51,6 +52,11 @@ pub const SwitchBr = struct {
     else_death_count: u32,
 };
 
+/// Trailing is the set of instructions whose lifetimes end at the end of the loop body.
+pub const Loop = struct {
+    death_count: u32,
+};
+
 pub fn analyze(gpa: Allocator, air: Air) Allocator.Error!Liveness {
     const tracy = trace(@src());
     defer tracy.end();
@@ -76,6 +82,11 @@ pub fn analyze(gpa: Allocator, air: Air) Allocator.Error!Liveness {
     const main_body = air.getMainBody();
     try a.table.ensureTotalCapacity(gpa, @intCast(u32, main_body.len));
     try analyzeWithContext(&a, null, main_body);
+    {
+        var to_remove: std.AutoHashMapUnmanaged(Air.Inst.Index, void) = .{};
+        defer to_remove.deinit(gpa);
+        try removeDeaths(&a, &to_remove, main_body);
+    }
     return Liveness{
         .tomb_bits = a.tomb_bits,
         .special = a.special,
@@ -650,6 +661,18 @@ pub fn getSwitchBr(l: Liveness, gpa: Allocator, inst: Air.Inst.Index, cases_len:
     };
 }
 
+pub const LoopSlice = struct {
+    deaths: []const Air.Inst.Index,
+};
+
+pub fn getLoop(l: Liveness, inst: Air.Inst.Index) LoopSlice {
+    const index: usize = l.special.get(inst) orelse return .{
+        .deaths = &.{},
+    };
+    const death_count = l.extra[index];
+    return .{ .deaths = l.extra[index + 1 ..][0..death_count] };
+}
+
 pub fn deinit(l: *Liveness, gpa: Allocator) void {
     gpa.free(l.tomb_bits);
     gpa.free(l.extra);
@@ -1138,7 +1161,39 @@ fn analyzeInst(
         .loop => {
             const extra = a.air.extraData(Air.Block, inst_datas[inst].ty_pl.payload);
             const body = a.air.extra[extra.end..][0..extra.data.body_len];
-            try analyzeWithContext(a, new_set, body);
+
+            var body_table: std.AutoHashMapUnmanaged(Air.Inst.Index, void) = .{};
+            defer body_table.deinit(gpa);
+
+            // Instructions outside the loop body cannot die within the loop, since further loop
+            // iterations may occur. Track deaths from the loop body - we'll remove all of these
+            // retroactively, and add them to our extra data.
+
+            try analyzeWithContext(a, &body_table, body);
+
+            if (new_set) |ns| {
+                try ns.ensureUnusedCapacity(gpa, body_table.count());
+                var it = body_table.keyIterator();
+                while (it.next()) |key| {
+                    _ = ns.putAssumeCapacity(key.*, {});
+                }
+            }
+
+            try a.extra.ensureUnusedCapacity(gpa, std.meta.fields(Loop).len + body_table.count());
+            const extra_index = a.addExtraAssumeCapacity(Loop{
+                .death_count = body_table.count(),
+            });
+            {
+                var it = body_table.keyIterator();
+                while (it.next()) |key| {
+                    a.extra.appendAssumeCapacity(key.*);
+                }
+            }
+            try a.special.put(gpa, inst, extra_index);
+
+            // We'll remove invalid deaths in a separate pass after main liveness analysis. See
+            // removeDeaths for more details.
+
             return; // Loop has no operands and it is always unreferenced.
         },
         .@"try" => {
@@ -1412,3 +1467,566 @@ const ExtraTombs = struct {
         et.big_tomb_bits_extra.deinit(et.analysis.gpa);
     }
 };
+
+/// Remove any deaths invalidated by the deaths from an enclosing `loop`. Shrinking the death lists
+/// stored in `extra` leaves it non-dense (stale slots remain), which is acceptable: only a small
+/// amount of memory is wasted. Re-densifying would require recomputing every index in `special`.
+fn removeDeaths(
+    a: *Analysis,
+    to_remove: *std.AutoHashMapUnmanaged(Air.Inst.Index, void),
+    body: []const Air.Inst.Index,
+) error{OutOfMemory}!void {
+    for (body) |inst| {
+        try removeInstDeaths(a, to_remove, inst);
+    }
+}
+
+fn removeInstDeaths(
+    a: *Analysis,
+    to_remove: *std.AutoHashMapUnmanaged(Air.Inst.Index, void),
+    inst: Air.Inst.Index,
+) !void {
+    const inst_tags = a.air.instructions.items(.tag);
+    const inst_datas = a.air.instructions.items(.data);
+
+    switch (inst_tags[inst]) {
+        .add,
+        .add_optimized,
+        .addwrap,
+        .addwrap_optimized,
+        .add_sat,
+        .sub,
+        .sub_optimized,
+        .subwrap,
+        .subwrap_optimized,
+        .sub_sat,
+        .mul,
+        .mul_optimized,
+        .mulwrap,
+        .mulwrap_optimized,
+        .mul_sat,
+        .div_float,
+        .div_float_optimized,
+        .div_trunc,
+        .div_trunc_optimized,
+        .div_floor,
+        .div_floor_optimized,
+        .div_exact,
+        .div_exact_optimized,
+        .rem,
+        .rem_optimized,
+        .mod,
+        .mod_optimized,
+        .bit_and,
+        .bit_or,
+        .xor,
+        .cmp_lt,
+        .cmp_lt_optimized,
+        .cmp_lte,
+        .cmp_lte_optimized,
+        .cmp_eq,
+        .cmp_eq_optimized,
+        .cmp_gte,
+        .cmp_gte_optimized,
+        .cmp_gt,
+        .cmp_gt_optimized,
+        .cmp_neq,
+        .cmp_neq_optimized,
+        .bool_and,
+        .bool_or,
+        .store,
+        .array_elem_val,
+        .slice_elem_val,
+        .ptr_elem_val,
+        .shl,
+        .shl_exact,
+        .shl_sat,
+        .shr,
+        .shr_exact,
+        .atomic_store_unordered,
+        .atomic_store_monotonic,
+        .atomic_store_release,
+        .atomic_store_seq_cst,
+        .set_union_tag,
+        .min,
+        .max,
+        => {
+            const o = inst_datas[inst].bin_op;
+            removeOperandDeaths(a, to_remove, inst, .{ o.lhs, o.rhs, .none });
+        },
+
+        .vector_store_elem => {
+            const o = inst_datas[inst].vector_store_elem;
+            const extra = a.air.extraData(Air.Bin, o.payload).data;
+            removeOperandDeaths(a, to_remove, inst, .{ o.vector_ptr, extra.lhs, extra.rhs });
+        },
+
+        .arg,
+        .alloc,
+        .ret_ptr,
+        .constant,
+        .const_ty,
+        .trap,
+        .breakpoint,
+        .dbg_stmt,
+        .dbg_inline_begin,
+        .dbg_inline_end,
+        .dbg_block_begin,
+        .dbg_block_end,
+        .unreach,
+        .fence,
+        .ret_addr,
+        .frame_addr,
+        .wasm_memory_size,
+        .err_return_trace,
+        .save_err_return_trace_index,
+        .c_va_start,
+        .work_item_id,
+        .work_group_size,
+        .work_group_id,
+        => {},
+
+        .not,
+        .bitcast,
+        .load,
+        .fpext,
+        .fptrunc,
+        .intcast,
+        .trunc,
+        .optional_payload,
+        .optional_payload_ptr,
+        .optional_payload_ptr_set,
+        .errunion_payload_ptr_set,
+        .wrap_optional,
+        .unwrap_errunion_payload,
+        .unwrap_errunion_err,
+        .unwrap_errunion_payload_ptr,
+        .unwrap_errunion_err_ptr,
+        .wrap_errunion_payload,
+        .wrap_errunion_err,
+        .slice_ptr,
+        .slice_len,
+        .ptr_slice_len_ptr,
+        .ptr_slice_ptr_ptr,
+        .struct_field_ptr_index_0,
+        .struct_field_ptr_index_1,
+        .struct_field_ptr_index_2,
+        .struct_field_ptr_index_3,
+        .array_to_slice,
+        .float_to_int,
+        .float_to_int_optimized,
+        .int_to_float,
+        .get_union_tag,
+        .clz,
+        .ctz,
+        .popcount,
+        .byte_swap,
+        .bit_reverse,
+        .splat,
+        .error_set_has_value,
+        .addrspace_cast,
+        .c_va_arg,
+        .c_va_copy,
+        => {
+            const o = inst_datas[inst].ty_op;
+            removeOperandDeaths(a, to_remove, inst, .{ o.operand, .none, .none });
+        },
+
+        .is_null,
+        .is_non_null,
+        .is_null_ptr,
+        .is_non_null_ptr,
+        .is_err,
+        .is_non_err,
+        .is_err_ptr,
+        .is_non_err_ptr,
+        .ptrtoint,
+        .bool_to_int,
+        .ret,
+        .ret_load,
+        .is_named_enum_value,
+        .tag_name,
+        .error_name,
+        .sqrt,
+        .sin,
+        .cos,
+        .tan,
+        .exp,
+        .exp2,
+        .log,
+        .log2,
+        .log10,
+        .fabs,
+        .floor,
+        .ceil,
+        .round,
+        .trunc_float,
+        .neg,
+        .neg_optimized,
+        .cmp_lt_errors_len,
+        .set_err_return_trace,
+        .c_va_end,
+        => {
+            const operand = inst_datas[inst].un_op;
+            removeOperandDeaths(a, to_remove, inst, .{ operand, .none, .none });
+        },
+
+        .add_with_overflow,
+        .sub_with_overflow,
+        .mul_with_overflow,
+        .shl_with_overflow,
+        .ptr_add,
+        .ptr_sub,
+        .ptr_elem_ptr,
+        .slice_elem_ptr,
+        .slice,
+        => {
+            const ty_pl = inst_datas[inst].ty_pl;
+            const extra = a.air.extraData(Air.Bin, ty_pl.payload).data;
+            removeOperandDeaths(a, to_remove, inst, .{ extra.lhs, extra.rhs, .none });
+        },
+
+        .dbg_var_ptr,
+        .dbg_var_val,
+        => {
+            const operand = inst_datas[inst].pl_op.operand;
+            removeOperandDeaths(a, to_remove, inst, .{ operand, .none, .none });
+        },
+
+        .prefetch => {
+            const prefetch = inst_datas[inst].prefetch;
+            removeOperandDeaths(a, to_remove, inst, .{ prefetch.ptr, .none, .none });
+        },
+
+        .call, .call_always_tail, .call_never_tail, .call_never_inline => {
+            const inst_data = inst_datas[inst].pl_op;
+            const callee = inst_data.operand;
+            const extra = a.air.extraData(Air.Call, inst_data.payload);
+            const args = @ptrCast([]const Air.Inst.Ref, a.air.extra[extra.end..][0..extra.data.args_len]);
+
+            var death_remover = BigTombDeathRemover.init(a, to_remove, inst);
+            death_remover.feed(callee);
+            for (args) |operand| {
+                death_remover.feed(operand);
+            }
+            death_remover.finish();
+        },
+        .select => {
+            const pl_op = inst_datas[inst].pl_op;
+            const extra = a.air.extraData(Air.Bin, pl_op.payload).data;
+            removeOperandDeaths(a, to_remove, inst, .{ pl_op.operand, extra.lhs, extra.rhs });
+        },
+        .shuffle => {
+            const extra = a.air.extraData(Air.Shuffle, inst_datas[inst].ty_pl.payload).data;
+            removeOperandDeaths(a, to_remove, inst, .{ extra.a, extra.b, .none });
+        },
+        .reduce, .reduce_optimized => {
+            const reduce = inst_datas[inst].reduce;
+            removeOperandDeaths(a, to_remove, inst, .{ reduce.operand, .none, .none });
+        },
+        .cmp_vector, .cmp_vector_optimized => {
+            const extra = a.air.extraData(Air.VectorCmp, inst_datas[inst].ty_pl.payload).data;
+            removeOperandDeaths(a, to_remove, inst, .{ extra.lhs, extra.rhs, .none });
+        },
+        .aggregate_init => {
+            const ty_pl = inst_datas[inst].ty_pl;
+            const aggregate_ty = a.air.getRefType(ty_pl.ty);
+            const len = @intCast(usize, aggregate_ty.arrayLen());
+            const elements = @ptrCast([]const Air.Inst.Ref, a.air.extra[ty_pl.payload..][0..len]);
+
+            var death_remover = BigTombDeathRemover.init(a, to_remove, inst);
+            for (elements) |elem| {
+                death_remover.feed(elem);
+            }
+            death_remover.finish();
+        },
+        .union_init => {
+            const extra = a.air.extraData(Air.UnionInit, inst_datas[inst].ty_pl.payload).data;
+            removeOperandDeaths(a, to_remove, inst, .{ extra.init, .none, .none });
+        },
+        .struct_field_ptr, .struct_field_val => {
+            const extra = a.air.extraData(Air.StructField, inst_datas[inst].ty_pl.payload).data;
+            removeOperandDeaths(a, to_remove, inst, .{ extra.struct_operand, .none, .none });
+        },
+        .field_parent_ptr => {
+            const extra = a.air.extraData(Air.FieldParentPtr, inst_datas[inst].ty_pl.payload).data;
+            removeOperandDeaths(a, to_remove, inst, .{ extra.field_ptr, .none, .none });
+        },
+        .cmpxchg_strong, .cmpxchg_weak => {
+            const extra = a.air.extraData(Air.Cmpxchg, inst_datas[inst].ty_pl.payload).data;
+            removeOperandDeaths(a, to_remove, inst, .{ extra.ptr, extra.expected_value, extra.new_value });
+        },
+        .mul_add => {
+            const pl_op = inst_datas[inst].pl_op;
+            const extra = a.air.extraData(Air.Bin, pl_op.payload).data;
+            removeOperandDeaths(a, to_remove, inst, .{ extra.lhs, extra.rhs, pl_op.operand });
+        },
+        .atomic_load => {
+            const ptr = inst_datas[inst].atomic_load.ptr;
+            removeOperandDeaths(a, to_remove, inst, .{ ptr, .none, .none });
+        },
+        .atomic_rmw => {
+            const pl_op = inst_datas[inst].pl_op;
+            const extra = a.air.extraData(Air.AtomicRmw, pl_op.payload).data;
+            removeOperandDeaths(a, to_remove, inst, .{ pl_op.operand, extra.operand, .none });
+        },
+        .memset,
+        .memcpy,
+        => {
+            const pl_op = inst_datas[inst].pl_op;
+            const extra = a.air.extraData(Air.Bin, pl_op.payload).data;
+            removeOperandDeaths(a, to_remove, inst, .{ pl_op.operand, extra.lhs, extra.rhs });
+        },
+
+        .br => {
+            const br = inst_datas[inst].br;
+            removeOperandDeaths(a, to_remove, inst, .{ br.operand, .none, .none });
+        },
+        .assembly => {
+            const extra = a.air.extraData(Air.Asm, inst_datas[inst].ty_pl.payload);
+            var extra_i: usize = extra.end;
+            const outputs = @ptrCast([]const Air.Inst.Ref, a.air.extra[extra_i..][0..extra.data.outputs_len]);
+            extra_i += outputs.len;
+            const inputs = @ptrCast([]const Air.Inst.Ref, a.air.extra[extra_i..][0..extra.data.inputs_len]);
+            extra_i += inputs.len;
+
+            var death_remover = BigTombDeathRemover.init(a, to_remove, inst);
+            for (outputs) |output| {
+                if (output != .none) {
+                    death_remover.feed(output);
+                }
+            }
+            for (inputs) |input| {
+                death_remover.feed(input);
+            }
+            death_remover.finish();
+        },
+        .block => {
+            const extra = a.air.extraData(Air.Block, inst_datas[inst].ty_pl.payload);
+            const body = a.air.extra[extra.end..][0..extra.data.body_len];
+            try removeDeaths(a, to_remove, body);
+        },
+        .loop => {
+            const extra = a.air.extraData(Air.Block, inst_datas[inst].ty_pl.payload);
+            const body = a.air.extra[extra.end..][0..extra.data.body_len];
+
+            const liveness_extra_idx = a.special.get(inst) orelse {
+                try removeDeaths(a, to_remove, body);
+                return;
+            };
+
+            const death_count = a.extra.items[liveness_extra_idx];
+            var deaths = a.extra.items[liveness_extra_idx + 1 ..][0..death_count];
+
+            // Remove any deaths in `to_remove` from this loop's deaths
+            deaths.len = removeExtraDeaths(to_remove, deaths);
+            a.extra.items[liveness_extra_idx] = @intCast(u32, deaths.len);
+
+            // Temporarily add any deaths of ours to `to_remove`
+            try to_remove.ensureUnusedCapacity(a.gpa, @intCast(u32, deaths.len));
+            for (deaths) |d| {
+                to_remove.putAssumeCapacity(d, {});
+            }
+            try removeDeaths(a, to_remove, body);
+            for (deaths) |d| {
+                _ = to_remove.remove(d);
+            }
+        },
+        .@"try" => {
+            const pl_op = inst_datas[inst].pl_op;
+            const extra = a.air.extraData(Air.Try, pl_op.payload);
+            const body = a.air.extra[extra.end..][0..extra.data.body_len];
+            try removeDeaths(a, to_remove, body);
+            removeOperandDeaths(a, to_remove, inst, .{ pl_op.operand, .none, .none });
+        },
+        .try_ptr => {
+            const extra = a.air.extraData(Air.TryPtr, inst_datas[inst].ty_pl.payload);
+            const body = a.air.extra[extra.end..][0..extra.data.body_len];
+            try removeDeaths(a, to_remove, body);
+            removeOperandDeaths(a, to_remove, inst, .{ extra.data.ptr, .none, .none });
+        },
+        .cond_br => {
+            const inst_data = inst_datas[inst].pl_op;
+            const condition = inst_data.operand;
+            const extra = a.air.extraData(Air.CondBr, inst_data.payload);
+            const then_body = a.air.extra[extra.end..][0..extra.data.then_body_len];
+            const else_body = a.air.extra[extra.end + then_body.len ..][0..extra.data.else_body_len];
+
+            if (a.special.get(inst)) |liveness_extra_idx| {
+                const then_death_count = a.extra.items[liveness_extra_idx + 0];
+                const else_death_count = a.extra.items[liveness_extra_idx + 1];
+                var then_deaths = a.extra.items[liveness_extra_idx + 2 ..][0..then_death_count];
+                var else_deaths = a.extra.items[liveness_extra_idx + 2 + then_death_count ..][0..else_death_count];
+
+                const new_then_death_count = removeExtraDeaths(to_remove, then_deaths);
+                const new_else_death_count = removeExtraDeaths(to_remove, else_deaths);
+
+                a.extra.items[liveness_extra_idx + 0] = new_then_death_count;
+                a.extra.items[liveness_extra_idx + 1] = new_else_death_count;
+
+                if (new_then_death_count < then_death_count) {
+                    // `else` deaths need to be moved earlier in `extra`
+                    const src = a.extra.items[liveness_extra_idx + 2 + then_death_count ..];
+                    const dest = a.extra.items[liveness_extra_idx + 2 + new_then_death_count ..];
+                    std.mem.copy(u32, dest, src[0..new_else_death_count]);
+                }
+            }
+
+            try removeDeaths(a, to_remove, then_body);
+            try removeDeaths(a, to_remove, else_body);
+
+            removeOperandDeaths(a, to_remove, inst, .{ condition, .none, .none });
+        },
+        .switch_br => {
+            const pl_op = inst_datas[inst].pl_op;
+            const condition = pl_op.operand;
+            const switch_br = a.air.extraData(Air.SwitchBr, pl_op.payload);
+
+            var air_extra_index: usize = switch_br.end;
+            for (0..switch_br.data.cases_len) |_| {
+                const case = a.air.extraData(Air.SwitchBr.Case, air_extra_index);
+                const case_body = a.air.extra[case.end + case.data.items_len ..][0..case.data.body_len];
+                air_extra_index = case.end + case.data.items_len + case_body.len;
+                try removeDeaths(a, to_remove, case_body);
+            }
+            { // else
+                const else_body = a.air.extra[air_extra_index..][0..switch_br.data.else_body_len];
+                try removeDeaths(a, to_remove, else_body);
+            }
+
+            if (a.special.get(inst)) |liveness_extra_idx| {
+                const else_death_count = a.extra.items[liveness_extra_idx];
+                var read_idx = liveness_extra_idx + 1;
+                var write_idx = read_idx; // write_idx <= read_idx always
+                for (0..switch_br.data.cases_len) |_| {
+                    const case_death_count = a.extra.items[read_idx];
+                    const case_deaths = a.extra.items[read_idx + 1 ..][0..case_death_count];
+                    const new_death_count = removeExtraDeaths(to_remove, case_deaths);
+                    a.extra.items[write_idx] = new_death_count;
+                    if (write_idx < read_idx) {
+                        std.mem.copy(u32, a.extra.items[write_idx + 1 ..], a.extra.items[read_idx + 1 ..][0..new_death_count]);
+                    }
+                    read_idx += 1 + case_death_count;
+                    write_idx += 1 + new_death_count;
+                }
+                const else_deaths = a.extra.items[read_idx..][0..else_death_count];
+                const new_else_death_count = removeExtraDeaths(to_remove, else_deaths);
+                a.extra.items[liveness_extra_idx] = new_else_death_count;
+                if (write_idx < read_idx) {
+                    std.mem.copy(u32, a.extra.items[write_idx..], a.extra.items[read_idx..][0..new_else_death_count]);
+                }
+            }
+
+            removeOperandDeaths(a, to_remove, inst, .{ condition, .none, .none });
+        },
+        .wasm_memory_grow => {
+            const pl_op = inst_datas[inst].pl_op;
+            removeOperandDeaths(a, to_remove, inst, .{ pl_op.operand, .none, .none });
+        },
+    }
+}
+
+fn removeOperandDeaths( // clear the small-tomb death bits of `inst`'s operands that appear in `to_remove`
+    a: *Analysis,
+    to_remove: *const std.AutoHashMapUnmanaged(Air.Inst.Index, void),
+    inst: Air.Inst.Index,
+    operands: [bpi - 1]Air.Inst.Ref,
+) void {
+    const usize_index = (inst * bpi) / @bitSizeOf(usize); // word of `tomb_bits` holding this inst's bits
+
+    const cur_tomb = @truncate(Bpi, a.tomb_bits[usize_index] >>
+        @intCast(Log2Int(usize), (inst % (@bitSizeOf(usize) / bpi)) * bpi)); // this inst's `bpi` tomb bits
+
+    var toggle_bits: Bpi = 0; // XOR mask of death bits to clear, built up below
+
+    for (operands, 0..) |op_ref, i| {
+        const mask = @as(Bpi, 1) << @intCast(OperandInt, i);
+        const op_int = @enumToInt(op_ref);
+        if (op_int < Air.Inst.Ref.typed_value_map.len) continue; // constants/.none are not instructions and have no deaths
+        const operand: Air.Inst.Index = op_int - @intCast(u32, Air.Inst.Ref.typed_value_map.len);
+        if ((cur_tomb & mask) != 0 and to_remove.contains(operand)) { // only toggle bits that are currently set
+            log.debug("remove death of %{} in %{}", .{ operand, inst });
+            toggle_bits ^= mask;
+        }
+    }
+
+    a.tomb_bits[usize_index] ^= @as(usize, toggle_bits) <<
+        @intCast(Log2Int(usize), (inst % (@bitSizeOf(usize) / bpi)) * bpi); // write the cleared bits back in place
+}
+
+fn removeExtraDeaths( // drop entries of `deaths` found in `to_remove`, in place; returns the new length
+    to_remove: *const std.AutoHashMapUnmanaged(Air.Inst.Index, void),
+    deaths: []Air.Inst.Index,
+) u32 {
+    var new_len = @intCast(u32, deaths.len);
+    var i: usize = 0;
+    while (i < new_len) {
+        if (to_remove.contains(deaths[i])) {
+            log.debug("remove extra death of %{}", .{deaths[i]});
+            deaths[i] = deaths[new_len - 1]; // swap-remove: callers do not rely on death ordering here
+            new_len -= 1; // re-check index i, which now holds the element swapped in from the tail
+        } else {
+            i += 1;
+        }
+    }
+    return new_len;
+}
+
+const BigTombDeathRemover = struct { // strips deaths in `to_remove` from an inst encoded with "big tomb" (extra-array) liveness bits
+    a: *Analysis,
+    to_remove: *const std.AutoHashMapUnmanaged(Air.Inst.Index, void),
+    inst: Air.Inst.Index,
+
+    operands: [bpi - 1]Air.Inst.Ref = .{.none} ** (bpi - 1), // buffer for the first bpi-1 (small-tomb) operands
+    next_oper: OperandInt = 0, // how many small-tomb operands have been fed so far
+
+    bit_index: u32 = 0, // index of the current big-tomb operand, counted from the first extra-encoded one
+    // Initialized once we finish the small tomb operands: see `feed`
+    extra_start: u32 = undefined,
+    extra_offset: u32 = 0, // which u32 of extra (relative to extra_start) the current bit lives in
+
+    fn init(a: *Analysis, to_remove: *const std.AutoHashMapUnmanaged(Air.Inst.Index, void), inst: Air.Inst.Index) BigTombDeathRemover {
+        return .{
+            .a = a,
+            .to_remove = to_remove,
+            .inst = inst,
+        };
+    }
+
+    fn feed(dr: *BigTombDeathRemover, operand: Air.Inst.Ref) void {
+        if (dr.next_oper < bpi - 1) { // the first bpi-1 operands use the inline small-tomb bits
+            dr.operands[dr.next_oper] = operand;
+            dr.next_oper += 1;
+            if (dr.next_oper == bpi - 1) { // small tomb full: flush it and locate this inst's extra data
+                removeOperandDeaths(dr.a, dr.to_remove, dr.inst, dr.operands);
+                if (dr.a.special.get(dr.inst)) |idx| dr.extra_start = idx; // start of the big-tomb bits in `extra`
+            }
+            return;
+        }
+
+        defer dr.bit_index += 1; // advance even when the operand is skipped as a constant below
+
+        const op_int = @enumToInt(operand);
+        if (op_int < Air.Inst.Ref.typed_value_map.len) return; // constants/.none cannot die
+
+        const op_inst: Air.Inst.Index = op_int - @intCast(u32, Air.Inst.Ref.typed_value_map.len);
+
+        while (dr.bit_index - dr.extra_offset * 31 >= 31) { // each extra u32 packs 31 death bits
+            dr.extra_offset += 1;
+        }
+        const dies = @truncate(u1, dr.a.extra.items[dr.extra_start + dr.extra_offset] >>
+            @intCast(u5, dr.bit_index - dr.extra_offset * 31)) != 0;
+
+        if (dies and dr.to_remove.contains(op_inst)) {
+            log.debug("remove big death of %{}", .{op_inst});
+            dr.a.extra.items[dr.extra_start + dr.extra_offset] ^=
+                (@as(u32, 1) << @intCast(u5, dr.bit_index - dr.extra_offset * 31)); // XOR clears the set death bit
+        }
+    }
+
+    fn finish(dr: *BigTombDeathRemover) void { // flush a small tomb with fewer than bpi-1 operands fed
+        if (dr.next_oper < bpi) { // may re-flush after `feed` already did; harmless since only set bits are cleared
+            removeOperandDeaths(dr.a, dr.to_remove, dr.inst, dr.operands);
+        }
+    }
+};
src/print_air.zig
@@ -267,9 +267,9 @@ const Writer = struct {
             .c_va_copy,
             => try w.writeTyOp(s, inst),
 
-            .block,
-            .loop,
-            => try w.writeBlock(s, inst),
+            .block => try w.writeBlock(s, inst),
+
+            .loop => try w.writeLoop(s, inst),
 
             .slice,
             .slice_elem_ptr,
@@ -401,6 +401,31 @@ const Writer = struct {
         try s.writeAll("}");
     }
 
+    fn writeLoop(w: *Writer, s: anytype, inst: Air.Inst.Index) @TypeOf(s).Error!void { // like writeBlock, but also prints the loop's deferred liveness deaths
+        const ty_pl = w.air.instructions.items(.data)[inst].ty_pl;
+        const extra = w.air.extraData(Air.Block, ty_pl.payload);
+        const body = w.air.extra[extra.end..][0..extra.data.body_len];
+        const liveness_loop = w.liveness.getLoop(inst);
+
+        try w.writeType(s, w.air.getRefType(ty_pl.ty));
+        if (w.skip_body) return s.writeAll(", ...");
+        try s.writeAll(", {\n");
+        const old_indent = w.indent;
+        w.indent += 2;
+        try w.writeBody(s, body);
+        if (liveness_loop.deaths.len != 0) { // print deaths after the body, e.g. "%1! %2!", on one indented line
+            try s.writeByteNTimes(' ', w.indent);
+            for (liveness_loop.deaths, 0..) |operand, i| {
+                if (i != 0) try s.writeAll(" ");
+                try s.print("%{d}!", .{operand});
+            }
+            try s.writeAll("\n");
+        }
+        w.indent = old_indent;
+        try s.writeByteNTimes(' ', w.indent);
+        try s.writeAll("}");
+    }
+
     fn writeAggregateInit(w: *Writer, s: anytype, inst: Air.Inst.Index) @TypeOf(s).Error!void {
         const ty_pl = w.air.instructions.items(.data)[inst].ty_pl;
         const vector_ty = w.air.getRefType(ty_pl.ty);
test/behavior/const_slice_child.zig
@@ -7,7 +7,6 @@ const expect = testing.expect;
 var argv: [*]const [*]const u8 = undefined;
 
 test "const slice child" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
test/behavior/for.zig
@@ -227,7 +227,6 @@ test "else continue outer for" {
 
 test "for loop with else branch" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
 
     {
         var x = [_]u32{ 1, 2 };
@@ -377,7 +376,6 @@ test "raw pointer and slice" {
 test "raw pointer and counter" {
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
 
     var buf: [10]u8 = undefined;
test/behavior/slice.zig
@@ -238,7 +238,6 @@ test "C pointer" {
 test "C pointer slice access" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
 
     var buf: [10]u32 = [1]u32{42} ** 10;