Commit 156cd8f678

mlugg <mlugg@mlugg.co.uk>
2025-09-26 11:52:09
std.debug: significantly speed up capturing stack traces
By my estimation, these changes speed up DWARF unwinding when using the self-hosted x86_64 backend by around 7x. There are two very significant enhancements: we no longer iterate frames which don't fit in the stack trace buffer, and we cache register rules (in a fixed buffer) to avoid re-parsing and evaluating CFI instructions in most cases. Alongside this are a bunch of smaller enhancements, such as pre-caching the result of evaluating the CIE's initial instructions, avoiding re-parsing of CIEs, and big simplifications to the `Dwarf.Unwind.VirtualMachine` logic.
1 parent 3f84b6c
lib/std/debug/Dwarf/Unwind/VirtualMachine.zig
@@ -5,9 +5,9 @@ pub const RegisterRule = union(enum) {
     /// The spec says that the default rule for each column is the undefined rule.
     /// However, it also allows ABI / compiler authors to specify alternate defaults, so
     /// there is a distinction made here.
-    default: void,
-    undefined: void,
-    same_value: void,
+    default,
+    undefined,
+    same_value,
     /// offset(N)
     offset: i64,
     /// val_offset(N)
@@ -18,38 +18,39 @@ pub const RegisterRule = union(enum) {
     expression: []const u8,
     /// val_expression(E)
     val_expression: []const u8,
-    /// Augmenter-defined rule
-    architectural: void,
+};
+
+pub const CfaRule = union(enum) {
+    none,
+    reg_off: struct {
+        register: u8,
+        offset: i64,
+    },
+    expression: []const u8,
 };
 
 /// Each row contains unwinding rules for a set of registers.
 pub const Row = struct {
     /// Offset from `FrameDescriptionEntry.pc_begin`
     offset: u64 = 0,
-    /// Special-case column that defines the CFA (Canonical Frame Address) rule.
-    /// The register field of this column defines the register that CFA is derived from.
-    cfa: Column = .{},
+    cfa: CfaRule = .none,
     /// The register fields in these columns define the register the rule applies to.
-    columns: ColumnRange = .{},
-    /// Indicates that the next write to any column in this row needs to copy
-    /// the backing column storage first, as it may be referenced by previous rows.
-    copy_on_write: bool = false,
+    columns: ColumnRange = .{ .start = undefined, .len = 0 },
 };
 
 pub const Column = struct {
-    register: ?u8 = null,
-    rule: RegisterRule = .{ .default = {} },
+    register: u8,
+    rule: RegisterRule,
 };
 
 const ColumnRange = struct {
-    /// Index into `columns` of the first column in this row.
-    start: usize = undefined,
-    len: u8 = 0,
+    start: usize,
+    len: u8,
 };
 
 columns: std.ArrayList(Column) = .empty,
 stack: std.ArrayList(struct {
-    cfa: Column,
+    cfa: CfaRule,
     columns: ColumnRange,
 }) = .empty,
 current_row: Row = .{},
@@ -71,235 +72,388 @@ pub fn reset(self: *VirtualMachine) void {
 }
 
 /// Return a slice backed by the row's non-CFA columns
-pub fn rowColumns(self: VirtualMachine, row: Row) []Column {
+pub fn rowColumns(self: *const VirtualMachine, row: *const Row) []Column {
     if (row.columns.len == 0) return &.{};
     return self.columns.items[row.columns.start..][0..row.columns.len];
 }
 
 /// Either retrieves or adds a column for `register` (non-CFA) in the current row.
 fn getOrAddColumn(self: *VirtualMachine, gpa: Allocator, register: u8) !*Column {
-    for (self.rowColumns(self.current_row)) |*c| {
+    for (self.rowColumns(&self.current_row)) |*c| {
         if (c.register == register) return c;
     }
 
     if (self.current_row.columns.len == 0) {
         self.current_row.columns.start = self.columns.items.len;
+    } else {
+        assert(self.current_row.columns.start + self.current_row.columns.len == self.columns.items.len);
     }
     self.current_row.columns.len += 1;
 
     const column = try self.columns.addOne(gpa);
     column.* = .{
         .register = register,
+        .rule = .default,
     };
 
     return column;
 }
 
+pub fn populateCieLastRow(
+    gpa: Allocator,
+    cie: *Unwind.CommonInformationEntry,
+    addr_size_bytes: u8,
+    endian: std.builtin.Endian,
+) !void {
+    assert(cie.last_row == null);
+
+    var vm: VirtualMachine = .{};
+    defer vm.deinit(gpa);
+
+    try vm.evalInstructions(
+        gpa,
+        cie,
+        std.math.maxInt(u64),
+        cie.initial_instructions,
+        addr_size_bytes,
+        endian,
+    );
+
+    cie.last_row = .{
+        .offset = vm.current_row.offset,
+        .cfa = vm.current_row.cfa,
+        .cols = try gpa.dupe(Column, vm.rowColumns(&vm.current_row)),
+    };
+}
+
 /// Runs the CIE instructions, then the FDE instructions. Execution halts
 /// once the row that corresponds to `pc` is known, and the row is returned.
 pub fn runTo(
-    self: *VirtualMachine,
+    vm: *VirtualMachine,
     gpa: Allocator,
     pc: u64,
-    cie: Dwarf.Unwind.CommonInformationEntry,
-    fde: Dwarf.Unwind.FrameDescriptionEntry,
+    cie: *const Unwind.CommonInformationEntry,
+    fde: *const Unwind.FrameDescriptionEntry,
     addr_size_bytes: u8,
     endian: std.builtin.Endian,
 ) !Row {
-    assert(self.cie_row == null);
-    assert(pc >= fde.pc_begin);
-    assert(pc < fde.pc_begin + fde.pc_range);
+    assert(vm.cie_row == null);
 
-    var prev_row: Row = self.current_row;
+    const target_offset = pc - fde.pc_begin;
+    assert(target_offset < fde.pc_range);
 
-    const instruction_slices: [2][]const u8 = .{
-        cie.initial_instructions,
-        fde.instructions,
-    };
-    for (instruction_slices, [2]bool{ true, false }) |slice, is_cie_stream| {
-        var stream: std.Io.Reader = .fixed(slice);
-        while (stream.seek < slice.len) {
-            const instruction: Dwarf.call_frame.Instruction = try .read(&stream, addr_size_bytes, endian);
-            prev_row = try self.step(gpa, cie, is_cie_stream, instruction);
-            if (pc < fde.pc_begin + self.current_row.offset) return prev_row;
+    const instruction_bytes: []const u8 = insts: {
+        if (target_offset < cie.last_row.?.offset) {
+            break :insts cie.initial_instructions;
         }
-    }
+        // This is the more common case: start from the CIE's last row.
+        assert(vm.columns.items.len == 0);
+        vm.current_row = .{
+            .offset = cie.last_row.?.offset,
+            .cfa = cie.last_row.?.cfa,
+            .columns = .{
+                .start = 0,
+                .len = @intCast(cie.last_row.?.cols.len),
+            },
+        };
+        try vm.columns.appendSlice(gpa, cie.last_row.?.cols);
+        vm.cie_row = vm.current_row;
+        break :insts fde.instructions;
+    };
 
-    return self.current_row;
+    try vm.evalInstructions(
+        gpa,
+        cie,
+        target_offset,
+        instruction_bytes,
+        addr_size_bytes,
+        endian,
+    );
+    return vm.current_row;
 }
 
-fn resolveCopyOnWrite(self: *VirtualMachine, gpa: Allocator) !void {
-    if (!self.current_row.copy_on_write) return;
+/// Evaluates instructions from `instruction_bytes` until `target_addr` is reached or all
+/// instructions have been evaluated.
+fn evalInstructions(
+    vm: *VirtualMachine,
+    gpa: Allocator,
+    cie: *const Unwind.CommonInformationEntry,
+    target_addr: u64,
+    instruction_bytes: []const u8,
+    addr_size_bytes: u8,
+    endian: std.builtin.Endian,
+) !void {
+    var fr: std.Io.Reader = .fixed(instruction_bytes);
+    while (fr.seek < fr.buffer.len) {
+        switch (try Instruction.read(&fr, addr_size_bytes, endian)) {
+            .nop => {
+                // A nop often means we have reached the trailing padding, in which case every
+                // remaining byte is 0 (each 0 byte encodes one nop), so we can stop early.
+                if (std.mem.allEqual(u8, fr.buffered(), 0)) return;
+            },
+
+            .remember_state => {
+                try vm.stack.append(gpa, .{
+                    .cfa = vm.current_row.cfa,
+                    .columns = vm.current_row.columns,
+                });
+                const cols_len = vm.current_row.columns.len;
+                const copy_start = vm.columns.items.len;
+                assert(vm.current_row.columns.start == copy_start - cols_len);
+                try vm.columns.ensureUnusedCapacity(gpa, cols_len); // to prevent aliasing issues
+                vm.columns.appendSliceAssumeCapacity(vm.columns.items[copy_start - cols_len ..]);
+                vm.current_row.columns.start = copy_start;
+            },
+            .restore_state => {
+                const restored = vm.stack.pop() orelse return error.InvalidOperation;
+                vm.columns.shrinkRetainingCapacity(restored.columns.start + restored.columns.len);
+
+                vm.current_row.cfa = restored.cfa;
+                vm.current_row.columns = restored.columns;
+            },
 
-    const new_start = self.columns.items.len;
-    if (self.current_row.columns.len > 0) {
-        try self.columns.ensureUnusedCapacity(gpa, self.current_row.columns.len);
-        self.columns.appendSliceAssumeCapacity(self.rowColumns(self.current_row));
-        self.current_row.columns.start = new_start;
+            .advance_loc => |delta| {
+                const new_addr = vm.current_row.offset + delta * cie.code_alignment_factor;
+                if (new_addr > target_addr) return;
+                vm.current_row.offset = new_addr;
+            },
+            .set_loc => |new_addr| {
+                if (new_addr <= vm.current_row.offset) return error.InvalidOperation;
+                if (cie.segment_selector_size != 0) return error.InvalidOperation; // unsupported
+                // TODO: support nonzero segment_selector_size (DWARF v4 .debug_frame)
+
+                if (new_addr > target_addr) return;
+                vm.current_row.offset = new_addr;
+            },
+
+            .register => |reg| {
+                const column = try vm.getOrAddColumn(gpa, reg.index);
+                column.rule = switch (reg.rule) {
+                    .restore => rule: {
+                        const cie_row = &(vm.cie_row orelse return error.InvalidOperation);
+                        for (vm.rowColumns(cie_row)) |cie_col| {
+                            if (cie_col.register == reg.index) break :rule cie_col.rule;
+                        }
+                        break :rule .default;
+                    },
+                    .undefined => .undefined,
+                    .same_value => .same_value,
+                    .offset_uf => |off| .{ .offset = @as(i64, @intCast(off)) * cie.data_alignment_factor },
+                    .offset_sf => |off| .{ .offset = off * cie.data_alignment_factor },
+                    .val_offset_uf => |off| .{ .val_offset = @as(i64, @intCast(off)) * cie.data_alignment_factor },
+                    .val_offset_sf => |off| .{ .val_offset = off * cie.data_alignment_factor },
+                    .register => |callee_reg| .{ .register = callee_reg },
+                    .expr => |len| .{ .expression = try takeExprBlock(&fr, len) },
+                    .val_expr => |len| .{ .val_expression = try takeExprBlock(&fr, len) },
+                };
+            },
+            .def_cfa => |cfa| vm.current_row.cfa = .{ .reg_off = .{
+                .register = cfa.register,
+                .offset = @intCast(cfa.offset),
+            } },
+            .def_cfa_sf => |cfa| vm.current_row.cfa = .{ .reg_off = .{
+                .register = cfa.register,
+                .offset = cfa.offset_sf * cie.data_alignment_factor,
+            } },
+            .def_cfa_reg => |register| switch (vm.current_row.cfa) {
+                .none, .expression => return error.InvalidOperation,
+                .reg_off => |*ro| ro.register = register,
+            },
+            .def_cfa_offset => |offset| switch (vm.current_row.cfa) {
+                .none, .expression => return error.InvalidOperation,
+                .reg_off => |*ro| ro.offset = @intCast(offset),
+            },
+            .def_cfa_offset_sf => |offset_sf| switch (vm.current_row.cfa) {
+                .none, .expression => return error.InvalidOperation,
+                .reg_off => |*ro| ro.offset = offset_sf * cie.data_alignment_factor,
+            },
+            .def_cfa_expr => |len| {
+                vm.current_row.cfa = .{ .expression = try takeExprBlock(&fr, len) };
+            },
+        }
     }
 }
 
-/// Executes a single instruction.
-/// If this instruction is from the CIE, `is_initial` should be set.
-/// Returns the value of `current_row` before executing this instruction.
-pub fn step(
-    self: *VirtualMachine,
-    gpa: Allocator,
-    cie: Dwarf.Unwind.CommonInformationEntry,
-    is_initial: bool,
-    instruction: Dwarf.call_frame.Instruction,
-) !Row {
-    // CIE instructions must be run before FDE instructions
-    assert(!is_initial or self.cie_row == null);
-    if (!is_initial and self.cie_row == null) {
-        self.cie_row = self.current_row;
-        self.current_row.copy_on_write = true;
-    }
+fn takeExprBlock(r: *std.Io.Reader, len: usize) error{ ReadFailed, InvalidOperand }![]const u8 {
+    return r.take(len) catch |err| switch (err) {
+        error.ReadFailed => |e| return e,
+        error.EndOfStream => return error.InvalidOperand,
+    };
+}
 
-    const prev_row = self.current_row;
-    switch (instruction) {
-        .set_loc => |i| {
-            if (i.address <= self.current_row.offset) return error.InvalidOperation;
-            if (cie.segment_selector_size != 0) return error.InvalidOperation; // unsupported
-            // TODO: Check cie.segment_selector_size != 0 for DWARFV4
-            self.current_row.offset = i.address;
-        },
-        inline .advance_loc,
-        .advance_loc1,
-        .advance_loc2,
-        .advance_loc4,
-        => |i| {
-            self.current_row.offset += i.delta * cie.code_alignment_factor;
-            self.current_row.copy_on_write = true;
-        },
-        inline .offset,
-        .offset_extended,
-        .offset_extended_sf,
-        => |i| {
-            try self.resolveCopyOnWrite(gpa);
-            const column = try self.getOrAddColumn(gpa, i.register);
-            column.rule = .{ .offset = @as(i64, @intCast(i.offset)) * cie.data_alignment_factor };
-        },
-        inline .restore,
-        .restore_extended,
-        => |i| {
-            try self.resolveCopyOnWrite(gpa);
-            if (self.cie_row) |cie_row| {
-                const column = try self.getOrAddColumn(gpa, i.register);
-                column.rule = for (self.rowColumns(cie_row)) |cie_column| {
-                    if (cie_column.register == i.register) break cie_column.rule;
-                } else .{ .default = {} };
-            } else return error.InvalidOperation;
-        },
-        .nop => {},
-        .undefined => |i| {
-            try self.resolveCopyOnWrite(gpa);
-            const column = try self.getOrAddColumn(gpa, i.register);
-            column.rule = .{ .undefined = {} };
-        },
-        .same_value => |i| {
-            try self.resolveCopyOnWrite(gpa);
-            const column = try self.getOrAddColumn(gpa, i.register);
-            column.rule = .{ .same_value = {} };
-        },
-        .register => |i| {
-            try self.resolveCopyOnWrite(gpa);
-            const column = try self.getOrAddColumn(gpa, i.register);
-            column.rule = .{ .register = i.target_register };
-        },
-        .remember_state => {
-            try self.stack.append(gpa, .{
-                .cfa = self.current_row.cfa,
-                .columns = self.current_row.columns,
-            });
-            self.current_row.copy_on_write = true;
-        },
-        .restore_state => {
-            const restored = self.stack.pop() orelse return error.InvalidOperation;
-            self.columns.shrinkRetainingCapacity(self.columns.items.len - self.current_row.columns.len);
-            try self.columns.ensureUnusedCapacity(gpa, restored.columns.len);
-
-            self.current_row.cfa = restored.cfa;
-            self.current_row.columns.start = self.columns.items.len;
-            self.current_row.columns.len = restored.columns.len;
-            self.columns.appendSliceAssumeCapacity(self.columns.items[restored.columns.start..][0..restored.columns.len]);
-        },
-        .def_cfa => |i| {
-            try self.resolveCopyOnWrite(gpa);
-            self.current_row.cfa = .{
-                .register = i.register,
-                .rule = .{ .val_offset = @intCast(i.offset) },
-            };
-        },
-        .def_cfa_sf => |i| {
-            try self.resolveCopyOnWrite(gpa);
-            self.current_row.cfa = .{
-                .register = i.register,
-                .rule = .{ .val_offset = i.offset * cie.data_alignment_factor },
-            };
-        },
-        .def_cfa_register => |i| {
-            try self.resolveCopyOnWrite(gpa);
-            if (self.current_row.cfa.register == null or self.current_row.cfa.rule != .val_offset) return error.InvalidOperation;
-            self.current_row.cfa.register = i.register;
-        },
-        .def_cfa_offset => |i| {
-            try self.resolveCopyOnWrite(gpa);
-            if (self.current_row.cfa.register == null or self.current_row.cfa.rule != .val_offset) return error.InvalidOperation;
-            self.current_row.cfa.rule = .{
-                .val_offset = @intCast(i.offset),
-            };
-        },
-        .def_cfa_offset_sf => |i| {
-            try self.resolveCopyOnWrite(gpa);
-            if (self.current_row.cfa.register == null or self.current_row.cfa.rule != .val_offset) return error.InvalidOperation;
-            self.current_row.cfa.rule = .{
-                .val_offset = i.offset * cie.data_alignment_factor,
-            };
-        },
-        .def_cfa_expression => |i| {
-            try self.resolveCopyOnWrite(gpa);
-            self.current_row.cfa.register = undefined;
-            self.current_row.cfa.rule = .{
-                .expression = i.block,
-            };
+const OpcodeByte = packed struct(u8) {
+    low: packed union {
+        operand: u6,
+        extended: enum(u6) {
+            nop = 0,
+            set_loc = 1,
+            advance_loc1 = 2,
+            advance_loc2 = 3,
+            advance_loc4 = 4,
+            offset_extended = 5,
+            restore_extended = 6,
+            undefined = 7,
+            same_value = 8,
+            register = 9,
+            remember_state = 10,
+            restore_state = 11,
+            def_cfa = 12,
+            def_cfa_register = 13,
+            def_cfa_offset = 14,
+            def_cfa_expression = 15,
+            expression = 16,
+            offset_extended_sf = 17,
+            def_cfa_sf = 18,
+            def_cfa_offset_sf = 19,
+            val_offset = 20,
+            val_offset_sf = 21,
+            val_expression = 22,
+            _,
         },
-        .expression => |i| {
-            try self.resolveCopyOnWrite(gpa);
-            const column = try self.getOrAddColumn(gpa, i.register);
-            column.rule = .{
-                .expression = i.block,
-            };
-        },
-        .val_offset => |i| {
-            try self.resolveCopyOnWrite(gpa);
-            const column = try self.getOrAddColumn(gpa, i.register);
-            column.rule = .{
-                .val_offset = @as(i64, @intCast(i.offset)) * cie.data_alignment_factor,
-            };
-        },
-        .val_offset_sf => |i| {
-            try self.resolveCopyOnWrite(gpa);
-            const column = try self.getOrAddColumn(gpa, i.register);
-            column.rule = .{
-                .val_offset = i.offset * cie.data_alignment_factor,
-            };
-        },
-        .val_expression => |i| {
-            try self.resolveCopyOnWrite(gpa);
-            const column = try self.getOrAddColumn(gpa, i.register);
-            column.rule = .{
-                .val_expression = i.block,
-            };
+    },
+    opcode: enum(u2) {
+        extended = 0,
+        advance_loc = 1,
+        offset = 2,
+        restore = 3,
+    },
+};
+
+pub const Instruction = union(enum) {
+    nop,
+    remember_state,
+    restore_state,
+    advance_loc: u32,
+    set_loc: u64,
+
+    register: struct {
+        index: u8,
+        rule: union(enum) {
+            restore, // restore this register's rule from the CIE's last row
+            undefined,
+            same_value,
+            offset_uf: u64,
+            offset_sf: i64,
+            val_offset_uf: u64,
+            val_offset_sf: i64,
+            register: u8,
+            /// Value is the number of bytes in the DWARF expression, which the caller must read.
+            expr: usize,
+            /// Value is the number of bytes in the DWARF expression, which the caller must read.
+            val_expr: usize,
         },
-    }
+    },
 
-    return prev_row;
-}
+    def_cfa: struct {
+        register: u8,
+        offset: u64,
+    },
+    def_cfa_sf: struct {
+        register: u8,
+        offset_sf: i64,
+    },
+    def_cfa_reg: u8,
+    def_cfa_offset: u64,
+    def_cfa_offset_sf: i64,
+    /// Value is the number of bytes in the DWARF expression, which the caller must read.
+    def_cfa_expr: usize,
+
+    pub fn read(
+        reader: *std.Io.Reader,
+        addr_size_bytes: u8,
+        endian: std.builtin.Endian,
+    ) !Instruction {
+        const inst: OpcodeByte = @bitCast(try reader.takeByte());
+        return switch (inst.opcode) {
+            .advance_loc => .{ .advance_loc = inst.low.operand },
+            .offset => .{ .register = .{
+                .index = inst.low.operand,
+                .rule = .{ .offset_uf = try reader.takeLeb128(u64) },
+            } },
+            .restore => .{ .register = .{
+                .index = inst.low.operand,
+                .rule = .restore,
+            } },
+            .extended => switch (inst.low.extended) {
+                .nop => .nop,
+                .remember_state => .remember_state,
+                .restore_state => .restore_state,
+                .advance_loc1 => .{ .advance_loc = try reader.takeByte() },
+                .advance_loc2 => .{ .advance_loc = try reader.takeInt(u16, endian) },
+                .advance_loc4 => .{ .advance_loc = try reader.takeInt(u32, endian) },
+                .set_loc => .{ .set_loc = switch (addr_size_bytes) {
+                    2 => try reader.takeInt(u16, endian),
+                    4 => try reader.takeInt(u32, endian),
+                    8 => try reader.takeInt(u64, endian),
+                    else => return error.UnsupportedAddrSize,
+                } },
+
+                .offset_extended => .{ .register = .{
+                    .index = try reader.takeLeb128(u8),
+                    .rule = .{ .offset_uf = try reader.takeLeb128(u64) },
+                } },
+                .offset_extended_sf => .{ .register = .{
+                    .index = try reader.takeLeb128(u8),
+                    .rule = .{ .offset_sf = try reader.takeLeb128(i64) },
+                } },
+                .restore_extended => .{ .register = .{
+                    .index = try reader.takeLeb128(u8),
+                    .rule = .restore,
+                } },
+                .undefined => .{ .register = .{
+                    .index = try reader.takeLeb128(u8),
+                    .rule = .undefined,
+                } },
+                .same_value => .{ .register = .{
+                    .index = try reader.takeLeb128(u8),
+                    .rule = .same_value,
+                } },
+                .register => .{ .register = .{
+                    .index = try reader.takeLeb128(u8),
+                    .rule = .{ .register = try reader.takeLeb128(u8) },
+                } },
+                .val_offset => .{ .register = .{
+                    .index = try reader.takeLeb128(u8),
+                    .rule = .{ .val_offset_uf = try reader.takeLeb128(u64) },
+                } },
+                .val_offset_sf => .{ .register = .{
+                    .index = try reader.takeLeb128(u8),
+                    .rule = .{ .val_offset_sf = try reader.takeLeb128(i64) },
+                } },
+                .expression => .{ .register = .{
+                    .index = try reader.takeLeb128(u8),
+                    .rule = .{ .expr = try reader.takeLeb128(usize) },
+                } },
+                .val_expression => .{ .register = .{
+                    .index = try reader.takeLeb128(u8),
+                    .rule = .{ .val_expr = try reader.takeLeb128(usize) },
+                } },
+
+                .def_cfa => .{ .def_cfa = .{
+                    .register = try reader.takeLeb128(u8),
+                    .offset = try reader.takeLeb128(u64),
+                } },
+                .def_cfa_sf => .{ .def_cfa_sf = .{
+                    .register = try reader.takeLeb128(u8),
+                    .offset_sf = try reader.takeLeb128(i64),
+                } },
+                .def_cfa_register => .{ .def_cfa_reg = try reader.takeLeb128(u8) },
+                .def_cfa_offset => .{ .def_cfa_offset = try reader.takeLeb128(u64) },
+                .def_cfa_offset_sf => .{ .def_cfa_offset_sf = try reader.takeLeb128(i64) },
+                .def_cfa_expression => .{ .def_cfa_expr = try reader.takeLeb128(usize) },
+
+                _ => switch (@intFromEnum(inst.low.extended)) {
+                    0x1C...0x3F => return error.UnimplementedUserOpcode,
+                    else => return error.InvalidOpcode,
+                },
+            },
+        };
+    }
+};
 
 const std = @import("../../../std.zig");
 const assert = std.debug.assert;
 const Allocator = std.mem.Allocator;
-const Dwarf = std.debug.Dwarf;
+const Unwind = std.debug.Dwarf.Unwind;
 
 const VirtualMachine = @This();
lib/std/debug/Dwarf/call_frame.zig
@@ -1,288 +0,0 @@
-const std = @import("../../std.zig");
-const Reader = std.Io.Reader;
-
-/// TODO merge with std.dwarf.CFA
-const Opcode = enum(u8) {
-    advance_loc = 0x1 << 6,
-    offset = 0x2 << 6,
-    restore = 0x3 << 6,
-
-    nop = 0x00,
-    set_loc = 0x01,
-    advance_loc1 = 0x02,
-    advance_loc2 = 0x03,
-    advance_loc4 = 0x04,
-    offset_extended = 0x05,
-    restore_extended = 0x06,
-    undefined = 0x07,
-    same_value = 0x08,
-    register = 0x09,
-    remember_state = 0x0a,
-    restore_state = 0x0b,
-    def_cfa = 0x0c,
-    def_cfa_register = 0x0d,
-    def_cfa_offset = 0x0e,
-    def_cfa_expression = 0x0f,
-    expression = 0x10,
-    offset_extended_sf = 0x11,
-    def_cfa_sf = 0x12,
-    def_cfa_offset_sf = 0x13,
-    val_offset = 0x14,
-    val_offset_sf = 0x15,
-    val_expression = 0x16,
-
-    // These opcodes encode an operand in the lower 6 bits of the opcode itself
-    pub const lo_inline = @intFromEnum(Opcode.advance_loc);
-    pub const hi_inline = @intFromEnum(Opcode.restore) | 0b111111;
-
-    // These opcodes are trailed by zero or more operands
-    pub const lo_reserved = @intFromEnum(Opcode.nop);
-    pub const hi_reserved = @intFromEnum(Opcode.val_expression);
-
-    // Vendor-specific opcodes
-    pub const lo_user = 0x1c;
-    pub const hi_user = 0x3f;
-};
-
-/// The returned slice points into `reader.buffer`.
-fn readBlock(reader: *Reader) ![]const u8 {
-    const block_len = try reader.takeLeb128(usize);
-    return reader.take(block_len) catch |err| switch (err) {
-        error.EndOfStream => return error.InvalidOperand,
-        error.ReadFailed => |e| return e,
-    };
-}
-
-pub const Instruction = union(Opcode) {
-    advance_loc: struct {
-        delta: u8,
-    },
-    offset: struct {
-        register: u8,
-        offset: u64,
-    },
-    restore: struct {
-        register: u8,
-    },
-    nop: void,
-    set_loc: struct {
-        address: u64,
-    },
-    advance_loc1: struct {
-        delta: u8,
-    },
-    advance_loc2: struct {
-        delta: u16,
-    },
-    advance_loc4: struct {
-        delta: u32,
-    },
-    offset_extended: struct {
-        register: u8,
-        offset: u64,
-    },
-    restore_extended: struct {
-        register: u8,
-    },
-    undefined: struct {
-        register: u8,
-    },
-    same_value: struct {
-        register: u8,
-    },
-    register: struct {
-        register: u8,
-        target_register: u8,
-    },
-    remember_state: void,
-    restore_state: void,
-    def_cfa: struct {
-        register: u8,
-        offset: u64,
-    },
-    def_cfa_register: struct {
-        register: u8,
-    },
-    def_cfa_offset: struct {
-        offset: u64,
-    },
-    def_cfa_expression: struct {
-        block: []const u8,
-    },
-    expression: struct {
-        register: u8,
-        block: []const u8,
-    },
-    offset_extended_sf: struct {
-        register: u8,
-        offset: i64,
-    },
-    def_cfa_sf: struct {
-        register: u8,
-        offset: i64,
-    },
-    def_cfa_offset_sf: struct {
-        offset: i64,
-    },
-    val_offset: struct {
-        register: u8,
-        offset: u64,
-    },
-    val_offset_sf: struct {
-        register: u8,
-        offset: i64,
-    },
-    val_expression: struct {
-        register: u8,
-        block: []const u8,
-    },
-
-    /// `reader` must be a `Reader.fixed` so that regions of its buffer are never invalidated.
-    pub fn read(
-        reader: *Reader,
-        addr_size_bytes: u8,
-        endian: std.builtin.Endian,
-    ) !Instruction {
-        switch (try reader.takeByte()) {
-            Opcode.lo_inline...Opcode.hi_inline => |opcode| {
-                const e: Opcode = @enumFromInt(opcode & 0b11000000);
-                const value: u6 = @intCast(opcode & 0b111111);
-                return switch (e) {
-                    .advance_loc => .{
-                        .advance_loc = .{ .delta = value },
-                    },
-                    .offset => .{
-                        .offset = .{
-                            .register = value,
-                            .offset = try reader.takeLeb128(u64),
-                        },
-                    },
-                    .restore => .{
-                        .restore = .{ .register = value },
-                    },
-                    else => unreachable,
-                };
-            },
-            Opcode.lo_reserved...Opcode.hi_reserved => |opcode| {
-                const e: Opcode = @enumFromInt(opcode);
-                return switch (e) {
-                    .advance_loc,
-                    .offset,
-                    .restore,
-                    => unreachable,
-                    .nop => .{ .nop = {} },
-                    .set_loc => .{ .set_loc = .{
-                        .address = switch (addr_size_bytes) {
-                            2 => try reader.takeInt(u16, endian),
-                            4 => try reader.takeInt(u32, endian),
-                            8 => try reader.takeInt(u64, endian),
-                            else => return error.UnsupportedAddrSize,
-                        },
-                    } },
-                    .advance_loc1 => .{
-                        .advance_loc1 = .{ .delta = try reader.takeByte() },
-                    },
-                    .advance_loc2 => .{
-                        .advance_loc2 = .{ .delta = try reader.takeInt(u16, endian) },
-                    },
-                    .advance_loc4 => .{
-                        .advance_loc4 = .{ .delta = try reader.takeInt(u32, endian) },
-                    },
-                    .offset_extended => .{
-                        .offset_extended = .{
-                            .register = try reader.takeLeb128(u8),
-                            .offset = try reader.takeLeb128(u64),
-                        },
-                    },
-                    .restore_extended => .{
-                        .restore_extended = .{
-                            .register = try reader.takeLeb128(u8),
-                        },
-                    },
-                    .undefined => .{
-                        .undefined = .{
-                            .register = try reader.takeLeb128(u8),
-                        },
-                    },
-                    .same_value => .{
-                        .same_value = .{
-                            .register = try reader.takeLeb128(u8),
-                        },
-                    },
-                    .register => .{
-                        .register = .{
-                            .register = try reader.takeLeb128(u8),
-                            .target_register = try reader.takeLeb128(u8),
-                        },
-                    },
-                    .remember_state => .{ .remember_state = {} },
-                    .restore_state => .{ .restore_state = {} },
-                    .def_cfa => .{
-                        .def_cfa = .{
-                            .register = try reader.takeLeb128(u8),
-                            .offset = try reader.takeLeb128(u64),
-                        },
-                    },
-                    .def_cfa_register => .{
-                        .def_cfa_register = .{
-                            .register = try reader.takeLeb128(u8),
-                        },
-                    },
-                    .def_cfa_offset => .{
-                        .def_cfa_offset = .{
-                            .offset = try reader.takeLeb128(u64),
-                        },
-                    },
-                    .def_cfa_expression => .{
-                        .def_cfa_expression = .{
-                            .block = try readBlock(reader),
-                        },
-                    },
-                    .expression => .{
-                        .expression = .{
-                            .register = try reader.takeLeb128(u8),
-                            .block = try readBlock(reader),
-                        },
-                    },
-                    .offset_extended_sf => .{
-                        .offset_extended_sf = .{
-                            .register = try reader.takeLeb128(u8),
-                            .offset = try reader.takeLeb128(i64),
-                        },
-                    },
-                    .def_cfa_sf => .{
-                        .def_cfa_sf = .{
-                            .register = try reader.takeLeb128(u8),
-                            .offset = try reader.takeLeb128(i64),
-                        },
-                    },
-                    .def_cfa_offset_sf => .{
-                        .def_cfa_offset_sf = .{
-                            .offset = try reader.takeLeb128(i64),
-                        },
-                    },
-                    .val_offset => .{
-                        .val_offset = .{
-                            .register = try reader.takeLeb128(u8),
-                            .offset = try reader.takeLeb128(u64),
-                        },
-                    },
-                    .val_offset_sf => .{
-                        .val_offset_sf = .{
-                            .register = try reader.takeLeb128(u8),
-                            .offset = try reader.takeLeb128(i64),
-                        },
-                    },
-                    .val_expression => .{
-                        .val_expression = .{
-                            .register = try reader.takeLeb128(u8),
-                            .block = try readBlock(reader),
-                        },
-                    },
-                };
-            },
-            Opcode.lo_user...Opcode.hi_user => return error.UnimplementedUserOpcode,
-            else => return error.InvalidOpcode,
-        }
-    }
-};
lib/std/debug/Dwarf/Unwind.zig
@@ -10,7 +10,7 @@
 //! The typical usage of `Unwind` is as follows:
 //!
 //! * Initialize with `initEhFrameHdr` or `initSection`, depending on the available data
-//! * Call `prepareLookup` to construct a search table if necessary
+//! * Call `prepare` to scan CIEs and, if necessary, construct a search table
 //! * Call `lookupPc` to find the section offset of the FDE corresponding to a PC
 //! * Call `getFde` to load the corresponding FDE and CIE
 //! * Check that the PC does indeed fall in that range (`lookupPc` may return a false positive)
@@ -18,7 +18,7 @@
 //!
 //! In some cases, such as when using the "compact unwind" data in Mach-O binaries, the FDE offsets
 //! may already be known. In that case, no call to `lookupPc` is necessary, which means the call to
-//! `prepareLookup` can also be omitted.
+//! `prepare` can be optimized to only scan CIEs.
 
 pub const VirtualMachine = @import("Unwind/VirtualMachine.zig");
 
@@ -45,7 +45,7 @@ frame_section: struct {
 
 /// A structure allowing fast lookups of the FDE corresponding to a particular PC. We use a binary
 /// search table for the lookup; essentially, a list of all FDEs ordered by PC range. `null` means
-/// the lookup data is not yet populated, so `prepareLookup` must be called before `lookupPc`.
+/// the lookup data is not yet populated, so `prepare` must be called before `lookupPc`.
 lookup: ?union(enum) {
     /// The `.eh_frame_hdr` section contains a pre-computed search table which we can use.
     eh_frame_hdr: struct {
@@ -58,6 +58,12 @@ lookup: ?union(enum) {
     sorted_fdes: []SortedFdeEntry,
 },
 
+/// Initially empty; populated by `prepare`.
+cie_list: std.MultiArrayList(struct {
+    offset: u64,
+    cie: CommonInformationEntry,
+}),
+
 const SortedFdeEntry = struct {
     /// This FDE's value of `pc_begin`.
     pc_begin: u64,
@@ -83,6 +89,7 @@ pub fn initEhFrameHdr(header: EhFrameHeader, section_vaddr: u64, section_bytes_p
             .vaddr = section_vaddr,
             .table = table,
         } } else null,
+        .cie_list = .empty,
     };
 }
 
@@ -98,16 +105,21 @@ pub fn initSection(section: Section, section_vaddr: u64, section_bytes: []const
             .vaddr = section_vaddr,
         },
         .lookup = null,
+        .cie_list = .empty,
     };
 }
 
-/// Technically, it is only necessary to call this if `prepareLookup` has previously been called,
-/// since no other function here allocates resources.
 pub fn deinit(unwind: *Unwind, gpa: Allocator) void {
     if (unwind.lookup) |lookup| switch (lookup) {
         .eh_frame_hdr => {},
         .sorted_fdes => |fdes| gpa.free(fdes),
     };
+    for (unwind.cie_list.items(.cie)) |*cie| {
+        if (cie.last_row) |*lr| {
+            gpa.free(lr.cols);
+        }
+    }
+    unwind.cie_list.deinit(gpa);
 }
 
 /// Decoded version of the `.eh_frame_hdr` section.
@@ -236,7 +248,6 @@ const EntryHeader = union(enum) {
         bytes_len: u64,
     },
     fde: struct {
-        format: Format,
         /// Offset into the section of the corresponding CIE, *including* its entry header.
         cie_offset: u64,
         /// Remaining bytes in the FDE. These are parseable by `FrameDescriptionEntry.parse`.
@@ -290,7 +301,6 @@ const EntryHeader = union(enum) {
             .debug_frame => cie_ptr_or_id,
         };
         return .{ .fde = .{
-            .format = unit_header.format,
             .cie_offset = cie_offset,
             .bytes_len = remaining_bytes,
         } };
@@ -299,6 +309,7 @@ const EntryHeader = union(enum) {
 
 pub const CommonInformationEntry = struct {
     version: u8,
+    format: Format,
 
     /// In version 4, CIEs can specify the address size used in the CIE and associated FDEs.
     /// This value must be used *only* to parse associated FDEs in `FrameDescriptionEntry.parse`.
@@ -318,6 +329,12 @@ pub const CommonInformationEntry = struct {
 
     initial_instructions: []const u8,
 
+    last_row: ?struct {
+        offset: u64,
+        cfa: VirtualMachine.CfaRule,
+        cols: []VirtualMachine.Column,
+    },
+
     pub const AugmentationKind = enum { none, gcc_eh, lsb_z };
 
     /// This function expects to read the CIE starting with the version field.
@@ -326,6 +343,7 @@ pub const CommonInformationEntry = struct {
     /// `length_offset` specifies the offset of this CIE's length field in the
     /// .eh_frame / .debug_frame section.
     fn parse(
+        format: Format,
         cie_bytes: []const u8,
         section: Section,
         default_addr_size_bytes: u8,
@@ -384,6 +402,7 @@ pub const CommonInformationEntry = struct {
         };
 
         return .{
+            .format = format,
             .version = version,
             .addr_size_bytes = addr_size_bytes,
             .segment_selector_size = segment_selector_size,
@@ -394,6 +413,7 @@ pub const CommonInformationEntry = struct {
             .is_signal_frame = is_signal_frame,
             .augmentation_kind = aug_kind,
             .initial_instructions = r.buffered(),
+            .last_row = null,
         };
     }
 };
@@ -411,7 +431,7 @@ pub const FrameDescriptionEntry = struct {
         /// module's `.eh_frame` section, this will equal `fde_bytes.ptr`.
         fde_vaddr: u64,
         fde_bytes: []const u8,
-        cie: CommonInformationEntry,
+        cie: *const CommonInformationEntry,
         endian: Endian,
     ) !FrameDescriptionEntry {
         if (cie.segment_selector_size != 0) return error.UnsupportedAddrSize;
@@ -446,11 +466,18 @@ pub const FrameDescriptionEntry = struct {
     }
 };
 
-/// Builds the PC FDE lookup table if it is not already built. It is required to call this function
-/// at least once before calling `lookupPc`. Once this function is called, memory has been allocated
-/// and so `deinit` (matching this `gpa`) is required to free it.
-pub fn prepareLookup(unwind: *Unwind, gpa: Allocator, addr_size_bytes: u8, endian: Endian) !void {
-    if (unwind.lookup != null) return;
+/// Builds the CIE list and FDE lookup table if they are not already built. It is required to call
+/// this function at least once before calling `lookupPc` or `getFde`. If only `getFde` is needed,
+/// then `need_lookup` can be set to `false` to make this function more efficient.
+pub fn prepare(
+    unwind: *Unwind,
+    gpa: Allocator,
+    addr_size_bytes: u8,
+    endian: Endian,
+    need_lookup: bool,
+) !void {
+    if (unwind.cie_list.len > 0 and (!need_lookup or unwind.lookup != null)) return;
+    unwind.cie_list.clearRetainingCapacity();
 
     const section = unwind.frame_section;
 
@@ -462,21 +489,28 @@ pub fn prepareLookup(unwind: *Unwind, gpa: Allocator, addr_size_bytes: u8, endia
         const entry_offset = r.seek;
         switch (try EntryHeader.read(&r, entry_offset, section.id, endian)) {
             .cie => |cie_info| {
-                // Ignore CIEs for now; we'll parse them when we read a corresponding FDE
-                try r.discardAll(cast(usize, cie_info.bytes_len) orelse return error.EndOfStream);
+                // We will pre-populate a list of CIEs for efficiency: this avoids work re-parsing
+                // them every time we look up an FDE. It also lets us cache the result of evaluating
+                // the CIE's initial CFI instructions, which is useful because in the vast majority
+                // of cases those instructions will be needed to reach the PC we are unwinding to.
+                const bytes_len = cast(usize, cie_info.bytes_len) orelse return error.EndOfStream;
+                const idx = unwind.cie_list.len;
+                try unwind.cie_list.append(gpa, .{
+                    .offset = entry_offset,
+                    .cie = try .parse(cie_info.format, try r.take(bytes_len), section.id, addr_size_bytes),
+                });
+                errdefer _ = unwind.cie_list.pop().?;
+                try VirtualMachine.populateCieLastRow(gpa, &unwind.cie_list.items(.cie)[idx], addr_size_bytes, endian);
                 continue;
             },
             .fde => |fde_info| {
-                if (fde_info.cie_offset > section.bytes.len) return error.EndOfStream;
-                var cie_r: Reader = .fixed(section.bytes[@intCast(fde_info.cie_offset)..]);
-                const cie_info = switch (try EntryHeader.read(&cie_r, fde_info.cie_offset, section.id, endian)) {
-                    .cie => |cie_info| cie_info,
-                    .fde, .terminator => return bad(), // this is meant to be a CIE
-                };
-                const cie_bytes_len = cast(usize, cie_info.bytes_len) orelse return error.EndOfStream;
-                const fde_bytes_len = cast(usize, fde_info.bytes_len) orelse return error.EndOfStream;
-                const cie: CommonInformationEntry = try .parse(try cie_r.take(cie_bytes_len), section.id, addr_size_bytes);
-                const fde: FrameDescriptionEntry = try .parse(section.vaddr + r.seek, try r.take(fde_bytes_len), cie, endian);
+                const bytes_len = cast(usize, fde_info.bytes_len) orelse return error.EndOfStream;
+                if (!need_lookup) {
+                    try r.discardAll(bytes_len);
+                    continue;
+                }
+                const cie = unwind.findCie(fde_info.cie_offset) orelse return error.InvalidDebugInfo;
+                const fde: FrameDescriptionEntry = try .parse(section.vaddr + r.seek, try r.take(bytes_len), cie, endian);
                 try fde_list.append(gpa, .{
                     .pc_begin = fde.pc_begin,
                     .fde_offset = entry_offset,
@@ -502,12 +536,30 @@ pub fn prepareLookup(unwind: *Unwind, gpa: Allocator, addr_size_bytes: u8, endia
     unwind.lookup = .{ .sorted_fdes = final_fdes };
 }
 
+fn findCie(unwind: *const Unwind, offset: u64) ?*const CommonInformationEntry {
+    const offsets = unwind.cie_list.items(.offset);
+    if (offsets.len == 0) return null;
+    var start: usize = 0;
+    var len: usize = offsets.len;
+    while (len > 1) {
+        const mid = len / 2;
+        if (offset < offsets[start + mid]) {
+            len = mid;
+        } else {
+            start += mid;
+            len -= mid;
+        }
+    }
+    if (offsets[start] != offset) return null;
+    return &unwind.cie_list.items(.cie)[start];
+}
+
 /// Given a program counter value, returns the offset of the corresponding FDE, or `null` if no
 /// matching FDE was found. The returned offset can be passed to `getFde` to load the data
 /// associated with the FDE.
 ///
-/// Before calling this function, `prepareLookup` must return successfully at least once, to ensure
-/// that `unwind.lookup` is populated.
+/// Before calling this function, `prepare` must return successfully at least once, to ensure that
+/// `unwind.lookup` is populated.
 ///
 /// The return value may be a false positive. After loading the FDE with `loadFde`, the caller must
 /// validate that `pc` is indeed in its range -- if it is not, then no FDE matches `pc`.
@@ -524,20 +576,25 @@ pub fn lookupPc(unwind: *const Unwind, pc: u64, addr_size_bytes: u8, endian: End
         },
         .sorted_fdes => |sorted_fdes| sorted_fdes,
     };
-    const first_bad_idx = std.sort.partitionPoint(SortedFdeEntry, sorted_fdes, pc, struct {
-        fn canIncludePc(target_pc: u64, entry: SortedFdeEntry) bool {
-            return target_pc >= entry.pc_begin; // i.e. does 'entry_pc..<last pc>' include 'target_pc'
+    if (sorted_fdes.len == 0) return null;
+    var start: usize = 0;
+    var len: usize = sorted_fdes.len;
+    while (len > 1) {
+        const half = len / 2;
+        if (pc < sorted_fdes[start + half].pc_begin) {
+            len = half;
+        } else {
+            start += half;
+            len -= half;
         }
-    }.canIncludePc);
-    // `first_bad_idx` is the index of the first FDE whose `pc_begin` is too high to include `pc`.
-    // So if any FDE matches, it'll be the one at `first_bad_idx - 1` (maybe false positive).
-    if (first_bad_idx == 0) return null;
-    return sorted_fdes[first_bad_idx - 1].fde_offset;
+    }
+    // If any FDE matches, it'll be the one at `start` (maybe false positive).
+    return sorted_fdes[start].fde_offset;
 }
 
 /// Get the FDE at a given offset, as well as its associated CIE. This offset typically comes from
 /// `lookupPc`. The CFI instructions within can be evaluated with `VirtualMachine`.
-pub fn getFde(unwind: *const Unwind, fde_offset: u64, addr_size_bytes: u8, endian: Endian) !struct { Format, CommonInformationEntry, FrameDescriptionEntry } {
+pub fn getFde(unwind: *const Unwind, fde_offset: u64, endian: Endian) !struct { *const CommonInformationEntry, FrameDescriptionEntry } {
     const section = unwind.frame_section;
 
     if (fde_offset > section.bytes.len) return error.EndOfStream;
@@ -547,19 +604,7 @@ pub fn getFde(unwind: *const Unwind, fde_offset: u64, addr_size_bytes: u8, endia
         .cie, .terminator => return bad(), // This is meant to be an FDE
     };
 
-    const cie_offset = fde_info.cie_offset;
-    if (cie_offset > section.bytes.len) return error.EndOfStream;
-    var cie_reader: Reader = .fixed(section.bytes[@intCast(cie_offset)..]);
-    const cie_info = switch (try EntryHeader.read(&cie_reader, cie_offset, section.id, endian)) {
-        .cie => |info| info,
-        .fde, .terminator => return bad(), // This is meant to be a CIE
-    };
-
-    const cie: CommonInformationEntry = try .parse(
-        try cie_reader.take(cast(usize, cie_info.bytes_len) orelse return error.EndOfStream),
-        section.id,
-        addr_size_bytes,
-    );
+    const cie = unwind.findCie(fde_info.cie_offset) orelse return error.InvalidDebugInfo;
     const fde: FrameDescriptionEntry = try .parse(
         section.vaddr + fde_offset + fde_reader.seek,
         try fde_reader.take(cast(usize, fde_info.bytes_len) orelse return error.EndOfStream),
@@ -567,7 +612,7 @@ pub fn getFde(unwind: *const Unwind, fde_offset: u64, addr_size_bytes: u8, endia
         endian,
     );
 
-    return .{ cie_info.format, cie, fde };
+    return .{ cie, fde };
 }
 
 const EhPointerContext = struct {
lib/std/debug/SelfInfo/DarwinModule.zig
@@ -20,7 +20,7 @@ pub fn lookup(cache: *LookupCache, gpa: Allocator, address: usize) Error!DarwinM
         },
     }
 }
-fn loadUnwindInfo(module: *const DarwinModule) DebugInfo.Unwind {
+fn loadUnwindInfo(module: *const DarwinModule, gpa: Allocator, out: *DebugInfo) !void {
     const header: *std.macho.mach_header = @ptrFromInt(module.text_base);
 
     var it: macho.LoadCommandIterator = .{
@@ -36,21 +36,57 @@ fn loadUnwindInfo(module: *const DarwinModule) DebugInfo.Unwind {
 
     const vmaddr_slide = module.text_base - text_vmaddr;
 
-    var unwind_info: ?[]const u8 = null;
-    var eh_frame: ?[]const u8 = null;
+    var opt_unwind_info: ?[]const u8 = null;
+    var opt_eh_frame: ?[]const u8 = null;
     for (sections) |sect| {
         if (mem.eql(u8, sect.sectName(), "__unwind_info")) {
             const sect_ptr: [*]u8 = @ptrFromInt(@as(usize, @intCast(vmaddr_slide + sect.addr)));
-            unwind_info = sect_ptr[0..@intCast(sect.size)];
+            opt_unwind_info = sect_ptr[0..@intCast(sect.size)];
         } else if (mem.eql(u8, sect.sectName(), "__eh_frame")) {
             const sect_ptr: [*]u8 = @ptrFromInt(@as(usize, @intCast(vmaddr_slide + sect.addr)));
-            eh_frame = sect_ptr[0..@intCast(sect.size)];
+            opt_eh_frame = sect_ptr[0..@intCast(sect.size)];
         }
     }
-    return .{
+    const eh_frame = opt_eh_frame orelse {
+        out.unwind = .{
+            .vmaddr_slide = vmaddr_slide,
+            .unwind_info = opt_unwind_info,
+            .dwarf = null,
+            .dwarf_cache = undefined,
+        };
+        return;
+    };
+    var dwarf: Dwarf.Unwind = .initSection(.eh_frame, @intFromPtr(eh_frame.ptr) - vmaddr_slide, eh_frame);
+    errdefer dwarf.deinit(gpa);
+    // We don't need lookups, so this call is just for scanning CIEs.
+    dwarf.prepare(gpa, @sizeOf(usize), native_endian, false) catch |err| switch (err) {
+        error.ReadFailed => unreachable, // it's all fixed buffers
+        error.InvalidDebugInfo,
+        error.MissingDebugInfo,
+        error.OutOfMemory,
+        => |e| return e,
+        error.EndOfStream,
+        error.Overflow,
+        error.StreamTooLong,
+        error.InvalidOperand,
+        error.InvalidOpcode,
+        error.InvalidOperation,
+        => return error.InvalidDebugInfo,
+        error.UnsupportedAddrSize,
+        error.UnsupportedDwarfVersion,
+        error.UnimplementedUserOpcode,
+        => return error.UnsupportedDebugInfo,
+    };
+
+    const dwarf_cache = try gpa.create(UnwindContext.Cache);
+    errdefer gpa.destroy(dwarf_cache);
+    dwarf_cache.init();
+
+    out.unwind = .{
         .vmaddr_slide = vmaddr_slide,
-        .unwind_info = unwind_info,
-        .eh_frame = eh_frame,
+        .unwind_info = opt_unwind_info,
+        .dwarf = dwarf,
+        .dwarf_cache = dwarf_cache,
     };
 }
 fn loadMachO(module: *const DarwinModule, gpa: Allocator) !DebugInfo.LoadedMachO {
@@ -350,10 +386,10 @@ pub fn unwindFrame(module: *const DarwinModule, gpa: Allocator, di: *DebugInfo,
     };
 }
 fn unwindFrameInner(module: *const DarwinModule, gpa: Allocator, di: *DebugInfo, context: *UnwindContext) !usize {
-    const unwind: *const DebugInfo.Unwind = u: {
+    const unwind: *DebugInfo.Unwind = u: {
         di.mutex.lock();
         defer di.mutex.unlock();
-        if (di.unwind == null) di.unwind = module.loadUnwindInfo();
+        if (di.unwind == null) try module.loadUnwindInfo(gpa, di);
         break :u &di.unwind.?;
     };
 
@@ -580,14 +616,8 @@ fn unwindFrameInner(module: *const DarwinModule, gpa: Allocator, di: *DebugInfo,
                 break :ip new_ip;
             },
             .DWARF => {
-                const eh_frame = unwind.eh_frame orelse return error.MissingDebugInfo;
-                const eh_frame_vaddr = @intFromPtr(eh_frame.ptr) - unwind.vmaddr_slide;
-                return context.unwindFrame(
-                    gpa,
-                    &.initSection(.eh_frame, eh_frame_vaddr, eh_frame),
-                    unwind.vmaddr_slide,
-                    @intCast(encoding.value.x86_64.dwarf),
-                );
+                const dwarf = &(unwind.dwarf orelse return error.MissingDebugInfo);
+                return context.unwindFrame(unwind.dwarf_cache, gpa, dwarf, unwind.vmaddr_slide, encoding.value.x86_64.dwarf);
             },
         },
         .aarch64, .aarch64_be => switch (encoding.mode.arm64) {
@@ -600,14 +630,8 @@ fn unwindFrameInner(module: *const DarwinModule, gpa: Allocator, di: *DebugInfo,
                 break :ip new_ip;
             },
             .DWARF => {
-                const eh_frame = unwind.eh_frame orelse return error.MissingDebugInfo;
-                const eh_frame_vaddr = @intFromPtr(eh_frame.ptr) - unwind.vmaddr_slide;
-                return context.unwindFrame(
-                    gpa,
-                    &.initSection(.eh_frame, eh_frame_vaddr, eh_frame),
-                    unwind.vmaddr_slide,
-                    @intCast(encoding.value.x86_64.dwarf),
-                );
+                const dwarf = &(unwind.dwarf orelse return error.MissingDebugInfo);
+                return context.unwindFrame(unwind.dwarf_cache, gpa, dwarf, unwind.vmaddr_slide, encoding.value.arm64.dwarf);
             },
             .FRAME => ip: {
                 const frame = encoding.value.arm64.frame;
@@ -691,12 +715,15 @@ pub const DebugInfo = struct {
     }
 
     const Unwind = struct {
-        /// The slide applied to the following sections. So, `unwind_info.ptr` is this many bytes
-        /// higher than the vmaddr of `__unwind_info`, and likewise for `__eh_frame`.
+        /// The slide applied to the `__unwind_info` and `__eh_frame` sections.
+        /// So, `unwind_info.ptr` is this many bytes higher than the section's vmaddr.
         vmaddr_slide: u64,
-        // Backed by the in-memory sections mapped by the loader
+        /// Backed by the in-memory section mapped by the loader.
         unwind_info: ?[]const u8,
-        eh_frame: ?[]const u8,
+        /// Backed by the in-memory `__eh_frame` section mapped by the loader.
+        dwarf: ?Dwarf.Unwind,
+        /// This is `undefined` if `dwarf == null`.
+        dwarf_cache: *UnwindContext.Cache,
     };
 
     const LoadedMachO = struct {
lib/std/debug/SelfInfo/ElfModule.zig
@@ -3,8 +3,22 @@ name: []const u8,
 build_id: ?[]const u8,
 gnu_eh_frame: ?[]const u8,
 
-/// No cache needed, because `dl_iterate_phdr` is already fast.
-pub const LookupCache = void;
+pub const LookupCache = struct {
+    rwlock: std.Thread.RwLock,
+    ranges: std.ArrayList(Range),
+    const Range = struct {
+        start: usize,
+        len: usize,
+        mod: ElfModule,
+    };
+    pub const init: LookupCache = .{
+        .rwlock = .{},
+        .ranges = .empty,
+    };
+    pub fn deinit(lc: *LookupCache, gpa: Allocator) void {
+        lc.ranges.deinit(gpa);
+    }
+};
 
 pub const DebugInfo = struct {
     /// Held while checking and/or populating `loaded_elf`/`scanned_dwarf`/`unwind`.
@@ -14,18 +28,24 @@ pub const DebugInfo = struct {
 
     loaded_elf: ?ElfFile,
     scanned_dwarf: bool,
-    unwind: [2]?Dwarf.Unwind,
+    unwind: if (supports_unwinding) [2]?Dwarf.Unwind else void,
+    unwind_cache: if (supports_unwinding) *UnwindContext.Cache else void,
+
     pub const init: DebugInfo = .{
         .mutex = .{},
         .loaded_elf = null,
         .scanned_dwarf = false,
-        .unwind = @splat(null),
+        .unwind = if (supports_unwinding) @splat(null),
+        .unwind_cache = undefined,
     };
     pub fn deinit(di: *DebugInfo, gpa: Allocator) void {
         if (di.loaded_elf) |*loaded_elf| loaded_elf.deinit(gpa);
-        for (&di.unwind) |*opt_unwind| {
-            const unwind = &(opt_unwind.* orelse continue);
-            unwind.deinit(gpa);
+        if (supports_unwinding) {
+            if (di.unwind[0] != null) gpa.destroy(di.unwind_cache);
+            for (&di.unwind) |*opt_unwind| {
+                const unwind = &(opt_unwind.* orelse continue);
+                unwind.deinit(gpa);
+            }
         }
     }
 };
@@ -34,75 +54,84 @@ pub fn key(m: ElfModule) usize {
     return m.load_offset;
 }
 pub fn lookup(cache: *LookupCache, gpa: Allocator, address: usize) Error!ElfModule {
-    _ = cache;
-    _ = gpa;
-    const DlIterContext = struct {
-        /// input
-        address: usize,
-        /// output
-        module: ElfModule,
+    if (lookupInCache(cache, address)) |m| return m;
 
-        fn callback(info: *std.posix.dl_phdr_info, size: usize, context: *@This()) !void {
-            _ = size;
-            // The base address is too high
-            if (context.address < info.addr)
-                return;
+    {
+        // Check a new module hasn't been loaded
+        cache.rwlock.lock();
+        defer cache.rwlock.unlock();
+        const DlIterContext = struct {
+            ranges: *std.ArrayList(LookupCache.Range),
+            gpa: Allocator,
 
-            const phdrs = info.phdr[0..info.phnum];
-            for (phdrs) |*phdr| {
-                if (phdr.p_type != elf.PT_LOAD) continue;
+            fn callback(info: *std.posix.dl_phdr_info, size: usize, context: *@This()) !void {
+                _ = size;
 
-                // Overflowing addition is used to handle the case of VSDOs having a p_vaddr = 0xffffffffff700000
-                const seg_start = info.addr +% phdr.p_vaddr;
-                const seg_end = seg_start + phdr.p_memsz;
-                if (context.address >= seg_start and context.address < seg_end) {
-                    context.module = .{
-                        .load_offset = info.addr,
-                        // Android libc uses NULL instead of "" to mark the main program
-                        .name = mem.sliceTo(info.name, 0) orelse "",
-                        .build_id = null,
-                        .gnu_eh_frame = null,
-                    };
-                    break;
+                var mod: ElfModule = .{
+                    .load_offset = info.addr,
+                    // Android libc uses NULL instead of "" to mark the main program
+                    .name = mem.sliceTo(info.name, 0) orelse "",
+                    .build_id = null,
+                    .gnu_eh_frame = null,
+                };
+
+                // Populate `build_id` and `gnu_eh_frame`
+                for (info.phdr[0..info.phnum]) |phdr| {
+                    switch (phdr.p_type) {
+                        elf.PT_NOTE => {
+                            // Look for .note.gnu.build-id
+                            const segment_ptr: [*]const u8 = @ptrFromInt(info.addr + phdr.p_vaddr);
+                            var r: std.Io.Reader = .fixed(segment_ptr[0..phdr.p_memsz]);
+                            const name_size = r.takeInt(u32, native_endian) catch continue;
+                            const desc_size = r.takeInt(u32, native_endian) catch continue;
+                            const note_type = r.takeInt(u32, native_endian) catch continue;
+                            const name = r.take(name_size) catch continue;
+                            if (note_type != elf.NT_GNU_BUILD_ID) continue;
+                            if (!mem.eql(u8, name, "GNU\x00")) continue;
+                            const desc = r.take(desc_size) catch continue;
+                            mod.build_id = desc;
+                        },
+                        elf.PT_GNU_EH_FRAME => {
+                            const segment_ptr: [*]const u8 = @ptrFromInt(info.addr + phdr.p_vaddr);
+                            mod.gnu_eh_frame = segment_ptr[0..phdr.p_memsz];
+                        },
+                        else => {},
+                    }
                 }
-            } else return;
 
-            for (info.phdr[0..info.phnum]) |phdr| {
-                switch (phdr.p_type) {
-                    elf.PT_NOTE => {
-                        // Look for .note.gnu.build-id
-                        const segment_ptr: [*]const u8 = @ptrFromInt(info.addr + phdr.p_vaddr);
-                        var r: std.Io.Reader = .fixed(segment_ptr[0..phdr.p_memsz]);
-                        const name_size = r.takeInt(u32, native_endian) catch continue;
-                        const desc_size = r.takeInt(u32, native_endian) catch continue;
-                        const note_type = r.takeInt(u32, native_endian) catch continue;
-                        const name = r.take(name_size) catch continue;
-                        if (note_type != elf.NT_GNU_BUILD_ID) continue;
-                        if (!mem.eql(u8, name, "GNU\x00")) continue;
-                        const desc = r.take(desc_size) catch continue;
-                        context.module.build_id = desc;
-                    },
-                    elf.PT_GNU_EH_FRAME => {
-                        const segment_ptr: [*]const u8 = @ptrFromInt(info.addr + phdr.p_vaddr);
-                        context.module.gnu_eh_frame = segment_ptr[0..phdr.p_memsz];
-                    },
-                    else => {},
+                // Now that `mod` is populated, create the ranges
+                for (info.phdr[0..info.phnum]) |phdr| {
+                    if (phdr.p_type != elf.PT_LOAD) continue;
+                    try context.ranges.append(context.gpa, .{
+                        // Overflowing addition handles VSDOs having p_vaddr = 0xffffffffff700000
+                        .start = info.addr +% phdr.p_vaddr,
+                        .len = phdr.p_memsz,
+                        .mod = mod,
+                    });
                 }
             }
+        };
+        cache.ranges.clearRetainingCapacity();
+        var ctx: DlIterContext = .{
+            .ranges = &cache.ranges,
+            .gpa = gpa,
+        };
+        try std.posix.dl_iterate_phdr(&ctx, error{OutOfMemory}, DlIterContext.callback);
+    }
 
-            // Stop the iteration
-            return error.Found;
-        }
-    };
-    var ctx: DlIterContext = .{
-        .address = address,
-        .module = undefined,
-    };
-    std.posix.dl_iterate_phdr(&ctx, error{Found}, DlIterContext.callback) catch |err| switch (err) {
-        error.Found => return ctx.module,
-    };
+    if (lookupInCache(cache, address)) |m| return m;
     return error.MissingDebugInfo;
 }
+fn lookupInCache(cache: *LookupCache, address: usize) ?ElfModule {
+    cache.rwlock.lockShared();
+    defer cache.rwlock.unlockShared();
+    for (cache.ranges.items) |*range| {
+        if (address >= range.start and address < range.start + range.len) {
+            return range.mod;
+        }
+    }
+    return null;
+}
 fn loadElf(module: *const ElfModule, gpa: Allocator, di: *DebugInfo) Error!void {
     std.debug.assert(di.loaded_elf == null);
     std.debug.assert(!di.scanned_dwarf);
@@ -199,11 +228,23 @@ pub fn getSymbolAtAddress(module: *const ElfModule, gpa: Allocator, di: *DebugIn
     };
 }
 fn prepareUnwindLookup(unwind: *Dwarf.Unwind, gpa: Allocator) Error!void {
-    unwind.prepareLookup(gpa, @sizeOf(usize), native_endian) catch |err| switch (err) {
+    unwind.prepare(gpa, @sizeOf(usize), native_endian, true) catch |err| switch (err) {
         error.ReadFailed => unreachable, // it's all fixed buffers
-        error.InvalidDebugInfo, error.MissingDebugInfo, error.OutOfMemory => |e| return e,
-        error.EndOfStream, error.Overflow, error.StreamTooLong => return error.InvalidDebugInfo,
-        error.UnsupportedAddrSize, error.UnsupportedDwarfVersion => return error.UnsupportedDebugInfo,
+        error.InvalidDebugInfo,
+        error.MissingDebugInfo,
+        error.OutOfMemory,
+        => |e| return e,
+        error.EndOfStream,
+        error.Overflow,
+        error.StreamTooLong,
+        error.InvalidOperand,
+        error.InvalidOpcode,
+        error.InvalidOperation,
+        => return error.InvalidDebugInfo,
+        error.UnsupportedAddrSize,
+        error.UnsupportedDwarfVersion,
+        error.UnimplementedUserOpcode,
+        => return error.UnsupportedDebugInfo,
     };
 }
 fn loadUnwindInfo(module: *const ElfModule, gpa: Allocator, di: *DebugInfo) Error!void {
@@ -240,12 +281,18 @@ fn loadUnwindInfo(module: *const ElfModule, gpa: Allocator, di: *DebugInfo) Erro
     };
     errdefer for (unwinds) |*u| u.deinit(gpa);
     for (unwinds) |*u| try prepareUnwindLookup(u, gpa);
+
+    const unwind_cache = try gpa.create(UnwindContext.Cache);
+    errdefer gpa.destroy(unwind_cache);
+    unwind_cache.init();
+
     switch (unwinds.len) {
         0 => unreachable,
         1 => di.unwind = .{ unwinds[0], null },
         2 => di.unwind = .{ unwinds[0], unwinds[1] },
         else => unreachable,
     }
+    di.unwind_cache = unwind_cache;
 }
 pub fn unwindFrame(module: *const ElfModule, gpa: Allocator, di: *DebugInfo, context: *UnwindContext) Error!usize {
     const unwinds: *const [2]?Dwarf.Unwind = u: {
@@ -257,7 +304,7 @@ pub fn unwindFrame(module: *const ElfModule, gpa: Allocator, di: *DebugInfo, con
     };
     for (unwinds) |*opt_unwind| {
         const unwind = &(opt_unwind.* orelse break);
-        return context.unwindFrame(gpa, unwind, module.load_offset, null) catch |err| switch (err) {
+        return context.unwindFrame(di.unwind_cache, gpa, unwind, module.load_offset, null) catch |err| switch (err) {
             error.MissingDebugInfo => continue, // try the next one
             else => |e| return e,
         };
lib/std/debug/Dwarf.zig
@@ -27,7 +27,6 @@ const Reader = std.Io.Reader;
 const Dwarf = @This();
 
 pub const expression = @import("Dwarf/expression.zig");
-pub const call_frame = @import("Dwarf/call_frame.zig");
 pub const Unwind = @import("Dwarf/Unwind.zig");
 
 /// Useful to temporarily enable while working on this file.
lib/std/debug/SelfInfo.zig
@@ -207,6 +207,36 @@ pub const DwarfUnwindContext = struct {
     vm: Dwarf.Unwind.VirtualMachine,
     stack_machine: Dwarf.expression.StackMachine(.{ .call_frame_context = true }),
 
+    pub const Cache = struct {
+        /// TODO: to allow `DwarfUnwindContext` to work on freestanding, we currently just don't use
+        /// this mutex there. That's a bad solution, but a better one depends on the standard
+        /// library's general support for "bring your own OS" being improved.
+        mutex: switch (builtin.os.tag) {
+            else => std.Thread.Mutex,
+            .freestanding, .other => struct {
+                fn lock(_: @This()) void {}
+                fn unlock(_: @This()) void {}
+            },
+        },
+        buf: [num_slots]Slot,
+        const num_slots = 2048;
+        const Slot = struct {
+            const max_regs = 32;
+            pc: usize,
+            cie: *const Dwarf.Unwind.CommonInformationEntry,
+            cfa_rule: Dwarf.Unwind.VirtualMachine.CfaRule,
+            rules_regs: [max_regs]u16,
+            rules: [max_regs]Dwarf.Unwind.VirtualMachine.RegisterRule,
+            num_rules: u8,
+        };
+        /// This is a function rather than a declaration to avoid lowering a very large struct value
+        /// into the binary when most of it is `undefined`.
+        pub fn init(c: *Cache) void {
+            c.mutex = .{};
+            for (&c.buf) |*slot| slot.pc = 0;
+        }
+    };
+
     pub fn init(cpu_context: *const CpuContext) DwarfUnwindContext {
         comptime assert(supports_unwinding);
 
@@ -243,126 +273,30 @@ pub const DwarfUnwindContext = struct {
         return ptr.*;
     }
 
-    /// The default rule is typically equivalent to `.undefined`, but ABIs may define it differently.
-    fn defaultRuleBehavior(register: u8) enum { undefined, same_value } {
-        if (builtin.cpu.arch.isAARCH64() and register >= 19 and register <= 28) {
-            // The default rule for callee-saved registers on AArch64 acts like the `.same_value` rule
-            return .same_value;
-        }
-        return .undefined;
-    }
-
-    /// Resolves the register rule and places the result into `out` (see regBytes). Returns `true`
-    /// iff the rule was undefined. This is *not* the same as `col.rule == .undefined`, because the
-    /// default rule may be undefined.
-    pub fn resolveRegisterRule(
-        context: *DwarfUnwindContext,
-        gpa: Allocator,
-        col: Dwarf.Unwind.VirtualMachine.Column,
-        expression_context: std.debug.Dwarf.expression.Context,
-        out: []u8,
-    ) !bool {
-        switch (col.rule) {
-            .default => {
-                const register = col.register orelse return error.InvalidRegister;
-                switch (defaultRuleBehavior(register)) {
-                    .undefined => {
-                        @memset(out, undefined);
-                        return true;
-                    },
-                    .same_value => {
-                        const src = try context.cpu_context.dwarfRegisterBytes(register);
-                        if (src.len != out.len) return error.RegisterSizeMismatch;
-                        @memcpy(out, src);
-                        return false;
-                    },
-                }
-            },
-            .undefined => {
-                @memset(out, undefined);
-                return true;
-            },
-            .same_value => {
-                // TODO: This copy could be eliminated if callers always copy the state then call this function to update it
-                const register = col.register orelse return error.InvalidRegister;
-                const src = try context.cpu_context.dwarfRegisterBytes(register);
-                if (src.len != out.len) return error.RegisterSizeMismatch;
-                @memcpy(out, src);
-                return false;
-            },
-            .offset => |offset| {
-                const cfa = context.cfa orelse return error.InvalidCFA;
-                const addr = try applyOffset(cfa, offset);
-                const ptr: *const usize = @ptrFromInt(addr);
-                mem.writeInt(usize, out[0..@sizeOf(usize)], ptr.*, native_endian);
-                return false;
-            },
-            .val_offset => |offset| {
-                const cfa = context.cfa orelse return error.InvalidCFA;
-                mem.writeInt(usize, out[0..@sizeOf(usize)], try applyOffset(cfa, offset), native_endian);
-                return false;
-            },
-            .register => |register| {
-                const src = try context.cpu_context.dwarfRegisterBytes(register);
-                if (src.len != out.len) return error.RegisterSizeMismatch;
-                @memcpy(out, src);
-                return false;
-            },
-            .expression => |expression| {
-                context.stack_machine.reset();
-                const value = try context.stack_machine.run(
-                    expression,
-                    gpa,
-                    expression_context,
-                    context.cfa.?,
-                ) orelse return error.NoExpressionValue;
-                const addr = switch (value) {
-                    .generic => |addr| addr,
-                    else => return error.InvalidExpressionValue,
-                };
-                const ptr: *usize = @ptrFromInt(addr);
-                mem.writeInt(usize, out[0..@sizeOf(usize)], ptr.*, native_endian);
-                return false;
-            },
-            .val_expression => |expression| {
-                context.stack_machine.reset();
-                const value = try context.stack_machine.run(
-                    expression,
-                    gpa,
-                    expression_context,
-                    context.cfa.?,
-                ) orelse return error.NoExpressionValue;
-                const val_raw = switch (value) {
-                    .generic => |raw| raw,
-                    else => return error.InvalidExpressionValue,
-                };
-                mem.writeInt(usize, out[0..@sizeOf(usize)], val_raw, native_endian);
-                return false;
-            },
-            .architectural => return error.UnimplementedRegisterRule,
-        }
-    }
-
     /// Unwind a stack frame using DWARF unwinding info, updating the register context.
     ///
     /// If `.eh_frame_hdr` is available and complete, it will be used to binary search for the FDE.
     /// Otherwise, a linear scan of `.eh_frame` and `.debug_frame` is done to find the FDE. The latter
     /// may require lazily loading the data in those sections.
     ///
-    /// `explicit_fde_offset` is for cases where the FDE offset is known, such as when __unwind_info
+    /// `explicit_fde_offset` is for cases where the FDE offset is known, such as when using macOS'
+    /// `__unwind_info` section.
     pub fn unwindFrame(
         context: *DwarfUnwindContext,
+        cache: *Cache,
         gpa: Allocator,
         unwind: *const Dwarf.Unwind,
         load_offset: usize,
         explicit_fde_offset: ?usize,
     ) Error!usize {
-        return unwindFrameInner(context, gpa, unwind, load_offset, explicit_fde_offset) catch |err| switch (err) {
-            error.InvalidDebugInfo, error.MissingDebugInfo, error.OutOfMemory => |e| return e,
+        return unwindFrameInner(context, cache, gpa, unwind, load_offset, explicit_fde_offset) catch |err| switch (err) {
+            error.InvalidDebugInfo,
+            error.MissingDebugInfo,
+            error.UnsupportedDebugInfo,
+            error.OutOfMemory,
+            => |e| return e,
 
-            error.UnimplementedRegisterRule,
             error.UnsupportedAddrSize,
-            error.UnsupportedDwarfVersion,
             error.UnimplementedUserOpcode,
             error.UnimplementedExpressionCall,
             error.UnimplementedOpcode,
@@ -394,12 +328,12 @@ pub const DwarfUnwindContext = struct {
             error.InvalidExpressionValue,
             error.NoExpressionValue,
             error.RegisterSizeMismatch,
-            error.InvalidCFA,
             => return error.InvalidDebugInfo,
         };
     }
     fn unwindFrameInner(
         context: *DwarfUnwindContext,
+        cache: *Cache,
         gpa: Allocator,
         unwind: *const Dwarf.Unwind,
         load_offset: usize,
@@ -411,57 +345,85 @@ pub const DwarfUnwindContext = struct {
 
         const pc_vaddr = context.pc - load_offset;
 
-        const fde_offset = explicit_fde_offset orelse try unwind.lookupPc(
-            pc_vaddr,
-            @sizeOf(usize),
-            native_endian,
-        ) orelse return error.MissingDebugInfo;
-        const format, const cie, const fde = try unwind.getFde(fde_offset, @sizeOf(usize), native_endian);
+        const cache_slot: Cache.Slot = slot: {
+            const slot_idx = std.hash.int(pc_vaddr) % Cache.num_slots;
 
-        // Check if the FDE *actually* includes the pc (`lookupPc` can return false positives).
-        if (pc_vaddr < fde.pc_begin or pc_vaddr >= fde.pc_begin + fde.pc_range) {
-            return error.MissingDebugInfo;
-        }
+            {
+                cache.mutex.lock();
+                defer cache.mutex.unlock();
+                if (cache.buf[slot_idx].pc == pc_vaddr) break :slot cache.buf[slot_idx];
+            }
+
+            const fde_offset = explicit_fde_offset orelse try unwind.lookupPc(
+                pc_vaddr,
+                @sizeOf(usize),
+                native_endian,
+            ) orelse return error.MissingDebugInfo;
+            const cie, const fde = try unwind.getFde(fde_offset, native_endian);
 
-        // Do not set `compile_unit` because the spec states that CFIs
-        // may not reference other debug sections anyway.
-        var expression_context: Dwarf.expression.Context = .{
-            .format = format,
-            .cpu_context = &context.cpu_context,
-            .cfa = context.cfa,
+            // Check if the FDE *actually* includes the pc (`lookupPc` can return false positives).
+            if (pc_vaddr < fde.pc_begin or pc_vaddr >= fde.pc_begin + fde.pc_range) {
+                return error.MissingDebugInfo;
+            }
+
+            context.vm.reset();
+
+            const row = try context.vm.runTo(gpa, pc_vaddr, cie, &fde, @sizeOf(usize), native_endian);
+
+            if (row.columns.len > Cache.Slot.max_regs) return error.UnsupportedDebugInfo;
+
+            var slot: Cache.Slot = .{
+                .pc = pc_vaddr,
+                .cie = cie,
+                .cfa_rule = row.cfa,
+                .rules_regs = undefined,
+                .rules = undefined,
+                .num_rules = 0,
+            };
+            for (context.vm.rowColumns(&row)) |col| {
+                const i = slot.num_rules;
+                slot.rules_regs[i] = col.register;
+                slot.rules[i] = col.rule;
+                slot.num_rules += 1;
+            }
+
+            {
+                cache.mutex.lock();
+                defer cache.mutex.unlock();
+                cache.buf[slot_idx] = slot;
+            }
+
+            break :slot slot;
         };
 
-        context.vm.reset();
+        const format = cache_slot.cie.format;
+        const return_address_register = cache_slot.cie.return_address_register;
 
-        const row = try context.vm.runTo(gpa, pc_vaddr, cie, fde, @sizeOf(usize), native_endian);
-        context.cfa = switch (row.cfa.rule) {
-            .val_offset => |offset| blk: {
-                const register = row.cfa.register orelse return error.InvalidCFARule;
-                const value = (try regNative(&context.cpu_context, register)).*;
-                break :blk try applyOffset(value, offset);
+        context.cfa = switch (cache_slot.cfa_rule) {
+            .none => return error.InvalidCFARule,
+            .reg_off => |ro| cfa: {
+                const ptr = try regNative(&context.cpu_context, ro.register);
+                break :cfa try applyOffset(ptr.*, ro.offset);
             },
-            .expression => |expr| blk: {
+            .expression => |expr| cfa: {
                 context.stack_machine.reset();
-                const value = try context.stack_machine.run(
-                    expr,
-                    gpa,
-                    expression_context,
-                    context.cfa,
-                );
-
-                if (value) |v| {
-                    if (v != .generic) return error.InvalidExpressionValue;
-                    break :blk v.generic;
-                } else return error.NoExpressionValue;
+                const value = try context.stack_machine.run(expr, gpa, .{
+                    .format = format,
+                    .cpu_context = &context.cpu_context,
+                }, context.cfa) orelse return error.NoExpressionValue;
+                switch (value) {
+                    .generic => |g| break :cfa g,
+                    else => return error.InvalidExpressionValue,
+                }
             },
-            else => return error.InvalidCFARule,
         };
 
-        expression_context.cfa = context.cfa;
-
-        // If the rule for the return address register is 'undefined', that indicates there is no
-        // return address, i.e. this is the end of the stack.
-        var explicit_has_return_address: ?bool = null;
+        // If unspecified, we'll use the default rule for the return address register, which is
+        // typically equivalent to `.undefined` (meaning there is no return address), but may be
+        // overridden by ABIs.
+        var has_return_address: bool = builtin.cpu.arch.isAARCH64() and
+            return_address_register >= 19 and
+            return_address_register <= 28;
 
         // Create a copy of the CPU context, to which we will apply the new rules.
         var new_cpu_context = context.cpu_context;
@@ -469,25 +431,78 @@ pub const DwarfUnwindContext = struct {
         // On all implemented architectures, the CFA is defined as being the previous frame's SP
         (try regNative(&new_cpu_context, sp_reg_num)).* = context.cfa.?;
 
-        for (context.vm.rowColumns(row)) |column| {
-            if (column.register) |register| {
-                const dest = try new_cpu_context.dwarfRegisterBytes(register);
-                const rule_undef = try context.resolveRegisterRule(gpa, column, expression_context, dest);
-                if (register == cie.return_address_register) {
-                    explicit_has_return_address = !rule_undef;
-                }
+        const rules_len = cache_slot.num_rules;
+        for (cache_slot.rules_regs[0..rules_len], cache_slot.rules[0..rules_len]) |register, rule| {
+            const new_val: union(enum) {
+                same,
+                undefined,
+                val: usize,
+                bytes: []const u8,
+            } = switch (rule) {
+                .default => val: {
+                    // The default rule is typically equivalent to `.undefined`, but ABIs may override it.
+                    if (builtin.cpu.arch.isAARCH64() and register >= 19 and register <= 28) {
+                        break :val .same;
+                    }
+                    break :val .undefined;
+                },
+                .undefined => .undefined,
+                .same_value => .same,
+                .offset => |offset| val: {
+                    const ptr: *const usize = @ptrFromInt(try applyOffset(context.cfa.?, offset));
+                    break :val .{ .val = ptr.* };
+                },
+                .val_offset => |offset| .{ .val = try applyOffset(context.cfa.?, offset) },
+                .register => |r| .{ .bytes = try context.cpu_context.dwarfRegisterBytes(r) },
+                .expression => |expr| val: {
+                    context.stack_machine.reset();
+                    const value = try context.stack_machine.run(expr, gpa, .{
+                        .format = format,
+                        .cpu_context = &context.cpu_context,
+                    }, context.cfa.?) orelse return error.NoExpressionValue;
+                    const ptr: *const usize = switch (value) {
+                        .generic => |addr| @ptrFromInt(addr),
+                        else => return error.InvalidExpressionValue,
+                    };
+                    break :val .{ .val = ptr.* };
+                },
+                .val_expression => |expr| val: {
+                    context.stack_machine.reset();
+                    const value = try context.stack_machine.run(expr, gpa, .{
+                        .format = format,
+                        .cpu_context = &context.cpu_context,
+                    }, context.cfa.?) orelse return error.NoExpressionValue;
+                    switch (value) {
+                        .generic => |val| break :val .{ .val = val },
+                        else => return error.InvalidExpressionValue,
+                    }
+                },
+            };
+            switch (new_val) {
+                .same => {},
+                .undefined => {
+                    const dest = try new_cpu_context.dwarfRegisterBytes(@intCast(register));
+                    @memset(dest, undefined);
+                },
+                .val => |val| {
+                    const dest = try new_cpu_context.dwarfRegisterBytes(@intCast(register));
+                    if (dest.len != @sizeOf(usize)) return error.RegisterSizeMismatch;
+                    const dest_ptr: *align(1) usize = @ptrCast(dest);
+                    dest_ptr.* = val;
+                },
+                .bytes => |src| {
+                    const dest = try new_cpu_context.dwarfRegisterBytes(@intCast(register));
+                    if (dest.len != src.len) return error.RegisterSizeMismatch;
+                    @memcpy(dest, src);
+                },
+            }
+            if (register == return_address_register) {
+                has_return_address = new_val != .undefined;
             }
         }
 
-        // If the return address register did not have an explicitly specified rules then it uses
-        // the default rule, which is usually equivalent to '.undefined', i.e. end-of-stack.
-        const has_return_address = explicit_has_return_address orelse switch (defaultRuleBehavior(cie.return_address_register)) {
-            .undefined => false,
-            .same_value => return error.InvalidDebugInfo, // this doesn't make sense, we would get stuck in an infinite loop
-        };
-
         const return_address: usize = if (has_return_address) pc: {
-            const raw_ptr = try regNative(&new_cpu_context, cie.return_address_register);
+            const raw_ptr = try regNative(&new_cpu_context, return_address_register);
             break :pc stripInstructionPtrAuthCode(raw_ptr.*);
         } else 0;
 
@@ -501,7 +516,7 @@ pub const DwarfUnwindContext = struct {
         // "return address" we have is the instruction which triggered the signal (if the signal
         // handler returned, the instruction would be re-run). Compensate for this by incrementing
         // the address in that case.
-        const adjusted_ret_addr = if (cie.is_signal_frame) return_address +| 1 else return_address;
+        const adjusted_ret_addr = if (cache_slot.cie.is_signal_frame) return_address +| 1 else return_address;
 
         // We also want to do that same subtraction here to get the PC for the next frame's FDE.
         // This is because if the callee was noreturn, then the function call might be the caller's
lib/std/debug.zig
@@ -572,9 +572,12 @@ pub fn captureCurrentStackTrace(options: StackUnwindOptions, addr_buf: []usize)
     defer it.deinit();
     if (!it.stratOk(options.allow_unsafe_unwind)) return empty_trace;
     var total_frames: usize = 0;
-    var frame_idx: usize = 0;
+    var index: usize = 0;
     var wait_for = options.first_address;
-    while (true) switch (it.next()) {
+    // Ideally, we would iterate the whole stack so that the `index` in the returned trace was
+    // indicative of how many frames were skipped. However, this has a significant runtime cost
+    // in some cases, so at least for now, we don't do that.
+    while (index < addr_buf.len) switch (it.next()) {
         .switch_to_fp => if (!it.stratOk(options.allow_unsafe_unwind)) break,
         .end => break,
         .frame => |ret_addr| {
@@ -588,13 +591,13 @@ pub fn captureCurrentStackTrace(options: StackUnwindOptions, addr_buf: []usize)
                 if (ret_addr != target) continue;
                 wait_for = null;
             }
-            if (frame_idx < addr_buf.len) addr_buf[frame_idx] = ret_addr;
-            frame_idx += 1;
+            addr_buf[index] = ret_addr;
+            index += 1;
         },
     };
     return .{
-        .index = frame_idx,
-        .instruction_addresses = addr_buf[0..@min(frame_idx, addr_buf.len)],
+        .index = index,
+        .instruction_addresses = addr_buf[0..index],
     };
 }
 /// Write the current stack trace to `writer`, annotated with source locations.