Commit bdab4f53c1

Jakub Konka <kubkon@jakubkonka.com>
2020-10-07 19:36:50
Move trie structure into its own file-module
Signed-off-by: Jakub Konka <kubkon@jakubkonka.com>
1 parent b13b36a
Changed files (2)
src
src/link/MachO/Trie.zig
@@ -0,0 +1,259 @@
+/// Represents export trie used in MachO executables and dynamic libraries.
+/// The purpose of an export trie is to encode as compactly as possible all
+/// export symbols for the loader `dyld`.
+/// The export trie encodes offset and other information using ULEB128
+/// encoding, and is part of the __LINKEDIT segment.
+///
+/// Description from loader.h:
+///
+/// The symbols exported by a dylib are encoded in a trie. This is a compact
+/// representation that factors out common prefixes. It also reduces LINKEDIT pages
+/// in RAM because it encodes all information (name, address, flags) in one small,
+/// contiguous range. The export area is a stream of nodes. The first node sequentially
+/// is the start node for the trie.
+///
+/// Nodes for a symbol start with a uleb128 that is the length of the exported symbol
+/// information for the string so far. If there is no exported symbol, the node starts
+/// with a zero byte. If there is exported info, it follows the length.
+///
+/// First is a uleb128 containing flags. Normally, it is followed by a uleb128 encoded
+/// offset which is location of the content named by the symbol from the mach_header
+/// for the image. If the flags is EXPORT_SYMBOL_FLAGS_REEXPORT, then following the flags
+/// is a uleb128 encoded library ordinal, then a zero terminated UTF8 string. If the string
+/// is zero length, then the symbol is re-export from the specified dylib with the same name.
+/// If the flags is EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER, then following the flags is two
+/// uleb128s: the stub offset and the resolver offset. The stub is used by non-lazy pointers.
+/// The resolver is used by lazy pointers and must be called to get the actual address to use.
+///
+/// After the optional exported symbol information is a byte of how many edges (0-255) that
+/// this node has leaving it, followed by each edge. Each edge is a zero terminated UTF8 of
+/// the addition chars in the symbol, followed by a uleb128 offset for the node that edge points to.
+const Trie = @This();
+
+const std = @import("std");
+const mem = std.mem;
+const leb = std.debug.leb;
+const log = std.log.scoped(.link);
+const Allocator = mem.Allocator;
+
+pub const Symbol = struct {
+    name: []const u8,
+    offset: u64,
+    export_flags: u64,
+};
+
+const Edge = struct {
+    from: *Node,
+    to: *Node,
+    label: []const u8,
+
+    fn deinit(self: *Edge, alloc: *Allocator) void {
+        self.to.deinit(alloc);
+        alloc.destroy(self.to);
+        self.from = undefined;
+        self.to = undefined;
+    }
+};
+
+const Node = struct {
+    export_flags: ?u64 = null,
+    offset: ?u64 = null,
+    edges: std.ArrayListUnmanaged(Edge) = .{},
+
+    fn deinit(self: *Node, alloc: *Allocator) void {
+        for (self.edges.items) |*edge| {
+            edge.deinit(alloc);
+        }
+        self.edges.deinit(alloc);
+    }
+
+    fn put(self: *Node, alloc: *Allocator, fromEdge: ?*Edge, prefix: usize, label: []const u8) !*Node {
+        // Traverse all edges.
+        for (self.edges.items) |*edge| {
+            const match = mem.indexOfDiff(u8, edge.label, label) orelse return self; // Got a full match, don't do anything.
+            if (match - prefix > 0) {
+                // If we match, we advance further down the trie.
+                return edge.to.put(alloc, edge, match, label);
+            }
+        }
+
+        if (fromEdge) |from| {
+            if (mem.eql(u8, from.label, label[0..prefix])) {
+                if (prefix == label.len) return self;
+            } else {
+                // Fixup nodes. We need to insert an intermediate node between
+                // from.to and self.
+                // Is: A -> B
+                // Should be: A -> C -> B
+                const mid = try alloc.create(Node);
+                mid.* = .{};
+                const to_label = from.label;
+                from.to = mid;
+                from.label = label[0..prefix];
+
+                try mid.edges.append(alloc, .{
+                    .from = mid,
+                    .to = self,
+                    .label = to_label,
+                });
+
+                if (prefix == label.len) return self; // We're done.
+
+                const new_node = try alloc.create(Node);
+                new_node.* = .{};
+
+                try mid.edges.append(alloc, .{
+                    .from = mid,
+                    .to = new_node,
+                    .label = label,
+                });
+
+                return new_node;
+            }
+        }
+
+        // Add a new edge.
+        const node = try alloc.create(Node);
+        node.* = .{};
+
+        try self.edges.append(alloc, .{
+            .from = self,
+            .to = node,
+            .label = label,
+        });
+
+        return node;
+    }
+
+    fn writeULEB128Mem(self: Node, alloc: *Allocator, buffer: *std.ArrayListUnmanaged(u8)) Trie.WriteError!void {
+        if (self.offset) |offset| {
+            // Terminal node info: encode export flags and vmaddr offset of this symbol.
+            var info_buf_len: usize = 0;
+            var info_buf: [@sizeOf(u64) * 2]u8 = undefined;
+            info_buf_len += try leb.writeULEB128Mem(info_buf[0..], self.export_flags.?);
+            info_buf_len += try leb.writeULEB128Mem(info_buf[info_buf_len..], offset);
+
+            // Encode the size of the terminal node info.
+            var size_buf: [@sizeOf(u64)]u8 = undefined;
+            const size_buf_len = try leb.writeULEB128Mem(size_buf[0..], info_buf_len);
+
+            // Now, write them to the output buffer.
+            try buffer.ensureCapacity(alloc, buffer.items.len + info_buf_len + size_buf_len);
+            buffer.appendSliceAssumeCapacity(size_buf[0..size_buf_len]);
+            buffer.appendSliceAssumeCapacity(info_buf[0..info_buf_len]);
+        } else {
+            // Non-terminal node is delimited by 0 byte.
+            try buffer.append(alloc, 0);
+        }
+        // Write number of edges (max legal number of edges is 256).
+        try buffer.append(alloc, @intCast(u8, self.edges.items.len));
+
+        var node_offset_info: [@sizeOf(u8)]u64 = undefined;
+        for (self.edges.items) |edge, i| {
+            // Write edges labels leaving out space in-between to later populate
+            // with offsets to each node.
+            try buffer.ensureCapacity(alloc, buffer.items.len + edge.label.len + 1 + @sizeOf(u64)); // +1 to account for null-byte
+            buffer.appendSliceAssumeCapacity(edge.label);
+            buffer.appendAssumeCapacity(0);
+            node_offset_info[i] = buffer.items.len;
+            const padding = [_]u8{0} ** @sizeOf(u64);
+            buffer.appendSliceAssumeCapacity(padding[0..]);
+        }
+
+        for (self.edges.items) |edge, i| {
+            const offset = buffer.items.len;
+            try edge.to.writeULEB128Mem(alloc, buffer);
+            // We can now populate the offset to the node pointed by this edge.
+            // TODO this is not the approach taken by `ld64` which does several iterations
+            // to close the gap between the space encoding the offset to the node pointed
+            // by this edge. However, it seems that as long as we are contiguous, the padding
+            // introduced here should not influence the performance of `dyld`. I'm leaving
+            // this TODO here though as a reminder to re-investigate in the future and especially
+            // when we start working on dylibs in case `dyld` refuses to cooperate and/or the
+            // performance is noticably sufferring.
+            // Link to official impl: https://opensource.apple.com/source/ld64/ld64-123.2.1/src/abstraction/MachOTrie.hpp
+            var offset_buf: [@sizeOf(u64)]u8 = undefined;
+            const offset_buf_len = try leb.writeULEB128Mem(offset_buf[0..], offset);
+            mem.copy(u8, buffer.items[node_offset_info[i]..], offset_buf[0..offset_buf_len]);
+        }
+    }
+};
+
+root: Node,
+
+/// Insert a symbol into the trie, updating the prefixes in the process.
+/// This operation may change the layout of the trie by splicing edges in
+/// certain circumstances.
+pub fn put(self: *Trie, alloc: *Allocator, symbol: Symbol) !void {
+    const node = try self.root.put(alloc, null, 0, symbol.name);
+    node.offset = symbol.offset;
+    node.export_flags = symbol.export_flags;
+}
+
+pub const WriteError = error{ OutOfMemory, NoSpaceLeft };
+
+/// Write the trie to a buffer ULEB128 encoded.
+pub fn writeULEB128Mem(self: Trie, alloc: *Allocator, buffer: *std.ArrayListUnmanaged(u8)) WriteError!void {
+    return self.root.writeULEB128Mem(alloc, buffer);
+}
+
+pub fn deinit(self: *Trie, alloc: *Allocator) void {
+    self.root.deinit(alloc);
+}
+
+test "Trie basic" {
+    const testing = @import("std").testing;
+    var gpa = testing.allocator;
+
+    var trie: Trie = .{
+        .root = .{},
+    };
+    defer trie.deinit(gpa);
+
+    // root
+    testing.expect(trie.root.edges.items.len == 0);
+
+    // root --- _st ---> node
+    try trie.put(gpa, .{
+        .name = "_st",
+        .offset = 0,
+        .export_flags = 0,
+    });
+    testing.expect(trie.root.edges.items.len == 1);
+    testing.expect(mem.eql(u8, trie.root.edges.items[0].label, "_st"));
+
+    {
+        // root --- _st ---> node --- _start ---> node
+        try trie.put(gpa, .{
+            .name = "_start",
+            .offset = 0,
+            .export_flags = 0,
+        });
+        testing.expect(trie.root.edges.items.len == 1);
+
+        const nextEdge = &trie.root.edges.items[0];
+        testing.expect(mem.eql(u8, nextEdge.label, "_st"));
+        testing.expect(nextEdge.to.edges.items.len == 1);
+        testing.expect(mem.eql(u8, nextEdge.to.edges.items[0].label, "_start"));
+    }
+    {
+        // root --- _ ---> node --- _st ---> node --- _start ---> node
+        //                  |
+        //                  |   --- _main ---> node
+        try trie.put(gpa, .{
+            .name = "_main",
+            .offset = 0,
+            .export_flags = 0,
+        });
+        testing.expect(trie.root.edges.items.len == 1);
+
+        const nextEdge = &trie.root.edges.items[0];
+        testing.expect(mem.eql(u8, nextEdge.label, "_"));
+        testing.expect(nextEdge.to.edges.items.len == 2);
+        testing.expect(mem.eql(u8, nextEdge.to.edges.items[0].label, "_st"));
+        testing.expect(mem.eql(u8, nextEdge.to.edges.items[1].label, "_main"));
+
+        const nextNextEdge = &nextEdge.to.edges.items[0];
+        testing.expect(mem.eql(u8, nextNextEdge.to.edges.items[0].label, "_start"));
+    }
+}
src/link/MachO.zig
@@ -20,6 +20,8 @@ const File = link.File;
 const Cache = @import("../Cache.zig");
 const target_util = @import("../target.zig");
 
+const Trie = @import("MachO/Trie.zig");
+
 pub const base_tag: File.Tag = File.Tag.macho;
 
 const LoadCommand = union(enum) {
@@ -64,156 +66,6 @@ const LoadCommand = union(enum) {
     }
 };
 
-/// Represents export trie used in MachO executables and dynamic libraries.
-/// The purpose of an export trie is to encode as compactly as possible all
-/// export symbols for the loader `dyld`.
-/// The export trie encodes offset and other information using ULEB128
-/// encoding, and is part of the __LINKEDIT segment.
-const Trie = struct {
-    const Node = struct {
-        const Edge = struct {
-            from: *Node,
-            to: *Node,
-            label: []const u8,
-
-            pub fn deinit(self: *Edge, alloc: *Allocator) void {
-                self.to.deinit(alloc);
-                alloc.destroy(self.to);
-                self.from = undefined;
-                self.to = undefined;
-            }
-        };
-
-        export_flags: ?u64 = null,
-        offset: ?u64 = null,
-        edges: std.ArrayListUnmanaged(Edge) = .{},
-
-        pub fn deinit(self: *Node, alloc: *Allocator) void {
-            for (self.edges.items) |*edge| {
-                edge.deinit(alloc);
-            }
-            self.edges.deinit(alloc);
-        }
-
-        pub fn put(self: *Node, alloc: *Allocator, fromEdge: ?*Edge, prefix: usize, label: []const u8) !*Node {
-            // Traverse all edges.
-            for (self.edges.items) |*edge| {
-                const match = mem.indexOfDiff(u8, edge.label, label) orelse return self; // Got a full match, don't do anything.
-                if (match - prefix > 0) {
-                    // If we match, we advance further down the trie.
-                    return edge.to.put(alloc, edge, match, label);
-                }
-            }
-
-            if (fromEdge) |from| {
-                if (mem.eql(u8, from.label, label[0..prefix])) {
-                    if (prefix == label.len) return self;
-                } else {
-                    // Fixup nodes. We need to insert an intermediate node between
-                    // from.to and self.
-                    const mid = try alloc.create(Node);
-                    mid.* = .{};
-                    const to_label = from.label;
-                    from.to = mid;
-                    from.label = label[0..prefix];
-
-                    try mid.edges.append(alloc, .{
-                        .from = mid,
-                        .to = self,
-                        .label = to_label,
-                    });
-
-                    if (prefix == label.len) return self; // We're done.
-
-                    const new_node = try alloc.create(Node);
-                    new_node.* = .{};
-
-                    try mid.edges.append(alloc, .{
-                        .from = mid,
-                        .to = new_node,
-                        .label = label,
-                    });
-
-                    return new_node;
-                }
-            }
-
-            // Add a new edge.
-            const node = try alloc.create(Node);
-            node.* = .{};
-
-            try self.edges.append(alloc, .{
-                .from = self,
-                .to = node,
-                .label = label,
-            });
-
-            return node;
-        }
-
-        pub fn write(self: Node, alloc: *Allocator, buffer: *std.ArrayListUnmanaged(u8)) Trie.WriteError!void {
-            if (self.offset) |off| {
-                // Terminal node info: encode export flags and vmaddr offset of this symbol.
-                var info_buf_len: usize = 0;
-                var info_buf: [@sizeOf(u64) * 2]u8 = undefined;
-                info_buf_len += try std.debug.leb.writeULEB128Mem(info_buf[0..], self.export_flags.?);
-                info_buf_len += try std.debug.leb.writeULEB128Mem(info_buf[info_buf_len..], off);
-
-                // Encode the size of the terminal node info.
-                var size_buf: [@sizeOf(u64)]u8 = undefined;
-                const size_buf_len = try std.debug.leb.writeULEB128Mem(size_buf[0..], info_buf_len);
-
-                // Now, write them to the output buffer.
-                try buffer.ensureCapacity(alloc, buffer.items.len + info_buf_len + size_buf_len);
-                buffer.appendSliceAssumeCapacity(size_buf[0..size_buf_len]);
-                buffer.appendSliceAssumeCapacity(info_buf[0..info_buf_len]);
-            } else {
-                // Non-terminal node is delimited by 0 byte.
-                try buffer.append(alloc, 0);
-            }
-            // Write number of edges (max legal number of edges is 256).
-            try buffer.append(alloc, @intCast(u8, self.edges.items.len));
-
-            var node_offset_info: [@sizeOf(u8)]u64 = undefined;
-            for (self.edges.items) |edge, i| {
-                // Write edges labels leaving out space in-between to later populate
-                // with offsets to each node.
-                try buffer.ensureCapacity(alloc, buffer.items.len + edge.label.len + 1 + @sizeOf(u64)); // +1 to account for null-byte
-                buffer.appendSliceAssumeCapacity(edge.label);
-                buffer.appendAssumeCapacity(0);
-                node_offset_info[i] = buffer.items.len;
-                const padding = [_]u8{0} ** @sizeOf(u64);
-                buffer.appendSliceAssumeCapacity(padding[0..]);
-            }
-
-            for (self.edges.items) |edge, i| {
-                const offset = buffer.items.len;
-                try edge.to.write(alloc, buffer);
-                // We can now populate the offset to the node pointed by this edge.
-                var offset_buf: [@sizeOf(u64)]u8 = undefined;
-                const offset_buf_len = try std.debug.leb.writeULEB128Mem(offset_buf[0..], offset);
-                mem.copy(u8, buffer.items[node_offset_info[i]..], offset_buf[0..offset_buf_len]);
-            }
-        }
-    };
-
-    root: Node,
-
-    pub fn put(self: *Trie, alloc: *Allocator, word: []const u8) !*Node {
-        return self.root.put(alloc, null, 0, word);
-    }
-
-    pub const WriteError = error{ OutOfMemory, NoSpaceLeft };
-
-    pub fn write(self: Trie, alloc: *Allocator, buffer: *std.ArrayListUnmanaged(u8)) WriteError!void {
-        return self.root.write(alloc, buffer);
-    }
-
-    pub fn deinit(self: *Trie, alloc: *Allocator) void {
-        self.root.deinit(alloc);
-    }
-};
-
 base: File,
 
 /// Table of all load commands
@@ -1541,19 +1393,21 @@ fn writeExportTrie(self: *MachO) !void {
     defer trie.deinit(self.base.allocator);
 
     const text_segment = self.load_commands.items[self.text_segment_cmd_index.?].Segment;
-
     for (self.global_symbols.items) |symbol| {
         // TODO figure out if we should put all global symbols into the export trie
         const name = self.getString(symbol.n_strx);
-        const node = try trie.put(self.base.allocator, name);
-        node.offset = symbol.n_value - text_segment.vmaddr;
-        node.export_flags = 0; // TODO workout creation of export flags
+        assert(symbol.n_value >= text_segment.vmaddr);
+        try trie.put(self.base.allocator, .{
+            .name = name,
+            .offset = symbol.n_value - text_segment.vmaddr,
+            .export_flags = 0, // TODO workout creation of export flags
+        });
     }
 
     var buffer: std.ArrayListUnmanaged(u8) = .{};
     defer buffer.deinit(self.base.allocator);
 
-    try trie.write(self.base.allocator, &buffer);
+    try trie.writeULEB128Mem(self.base.allocator, &buffer);
 
     const dyld_info = &self.load_commands.items[self.dyld_info_cmd_index.?].DyldInfo;
     try self.base.file.?.pwriteAll(buffer.items, dyld_info.export_off);
@@ -1688,48 +1542,3 @@ fn satMul(a: anytype, b: anytype) @TypeOf(a, b) {
     const T = @TypeOf(a, b);
     return std.math.mul(T, a, b) catch std.math.maxInt(T);
 }
-
-test "Trie basic" {
-    const testing = @import("std").testing;
-    var gpa = testing.allocator;
-
-    var trie: Trie = .{
-        .root = .{},
-    };
-    defer trie.deinit(gpa);
-
-    // root
-    testing.expect(trie.root.edges.items.len == 0);
-
-    // root --- _st ---> node
-    try trie.put(gpa, "_st");
-    testing.expect(trie.root.edges.items.len == 1);
-    testing.expect(mem.eql(u8, trie.root.edges.items[0].label, "_st"));
-
-    {
-        // root --- _st ---> node --- _start ---> node
-        try trie.put(gpa, "_start");
-        testing.expect(trie.root.edges.items.len == 1);
-
-        const nextEdge = &trie.root.edges.items[0];
-        testing.expect(mem.eql(u8, nextEdge.label, "_st"));
-        testing.expect(nextEdge.to.edges.items.len == 1);
-        testing.expect(mem.eql(u8, nextEdge.to.edges.items[0].label, "_start"));
-    }
-    {
-        // root --- _ ---> node --- _st ---> node --- _start ---> node
-        //                  |
-        //                  |   --- _main ---> node
-        try trie.put(gpa, "_main");
-        testing.expect(trie.root.edges.items.len == 1);
-
-        const nextEdge = &trie.root.edges.items[0];
-        testing.expect(mem.eql(u8, nextEdge.label, "_"));
-        testing.expect(nextEdge.to.edges.items.len == 2);
-        testing.expect(mem.eql(u8, nextEdge.to.edges.items[0].label, "_st"));
-        testing.expect(mem.eql(u8, nextEdge.to.edges.items[1].label, "_main"));
-
-        const nextNextEdge = &nextEdge.to.edges.items[0];
-        testing.expect(mem.eql(u8, nextNextEdge.to.edges.items[0].label, "_start"));
-    }
-}