Commit aaf54ce6a7

Andrew Kelley <andrew@ziglang.org>
2024-11-01 08:34:10
link.File.Wasm.Archive: simplify
Don't use the reader interface. Avoid unnecessary heap allocations. At first I started working on incorporating the Archive fields into the Wasm data model; however, I realized a better strategy: simply omit Archive data from the serialized linker state. Those files can be trivially reparsed on the next compiler process start. If they haven't changed, great. Otherwise, if they have, the prelink phase needs to be restarted anyway.
1 parent d30e287
Changed files (2)
src
src/link/Wasm/Archive.zig
@@ -2,25 +2,25 @@
 /// This is stored as a single slice of bytes, as the header-names
 /// point to the character index of a file name, rather than the index
 /// in the list.
-long_file_names: []const u8,
+/// Points into `file_contents`.
+long_file_names: RelativeSlice,
 
 /// Parsed table of contents.
 /// Each symbol name points to a list of all definition
 /// sites within the current static archive.
 toc: Toc,
 
+/// Key points into `LazyArchive` `file_contents`.
+/// Value is allocated with gpa.
 const Toc = std.StringArrayHashMapUnmanaged(std.ArrayListUnmanaged(u32));
 
-// Archive files start with the ARMAG identifying string.  Then follows a
-// `struct Header', and as many bytes of member file data as its `size'
-// member indicates, for each member file.
-/// String that begins an archive file.
-const ARMAG: *const [SARMAG:0]u8 = "!<arch>\n";
-/// Size of that string.
-const SARMAG: u4 = 8;
+const ARMAG = std.elf.ARMAG;
+const ARFMAG = std.elf.ARFMAG;
 
-/// String in fmag at the end of each header.
-const ARFMAG: *const [2:0]u8 = "`\n";
+const RelativeSlice = struct { // An (offset, length) pair describing a byte range inside another buffer, instead of holding a pointer to it.
+    off: u32, // Byte offset at which the range starts within the buffer it refers to.
+    len: u32, // Number of bytes in the range, counted from `off`.
+};
 
 const Header = extern struct {
     /// Member file name, sometimes / terminated.
@@ -70,130 +70,106 @@ const Header = extern struct {
 
 /// Releases the memory owned by `archive` and leaves it undefined.
 /// Only the table of contents is freed with `gpa`; `long_file_names` is an
 /// offset/length pair and owns no memory of its own.
 pub fn deinit(archive: *Archive, gpa: Allocator) void {
     deinitToc(gpa, &archive.toc);
-    gpa.free(archive.long_file_names);
     archive.* = undefined;
 }
 
 /// Frees every per-symbol position list stored as a value in `toc`, then the
 /// map itself. Keys are not freed here: they are borrowed slices, not
 /// allocations made with `gpa`.
 fn deinitToc(gpa: Allocator, toc: *Toc) void {
-    for (toc.keys()) |key| gpa.free(key);
     for (toc.values()) |*value| value.deinit(gpa);
     toc.deinit(gpa);
 }
 
 pub fn parse(gpa: Allocator, file_contents: []const u8) !Archive {
-    var fbs = std.io.fixedBufferStream(file_contents);
-    const reader = fbs.reader();
+    var pos: usize = 0;
 
-    const magic = try reader.readBytesNoEof(SARMAG);
-    if (!mem.eql(u8, &magic, ARMAG)) return error.BadArchiveMagic;
+    if (!mem.eql(u8, file_contents[0..ARMAG.len], ARMAG)) return error.BadArchiveMagic;
+    pos += ARMAG.len;
 
-    const header = try reader.readStruct(Header);
+    const header = mem.bytesAsValue(Header, file_contents[pos..][0..@sizeOf(Header)]);
     if (!mem.eql(u8, &header.fmag, ARFMAG)) return error.BadHeaderDelimiter;
+    pos += @sizeOf(Header);
 
-    var toc = try parseTableOfContents(gpa, header, reader);
-    errdefer deinitToc(gpa, &toc);
-
-    const long_file_names = try parseNameTable(gpa, reader);
-    errdefer gpa.free(long_file_names);
-
-    return .{
-        .toc = toc,
-        .long_file_names = long_file_names,
-    };
-}
-
-fn parseName(archive: *const Archive, header: Header) ![]const u8 {
-    const name_or_index = try header.nameOrIndex();
-    switch (name_or_index) {
-        .name => |name| return name,
-        .index => |index| {
-            const name = mem.sliceTo(archive.long_file_names[index..], 0x0a);
-            return mem.trimRight(u8, name, "/");
-        },
-    }
-}
-
-fn parseTableOfContents(gpa: Allocator, header: Header, reader: anytype) !Toc {
-    // size field can have extra spaces padded in front as well as the end,
-    // so we trim those first before parsing the ASCII value.
+    // The size field can have extra spaces padded in front as well as
+    // the end, so we trim those first before parsing the ASCII value.
     const size_trimmed = mem.trim(u8, &header.size, " ");
     const sym_tab_size = try std.fmt.parseInt(u32, size_trimmed, 10);
 
-    const num_symbols = try reader.readInt(u32, .big);
-    const symbol_positions = try gpa.alloc(u32, num_symbols);
-    defer gpa.free(symbol_positions);
-    for (symbol_positions) |*index| {
-        index.* = try reader.readInt(u32, .big);
-    }
+    const num_symbols = mem.readInt(u32, file_contents[pos..][0..4], .big);
+    pos += 4;
 
-    const sym_tab = try gpa.alloc(u8, sym_tab_size - 4 - (4 * num_symbols));
-    defer gpa.free(sym_tab);
+    const symbol_positions_size = @sizeOf(u32) * num_symbols;
+    const symbol_positions_be = mem.bytesAsSlice(u32, file_contents[pos..][0..symbol_positions_size]);
+    pos += symbol_positions_size;
 
-    reader.readNoEof(sym_tab) catch return error.IncompleteSymbolTable;
+    const sym_tab = file_contents[pos..][0 .. sym_tab_size - 4 - symbol_positions_size];
+    pos += sym_tab.len;
 
     var toc: Toc = .empty;
     errdefer deinitToc(gpa, &toc);
 
-    var i: usize = 0;
-    var pos: usize = 0;
-    while (i < num_symbols) : (i += 1) {
-        const string = mem.sliceTo(sym_tab[pos..], 0);
-        pos += string.len + 1;
-        if (string.len == 0) continue;
+    var sym_tab_pos: usize = 0;
+    for (0..num_symbols) |i| {
+        const name = mem.sliceTo(sym_tab[sym_tab_pos..], 0);
+        sym_tab_pos += name.len + 1;
+        if (name.len == 0) continue;
 
-        const name = try gpa.dupe(u8, string);
-        errdefer gpa.free(name);
         const gop = try toc.getOrPut(gpa, name);
-        if (gop.found_existing) {
-            gpa.free(name);
-        } else {
-            gop.value_ptr.* = .{};
-        }
-        try gop.value_ptr.append(gpa, symbol_positions[i]);
+        if (!gop.found_existing) gop.value_ptr.* = .empty;
+        try gop.value_ptr.append(gpa, switch (native_endian) {
+            .big => symbol_positions_be[i],
+            .little => @byteSwap(symbol_positions_be[i]),
+        });
     }
 
-    return toc;
-}
+    const long_file_names: RelativeSlice = s: {
+        const sub_header = mem.bytesAsValue(Header, file_contents[pos..][0..@sizeOf(Header)]);
+        pos += @sizeOf(Header);
 
-fn parseNameTable(gpa: Allocator, reader: anytype) ![]const u8 {
-    const header: Header = try reader.readStruct(Header);
-    if (!mem.eql(u8, &header.fmag, ARFMAG)) {
-        return error.InvalidHeaderDelimiter;
-    }
-    if (!mem.eql(u8, header.name[0..2], "//")) {
-        return error.MissingTableName;
-    }
-    const table_size = try header.parsedSize();
-    const long_file_names = try gpa.alloc(u8, table_size);
-    errdefer gpa.free(long_file_names);
-    try reader.readNoEof(long_file_names);
+        if (!mem.eql(u8, &header.fmag, ARFMAG)) return error.BadHeaderDelimiter;
+        if (!mem.eql(u8, sub_header.name[0..2], "//")) return error.MissingTableName;
+        const table_size = try sub_header.parsedSize();
+
+        break :s .{
+            .off = @intCast(pos),
+            .len = table_size,
+        };
+    };
 
-    return long_file_names;
+    return .{
+        .toc = toc,
+        .long_file_names = long_file_names,
+    };
 }
 
 /// From a given file offset, starts reading for a file header.
 /// When found, parses the object file into an `Object` and returns it.
 ///
 /// The member header is read from the first `@sizeOf(Header)` bytes of
 /// `file_contents`, and the object data is the `parsedSize()` bytes that
 /// immediately follow it. Returns `error.BadHeaderDelimiter` when the header
 /// does not end with `ARFMAG`.
 ///
 /// NOTE(review): `archive.long_file_names.off` is computed by `parse`
 /// relative to the start of the buffer it was given, while this function
 /// reads its header at `file_contents[0..]`. Confirm that callers pass a
 /// slice for which both interpretations are valid.
 pub fn parseObject(archive: Archive, wasm: *Wasm, file_contents: []const u8, path: Path) !Object {
-    var fbs = std.io.fixedBufferStream(file_contents);
-    const header = try fbs.reader().readStruct(Header);
+    const header = mem.bytesAsValue(Header, file_contents[0..@sizeOf(Header)]);
+    if (!mem.eql(u8, &header.fmag, ARFMAG)) return error.BadHeaderDelimiter;
 
-    if (!mem.eql(u8, &header.fmag, ARFMAG)) return error.BadArchiveHeaderDelimiter;
+    const name_or_index = try header.nameOrIndex();
+    const object_name = switch (name_or_index) {
+        .name => |name| name,
+        .index => |index| n: {
+            const long_file_names = file_contents[archive.long_file_names.off..][0..archive.long_file_names.len];
+            const name = mem.sliceTo(long_file_names[index..], 0x0a); // The entry runs up to the next LF (0x0a) byte.
+            break :n mem.trimRight(u8, name, "/"); // Drop any trailing '/' from the entry.
+        },
+    };
 
-    const object_name = try archive.parseName(header);
     const object_file_size = try header.parsedSize();
 
     return Object.create(wasm, file_contents[@sizeOf(Header)..][0..object_file_size], path, object_name);
 }
 
+const Archive = @This();
+
+const builtin = @import("builtin");
+const native_endian = builtin.cpu.arch.endian();
+
 const std = @import("std");
-const assert = std.debug.assert;
-const fs = std.fs;
-const log = std.log.scoped(.archive);
 const mem = std.mem;
+const Allocator = std.mem.Allocator;
 const Path = std.Build.Cache.Path;
 
-const Allocator = mem.Allocator;
-const Object = @import("Object.zig");
 const Wasm = @import("../Wasm.zig");
-
-const Archive = @This();
+const Object = @import("Object.zig");
src/link/Wasm.zig
@@ -1,39 +1,40 @@
 const Wasm = @This();
+const build_options = @import("build_options");
 
-const std = @import("std");
+const builtin = @import("builtin");
+const native_endian = builtin.cpu.arch.endian();
 
+const std = @import("std");
+const Allocator = std.mem.Allocator;
+const Cache = std.Build.Cache;
+const Path = Cache.Path;
 const assert = std.debug.assert;
-const build_options = @import("build_options");
-const builtin = @import("builtin");
-const codegen = @import("../codegen.zig");
-const dev = @import("../dev.zig");
 const fs = std.fs;
+const gc_log = std.log.scoped(.gc);
 const leb = std.leb;
-const link = @import("../link.zig");
-const lldMain = @import("../main.zig").lldMain;
 const log = std.log.scoped(.link);
-const gc_log = std.log.scoped(.gc);
 const mem = std.mem;
-const trace = @import("../tracy.zig").trace;
-const wasi_libc = @import("../wasi_libc.zig");
 
 const Air = @import("../Air.zig");
-const Allocator = std.mem.Allocator;
 const Archive = @import("Wasm/Archive.zig");
-const Cache = std.Build.Cache;
-const Path = Cache.Path;
 const CodeGen = @import("../arch/wasm/CodeGen.zig");
 const Compilation = @import("../Compilation.zig");
 const Dwarf = @import("Dwarf.zig");
 const InternPool = @import("../InternPool.zig");
 const Liveness = @import("../Liveness.zig");
 const LlvmObject = @import("../codegen/llvm.zig").Object;
-const Zcu = @import("../Zcu.zig");
 const Object = @import("Wasm/Object.zig");
 const Symbol = @import("Wasm/Symbol.zig");
 const Type = @import("../Type.zig");
 const Value = @import("../Value.zig");
+const Zcu = @import("../Zcu.zig");
 const ZigObject = @import("Wasm/ZigObject.zig");
+const codegen = @import("../codegen.zig");
+const dev = @import("../dev.zig");
+const link = @import("../link.zig");
+const lldMain = @import("../main.zig").lldMain;
+const trace = @import("../tracy.zig").trace;
+const wasi_libc = @import("../wasi_libc.zig");
 
 base: link.File,
 /// Null-terminated strings, indexes have type String and string_table provides
@@ -141,6 +142,9 @@ function_table: std.AutoHashMapUnmanaged(SymbolLoc, u32) = .empty,
 
 /// All archive files that are lazy loaded.
 /// e.g. when an undefined symbol references a symbol from the archive.
+/// None of this data is serialized to disk because it is trivially reloaded
+/// from unchanged archive files on the next start of the compiler process,
+/// or if those files have changed, the prelink phase needs to be restarted.
 lazy_archives: std.ArrayListUnmanaged(LazyArchive) = .empty,
 
 /// A map of global names to their symbol location
@@ -283,12 +287,15 @@ pub const OptionalObjectId = enum(u16) {
     }
 };
 
+/// None of this data is serialized since it can be re-loaded from disk, or if
+/// it has been changed, the data must be discarded.
 const LazyArchive = struct {
     path: Path,
     file_contents: []const u8,
     archive: Archive,
 
     fn deinit(la: *LazyArchive, gpa: Allocator) void {
+        la.archive.deinit(gpa);
         gpa.free(la.path.sub_path);
         gpa.free(la.file_contents);
         la.* = undefined;