Commit 2f4bbd6c63

Andrew Kelley <andrew@ziglang.org>
2024-03-22 03:53:24
std.Build.Cache: use an array hash map for files
Rather than an ArrayList. Provides deduplication.
1 parent ebec733
Changed files (4)
lib/std/Build/Cache.zig
@@ -55,7 +55,15 @@ pub fn prefixes(cache: *const Cache) []const Directory {
 
 const PrefixedPath = struct {
     prefix: u8,
-    sub_path: []u8,
+    sub_path: []const u8,
+
+    fn eql(a: PrefixedPath, b: PrefixedPath) bool {
+        return a.prefix == b.prefix and std.mem.eql(u8, a.sub_path, b.sub_path);
+    }
+
+    fn hash(pp: PrefixedPath) u32 {
+        return @truncate(std.hash.Wyhash.hash(pp.prefix, pp.sub_path));
+    }
 };
 
 fn findPrefix(cache: *const Cache, file_path: []const u8) !PrefixedPath {
@@ -132,7 +140,7 @@ pub const hasher_init: Hasher = Hasher.init(&[_]u8{
 });
 
 pub const File = struct {
-    prefixed_path: ?PrefixedPath,
+    prefixed_path: PrefixedPath,
     max_file_size: ?usize,
     stat: Stat,
     bin_digest: BinDigest,
@@ -145,16 +153,18 @@ pub const File = struct {
     };
 
     pub fn deinit(self: *File, gpa: Allocator) void {
-        if (self.prefixed_path) |pp| {
-            gpa.free(pp.sub_path);
-            self.prefixed_path = null;
-        }
+        gpa.free(self.prefixed_path.sub_path);
         if (self.contents) |contents| {
             gpa.free(contents);
             self.contents = null;
         }
         self.* = undefined;
     }
+
+    pub fn updateMaxSize(file: *File, new_max_size: ?usize) void {
+        const new = new_max_size orelse return;
+        file.max_file_size = if (file.max_file_size) |old| @max(old, new) else new;
+    }
 };
 
 pub const HashHelper = struct {
@@ -296,7 +306,7 @@ pub const Manifest = struct {
     // order to obtain a problematic timestamp for the next call. Calls after that
     // will then use the same timestamp, to avoid unnecessary filesystem writes.
     want_refresh_timestamp: bool = true,
-    files: std.ArrayListUnmanaged(File) = .{},
+    files: Files = .{},
     hex_digest: HexDigest,
     /// Populated when hit() returns an error because of one
     /// of the files listed in the manifest.
@@ -305,6 +315,34 @@ pub const Manifest = struct {
     /// what time the file system thinks it is, according to its own granularity.
     recent_problematic_timestamp: i128 = 0,
 
+    pub const Files = std.ArrayHashMapUnmanaged(File, void, FilesContext, false);
+
+    pub const FilesContext = struct {
+        pub fn hash(fc: FilesContext, file: File) u32 {
+            _ = fc;
+            return file.prefixed_path.hash();
+        }
+
+        pub fn eql(fc: FilesContext, a: File, b: File, b_index: usize) bool {
+            _ = fc;
+            _ = b_index;
+            return a.prefixed_path.eql(b.prefixed_path);
+        }
+    };
+
+    const FilesAdapter = struct {
+        pub fn eql(context: @This(), a: PrefixedPath, b: File, b_index: usize) bool {
+            _ = context;
+            _ = b_index;
+            return a.eql(b.prefixed_path);
+        }
+
+        pub fn hash(context: @This(), key: PrefixedPath) u32 {
+            _ = context;
+            return key.hash();
+        }
+    };
+
     /// Add a file as a dependency of process being cached. When `hit` is
     /// called, the file's contents will be checked to ensure that it matches
     /// the contents from previous times.
@@ -317,7 +355,7 @@ pub const Manifest = struct {
     /// to access the contents of the file after calling `hit()` like so:
     ///
     /// ```
-    /// var file_contents = cache_hash.files.items[file_index].contents.?;
+    /// var file_contents = cache_hash.files.keys()[file_index].contents.?;
     /// ```
     pub fn addFile(self: *Manifest, file_path: []const u8, max_file_size: ?usize) !usize {
         assert(self.manifest_file == null);
@@ -327,7 +365,12 @@ pub const Manifest = struct {
         const prefixed_path = try self.cache.findPrefix(file_path);
         errdefer gpa.free(prefixed_path.sub_path);
 
-        self.files.addOneAssumeCapacity().* = .{
+        const gop = self.files.getOrPutAssumeCapacityAdapted(prefixed_path, FilesAdapter{});
+        if (gop.found_existing) {
+            gop.key_ptr.updateMaxSize(max_file_size);
+            return gop.index;
+        }
+        gop.key_ptr.* = .{
             .prefixed_path = prefixed_path,
             .contents = null,
             .max_file_size = max_file_size,
@@ -338,7 +381,7 @@ pub const Manifest = struct {
         self.hash.add(prefixed_path.prefix);
         self.hash.addBytes(prefixed_path.sub_path);
 
-        return self.files.items.len - 1;
+        return gop.index;
     }
 
     pub fn addOptionalFile(self: *Manifest, optional_file_path: ?[]const u8) !void {
@@ -418,7 +461,7 @@ pub const Manifest = struct {
 
         self.want_refresh_timestamp = true;
 
-        const input_file_count = self.files.items.len;
+        const input_file_count = self.files.entries.len;
         while (true) : (self.unhit(bin_digest, input_file_count)) {
             const file_contents = try self.manifest_file.?.reader().readAllAlloc(gpa, manifest_file_size_max);
             defer gpa.free(file_contents);
@@ -430,7 +473,7 @@ pub const Manifest = struct {
                 if (try self.upgradeToExclusiveLock()) continue;
                 self.manifest_dirty = true;
                 while (idx < input_file_count) : (idx += 1) {
-                    const ch_file = &self.files.items[idx];
+                    const ch_file = &self.files.keys()[idx];
                     self.populateFileHash(ch_file) catch |err| {
                         self.failed_file_index = idx;
                         return err;
@@ -441,18 +484,6 @@ pub const Manifest = struct {
             while (line_iter.next()) |line| {
                 defer idx += 1;
 
-                const cache_hash_file = if (idx < input_file_count) &self.files.items[idx] else blk: {
-                    const new = try self.files.addOne(gpa);
-                    new.* = .{
-                        .prefixed_path = null,
-                        .contents = null,
-                        .max_file_size = null,
-                        .stat = undefined,
-                        .bin_digest = undefined,
-                    };
-                    break :blk new;
-                };
-
                 var iter = mem.tokenizeScalar(u8, line, ' ');
                 const size = iter.next() orelse return error.InvalidFormat;
                 const inode = iter.next() orelse return error.InvalidFormat;
@@ -461,30 +492,61 @@ pub const Manifest = struct {
                 const prefix_str = iter.next() orelse return error.InvalidFormat;
                 const file_path = iter.rest();
 
-                cache_hash_file.stat.size = fmt.parseInt(u64, size, 10) catch return error.InvalidFormat;
-                cache_hash_file.stat.inode = fmt.parseInt(fs.File.INode, inode, 10) catch return error.InvalidFormat;
-                cache_hash_file.stat.mtime = fmt.parseInt(i64, mtime_nsec_str, 10) catch return error.InvalidFormat;
-                _ = fmt.hexToBytes(&cache_hash_file.bin_digest, digest_str) catch return error.InvalidFormat;
+                const stat_size = fmt.parseInt(u64, size, 10) catch return error.InvalidFormat;
+                const stat_inode = fmt.parseInt(fs.File.INode, inode, 10) catch return error.InvalidFormat;
+                const stat_mtime = fmt.parseInt(i64, mtime_nsec_str, 10) catch return error.InvalidFormat;
+                const file_bin_digest = b: {
+                    if (digest_str.len != hex_digest_len) return error.InvalidFormat;
+                    var bd: BinDigest = undefined;
+                    _ = fmt.hexToBytes(&bd, digest_str) catch return error.InvalidFormat;
+                    break :b bd;
+                };
+
                 const prefix = fmt.parseInt(u8, prefix_str, 10) catch return error.InvalidFormat;
                 if (prefix >= self.cache.prefixes_len) return error.InvalidFormat;
 
-                if (file_path.len == 0) {
-                    return error.InvalidFormat;
-                }
-                if (cache_hash_file.prefixed_path) |pp| {
-                    if (pp.prefix != prefix or !mem.eql(u8, file_path, pp.sub_path)) {
-                        return error.InvalidFormat;
-                    }
-                }
+                if (file_path.len == 0) return error.InvalidFormat;
 
-                if (cache_hash_file.prefixed_path == null) {
-                    cache_hash_file.prefixed_path = .{
+                const cache_hash_file = f: {
+                    const prefixed_path: PrefixedPath = .{
                         .prefix = prefix,
-                        .sub_path = try gpa.dupe(u8, file_path),
+                        .sub_path = file_path, // expires with file_contents
                     };
-                }
+                    if (idx < input_file_count) {
+                        const file = &self.files.keys()[idx];
+                        if (!file.prefixed_path.eql(prefixed_path))
+                            return error.InvalidFormat;
+
+                        file.stat = .{
+                            .size = stat_size,
+                            .inode = stat_inode,
+                            .mtime = stat_mtime,
+                        };
+                        file.bin_digest = file_bin_digest;
+                        break :f file;
+                    }
+                    const gop = try self.files.getOrPutAdapted(gpa, prefixed_path, FilesAdapter{});
+                    errdefer assert(self.files.popOrNull() != null);
+                    if (!gop.found_existing) {
+                        gop.key_ptr.* = .{
+                            .prefixed_path = .{
+                                .prefix = prefix,
+                                .sub_path = try gpa.dupe(u8, file_path),
+                            },
+                            .contents = null,
+                            .max_file_size = null,
+                            .stat = .{
+                                .size = stat_size,
+                                .inode = stat_inode,
+                                .mtime = stat_mtime,
+                            },
+                            .bin_digest = file_bin_digest,
+                        };
+                    }
+                    break :f gop.key_ptr;
+                };
 
-                const pp = cache_hash_file.prefixed_path.?;
+                const pp = cache_hash_file.prefixed_path;
                 const dir = self.cache.prefixes()[pp.prefix].handle;
                 const this_file = dir.openFile(pp.sub_path, .{ .mode = .read_only }) catch |err| switch (err) {
                     error.FileNotFound => {
@@ -548,7 +610,7 @@ pub const Manifest = struct {
                 if (try self.upgradeToExclusiveLock()) continue;
                 self.manifest_dirty = true;
                 while (idx < input_file_count) : (idx += 1) {
-                    const ch_file = &self.files.items[idx];
+                    const ch_file = &self.files.keys()[idx];
                     self.populateFileHash(ch_file) catch |err| {
                         self.failed_file_index = idx;
                         return err;
@@ -571,12 +633,12 @@ pub const Manifest = struct {
         self.hash.hasher.update(&bin_digest);
 
         // Remove files not in the initial hash.
-        for (self.files.items[input_file_count..]) |*file| {
+        for (self.files.keys()[input_file_count..]) |*file| {
             file.deinit(self.cache.gpa);
         }
         self.files.shrinkRetainingCapacity(input_file_count);
 
-        for (self.files.items) |file| {
+        for (self.files.keys()) |file| {
             self.hash.hasher.update(&file.bin_digest);
         }
     }
@@ -616,7 +678,7 @@ pub const Manifest = struct {
     }
 
     fn populateFileHash(self: *Manifest, ch_file: *File) !void {
-        const pp = ch_file.prefixed_path.?;
+        const pp = ch_file.prefixed_path;
         const dir = self.cache.prefixes()[pp.prefix].handle;
         const file = try dir.openFile(pp.sub_path, .{});
         defer file.close();
@@ -682,7 +744,7 @@ pub const Manifest = struct {
             .bin_digest = undefined,
             .contents = null,
         };
-        errdefer self.files.shrinkRetainingCapacity(self.files.items.len - 1);
+        errdefer self.files.shrinkRetainingCapacity(self.files.entries.len - 1);
 
         try self.populateFileHash(new_ch_file);
 
@@ -690,9 +752,11 @@ pub const Manifest = struct {
     }
 
     /// Add a file as a dependency of process being cached, after the initial hash has been
-    /// calculated. This is useful for processes that don't know the all the files that
-    /// are depended on ahead of time. For example, a source file that can import other files
-    /// will need to be recompiled if the imported file is changed.
+    /// calculated.
+    ///
+    /// This is useful for processes that don't know all the files that are
+    /// depended on ahead of time. For example, a source file that can import
+    /// other files will need to be recompiled if the imported file is changed.
     pub fn addFilePost(self: *Manifest, file_path: []const u8) !void {
         assert(self.manifest_file != null);
 
@@ -700,17 +764,26 @@ pub const Manifest = struct {
         const prefixed_path = try self.cache.findPrefix(file_path);
         errdefer gpa.free(prefixed_path.sub_path);
 
-        const new_ch_file = try self.files.addOne(gpa);
-        new_ch_file.* = .{
+        const gop = try self.files.getOrPutAdapted(gpa, prefixed_path, FilesAdapter{});
+        errdefer assert(self.files.popOrNull() != null);
+
+        if (gop.found_existing) {
+            gpa.free(prefixed_path.sub_path);
+            return;
+        }
+
+        gop.key_ptr.* = .{
             .prefixed_path = prefixed_path,
             .max_file_size = null,
             .stat = undefined,
             .bin_digest = undefined,
             .contents = null,
         };
-        errdefer self.files.shrinkRetainingCapacity(self.files.items.len - 1);
 
-        try self.populateFileHash(new_ch_file);
+        self.files.lockPointers();
+        defer self.files.unlockPointers();
+
+        try self.populateFileHash(gop.key_ptr);
     }
 
     /// Like `addFilePost` but when the file contents have already been loaded from disk.
@@ -724,13 +797,20 @@ pub const Manifest = struct {
         assert(self.manifest_file != null);
         const gpa = self.cache.gpa;
 
-        const ch_file = try self.files.addOne(gpa);
-        errdefer self.files.shrinkRetainingCapacity(self.files.items.len - 1);
-
         const prefixed_path = try self.cache.findPrefixResolved(resolved_path);
         errdefer gpa.free(prefixed_path.sub_path);
 
-        ch_file.* = .{
+        const gop = try self.files.getOrPutAdapted(gpa, prefixed_path, FilesAdapter{});
+        errdefer assert(self.files.popOrNull() != null);
+
+        if (gop.found_existing) {
+            gpa.free(prefixed_path.sub_path);
+            return;
+        }
+
+        const new_file = gop.key_ptr;
+
+        new_file.* = .{
             .prefixed_path = prefixed_path,
             .max_file_size = null,
             .stat = stat,
@@ -738,19 +818,19 @@ pub const Manifest = struct {
             .contents = null,
         };
 
-        if (self.isProblematicTimestamp(ch_file.stat.mtime)) {
+        if (self.isProblematicTimestamp(new_file.stat.mtime)) {
             // The actual file has an unreliable timestamp, force it to be hashed
-            ch_file.stat.mtime = 0;
-            ch_file.stat.inode = 0;
+            new_file.stat.mtime = 0;
+            new_file.stat.inode = 0;
         }
 
         {
             var hasher = hasher_init;
             hasher.update(bytes);
-            hasher.final(&ch_file.bin_digest);
+            hasher.final(&new_file.bin_digest);
         }
 
-        self.hash.hasher.update(&ch_file.bin_digest);
+        self.hash.hasher.update(&new_file.bin_digest);
     }
 
     pub fn addDepFilePost(self: *Manifest, dir: fs.Dir, dep_file_basename: []const u8) !void {
@@ -816,14 +896,14 @@ pub const Manifest = struct {
 
             const writer = contents.writer();
             try writer.writeAll(manifest_header ++ "\n");
-            for (self.files.items) |file| {
+            for (self.files.keys()) |file| {
                 try writer.print("{d} {d} {d} {} {d} {s}\n", .{
                     file.stat.size,
                     file.stat.inode,
                     file.stat.mtime,
                     fmt.fmtSliceHexLower(&file.bin_digest),
-                    file.prefixed_path.?.prefix,
-                    file.prefixed_path.?.sub_path,
+                    file.prefixed_path.prefix,
+                    file.prefixed_path.sub_path,
                 });
             }
 
@@ -892,7 +972,7 @@ pub const Manifest = struct {
 
             file.close();
         }
-        for (self.files.items) |*file| {
+        for (self.files.keys()) |*file| {
             file.deinit(self.cache.gpa);
         }
         self.files.deinit(self.cache.gpa);
@@ -1061,7 +1141,7 @@ test "check that changing a file makes cache fail" {
             // There should be nothing in the cache
             try testing.expectEqual(false, try ch.hit());
 
-            try testing.expect(mem.eql(u8, original_temp_file_contents, ch.files.items[temp_file_idx].contents.?));
+            try testing.expect(mem.eql(u8, original_temp_file_contents, ch.files.keys()[temp_file_idx].contents.?));
 
             digest1 = ch.final();
 
@@ -1081,7 +1161,7 @@ test "check that changing a file makes cache fail" {
             try testing.expectEqual(false, try ch.hit());
 
             // The cache system does not keep the contents of re-hashed input files.
-            try testing.expect(ch.files.items[temp_file_idx].contents == null);
+            try testing.expect(ch.files.keys()[temp_file_idx].contents == null);
 
             digest2 = ch.final();
 
lib/std/Build/Step.zig
@@ -544,7 +544,7 @@ pub fn cacheHit(s: *Step, man: *std.Build.Cache.Manifest) !bool {
 
 fn failWithCacheError(s: *Step, man: *const std.Build.Cache.Manifest, err: anyerror) anyerror {
     const i = man.failed_file_index orelse return err;
-    const pp = man.files.items[i].prefixed_path orelse return err;
+    const pp = man.files.keys()[i].prefixed_path;
     const prefix = man.cache.prefixes()[pp.prefix].path orelse "";
     return s.fail("{s}: {s}/{s}", .{ @errorName(err), prefix, pp.sub_path });
 }
src/Compilation.zig
@@ -1999,7 +1999,7 @@ pub fn update(comp: *Compilation, main_progress_node: *std.Progress.Node) !void
 
             const is_hit = man.hit() catch |err| {
                 const i = man.failed_file_index orelse return err;
-                const pp = man.files.items[i].prefixed_path orelse return err;
+                const pp = man.files.keys()[i].prefixed_path;
                 const prefix = man.cache.prefixes()[pp.prefix];
                 return comp.setMiscFailure(
                     .check_whole_cache,
@@ -4147,7 +4147,7 @@ pub fn cImport(comp: *Compilation, c_src: []const u8, owner_mod: *Package.Module
     const prev_hash_state = man.hash.peekBin();
     const actual_hit = hit: {
         _ = try man.hit();
-        if (man.files.items.len == 0) {
+        if (man.files.entries.len == 0) {
             man.unhit(prev_hash_state, 0);
             break :hit false;
         }
src/glibc.zig
@@ -713,7 +713,7 @@ pub fn buildSharedObjects(comp: *Compilation, prog_node: *std.Progress.Node) !vo
     };
     defer o_directory.handle.close();
 
-    const abilists_contents = man.files.items[abilists_index].contents.?;
+    const abilists_contents = man.files.keys()[abilists_index].contents.?;
     const metadata = try loadMetaData(comp.gpa, abilists_contents);
     defer metadata.destroy(comp.gpa);