Commit 3783b1b23c

mlugg <mlugg@mlugg.co.uk>
2025-04-24 02:17:32
std.Build.Cache: fix several bugs
Aside from adding comments to document the logic in `Cache.Manifest.hit` better, this commit fixes two serious bugs.

The first, spotted by Andrew, is that when upgrading from a shared to an exclusive lock on the manifest file, we do not seek it back to the start. This is a simple fix.

The second is more subtle, and has to do with the computation of file digests. Broadly speaking, the goal of the main loop in `hit` is to iterate the files listed in the manifest file and check whether they've changed, based on stat information and a file hash. While doing this, the `bin_digest` field of `std.Build.Cache.File`, which is initially `undefined`, is populated for all files, either straight from the manifest (if the stat matches) or recomputed from the file on disk. This file digest is then used to update `man.hash.hasher`, which is building the final hash used as, for instance, the output directory name when the compiler emits into the cache directory.

When `hit` returns a cache miss, it is expected that `man.hash.hasher` includes the digests of all "initial files"; that is, those which have already been added with e.g. `addFilePath`, but not those which will later be added with `addFilePost` (even though the manifest file has told us about some such files). Previously, `hit` was using the `unhit` function to do this in a few cases. However, this is incorrect, because `unhit` assumes that all files already have their `bin_digest` field populated; that function is only valid to call *after* `hit` returns. Instead, we need to actually compute the hashes which haven't yet been populated. Even if this logic had been working, there was still a bug here, because we called `unhit` when upgrading from a shared to an exclusive lock, writing the (potentially `undefined`) file digests, but the loop itself then writes the file digests *again*! All in all, the hashing logic here was actually incredibly broken.

I've taken the opportunity to restructure this section of the code into what I think is a more readable format. A new function, `hitWithCurrentLock`, uses the open manifest file to try to find a cache hit. It returns a tagged union which, in the miss case, tells the caller (`hit`) how many files already have their hash populated. This avoids redundant work recomputing the same hash multiple times in situations where the lock needs upgrading. This also eliminates the outer loop from `hit`, which was a little confusing because it iterated no more than twice!

The bugs fixed here could manifest in several different ways depending on how contended file locks were satisfied. Most notably, on a cache miss, the Zig compiler might have written the compilation output to the incorrect directory (because it incorrectly constructed a hash using `undefined` or repeated file digests), resulting in all future hits on this manifest causing `error.FileNotFound`. This is #23110. I have been able to reproduce #23110 on `master`, and have not been able to after this commit, so I am relatively sure this commit resolves that issue.

Resolves: #23110
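For context (not part of the commit message): as the parsing loop in `hitWithCurrentLock` shows, the manifest file consists of a fixed header line (`manifest_header`) followed by one space-separated entry per file, giving the stat size, inode, mtime in nanoseconds, hex digest, prefix index, and sub-path. A hypothetical entry (all values invented for illustration, digest elided) might look like:

```
1824 3452345 1745460000000000000 <hex digest> 1 src/main.zig
```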
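The hasher contract at the heart of the second bug is easiest to see from the caller's side. Below is a minimal sketch of that pattern, assuming a typical `std.Build.Cache.Manifest` workflow; `man`, the discovered input path, and the surrounding control flow are illustrative assumptions, not code from this commit:

```zig
// Sketch of the hit/miss contract described above; setup and error handling
// are elided. `man` is assumed to be a `std.Build.Cache.Manifest` whose
// initial inputs were already added with `addFilePath`.
if (try man.hit()) {
    // Hit: the hasher already includes the digests of the "post" files
    // learned from the manifest on disk, so `final` is complete as-is.
    const digest = man.final();
    _ = digest; // ...reuse the cached output directory named by the digest...
} else {
    // Miss: the hasher so far includes only the initial input files. Inputs
    // discovered while doing the real work are reported via `addFilePost`,
    // which updates the hasher as it goes.
    try man.addFilePost("src/discovered_input.zig"); // hypothetical path
    const digest = man.final();
    _ = digest; // ...emit output into the directory named by the digest...
    try man.writeManifest();
}
```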
1 parent 5668c8b
Changed files (3)
lib/std/Build/Cache.zig
@@ -337,6 +337,7 @@ pub const Manifest = struct {
         manifest_create: fs.File.OpenError,
         manifest_read: fs.File.ReadError,
         manifest_lock: fs.File.LockError,
+        manifest_seek: fs.File.SeekError,
         file_open: FileOp,
         file_stat: FileOp,
         file_read: FileOp,
@@ -488,7 +489,6 @@ pub const Manifest = struct {
     /// option, one may call `toOwnedLock` to obtain a smaller object which can represent
     /// the lock. `deinit` is safe to call whether or not `toOwnedLock` has been called.
     pub fn hit(self: *Manifest) HitError!bool {
-        const gpa = self.cache.gpa;
         assert(self.manifest_file == null);
 
         self.diagnostic = .none;
@@ -501,12 +501,12 @@ pub const Manifest = struct {
 
         self.hex_digest = binToHex(bin_digest);
 
-        self.hash.hasher = hasher_init;
-        self.hash.hasher.update(&bin_digest);
-
         @memcpy(manifest_file_path[0..self.hex_digest.len], &self.hex_digest);
         manifest_file_path[hex_digest_len..][0..ext.len].* = ext.*;
 
+        // We'll try to open the cache with an exclusive lock, but if that would block
+        // and `want_shared_lock` is set, a shared lock might be sufficient, so we'll
+        // open with a shared lock instead.
         while (true) {
             if (self.cache.manifest_dir.createFile(&manifest_file_path, .{
                 .read = true,
@@ -575,26 +575,71 @@ pub const Manifest = struct {
         self.want_refresh_timestamp = true;
 
         const input_file_count = self.files.entries.len;
-        while (true) : (self.unhit(bin_digest, input_file_count)) {
-            const file_contents = self.manifest_file.?.reader().readAllAlloc(gpa, manifest_file_size_max) catch |err| switch (err) {
-                error.OutOfMemory => return error.OutOfMemory,
-                error.StreamTooLong => return error.OutOfMemory,
-                else => |e| {
-                    self.diagnostic = .{ .manifest_read = e };
+
+        // We're going to construct a second hash. Its input will begin with the digest we've
+        // already computed (`bin_digest`), and then it'll have the digests of each input file,
+        // including "post" files (see `addFilePost`). If this is a hit, we learn the set of "post"
+        // files from the manifest on disk. If this is a miss, we'll learn those from future calls
+        // to `addFilePost` etc. As such, the state of `self.hash.hasher` after this function
+        // depends on whether this is a hit or a miss.
+        //
+        // If we return `true` indicating a cache hit, then `self.hash.hasher` must already include
+        // the digests of the "post" files, so the caller can call `final`. Otherwise, on a cache
+        // miss, `self.hash.hasher` will include the digests of all non-"post" files -- that is,
+        // the ones we've already been told about. The rest will be discovered through calls to
+        // `addFilePost` etc, which will update the hasher. After all files are added, the user can
+        // use `final`, and will at some point `writeManifest` the file list to disk.
+
+        self.hash.hasher = hasher_init;
+        self.hash.hasher.update(&bin_digest);
+
+        hit: {
+            const file_digests_populated: usize = digests: {
+                switch (try self.hitWithCurrentLock()) {
+                    .hit => break :hit,
+                    .miss => |m| if (!try self.upgradeToExclusiveLock()) {
+                        break :digests m.file_digests_populated;
+                    },
+                }
+                // We've just had a miss with the shared lock, and upgraded to an exclusive lock. Someone
+                // else might have modified the digest, so we need to check again before deciding to miss.
+                // Before trying again, we must reset `self.hash.hasher` and `self.files`.
+                // This is basically just the first half of `unhit`.
+                self.hash.hasher = hasher_init;
+                self.hash.hasher.update(&bin_digest);
+                while (self.files.count() != input_file_count) {
+                    var file = self.files.pop().?;
+                    file.key.deinit(self.cache.gpa);
+                }
+                // Also, seek the file back to the start.
+                self.manifest_file.?.seekTo(0) catch |err| {
+                    self.diagnostic = .{ .manifest_seek = err };
                     return error.CacheCheckFailed;
-                },
+                };
+
+                switch (try self.hitWithCurrentLock()) {
+                    .hit => break :hit,
+                    .miss => |m| break :digests m.file_digests_populated,
+                }
             };
-            defer gpa.free(file_contents);
-
-            var any_file_changed = false;
-            var line_iter = mem.tokenizeScalar(u8, file_contents, '\n');
-            var idx: usize = 0;
-            if (if (line_iter.next()) |line| !std.mem.eql(u8, line, manifest_header) else true) {
-                if (try self.upgradeToExclusiveLock()) continue;
-                self.manifest_dirty = true;
-                while (idx < input_file_count) : (idx += 1) {
-                    const ch_file = &self.files.keys()[idx];
-                    self.populateFileHash(ch_file) catch |err| {
+
+            // This is a guaranteed cache miss. We're almost ready to return `false`, but there's a
+            // little bookkeeping to do first. The first `file_digests_populated` entries in `files`
+            // have their `bin_digest` populated; there may be some left in `input_file_count` which
+            // we'll need to populate ourselves. Other than that, this is basically `unhit`.
+            self.manifest_dirty = true;
+            self.hash.hasher = hasher_init;
+            self.hash.hasher.update(&bin_digest);
+            while (self.files.count() != input_file_count) {
+                var file = self.files.pop().?;
+                file.key.deinit(self.cache.gpa);
+            }
+            for (self.files.keys(), 0..) |*file, idx| {
+                if (idx < file_digests_populated) {
+                    // `bin_digest` is already populated by `hitWithCurrentLock`, so we can use it directly.
+                    self.hash.hasher.update(&file.bin_digest);
+                } else {
+                    self.populateFileHash(file) catch |err| {
                         self.diagnostic = .{ .file_hash = .{
                             .file_index = idx,
                             .err = err,
@@ -602,172 +647,195 @@ pub const Manifest = struct {
                         return error.CacheCheckFailed;
                     };
                 }
-                return false;
             }
-            while (line_iter.next()) |line| {
-                defer idx += 1;
-
-                var iter = mem.tokenizeScalar(u8, line, ' ');
-                const size = iter.next() orelse return error.InvalidFormat;
-                const inode = iter.next() orelse return error.InvalidFormat;
-                const mtime_nsec_str = iter.next() orelse return error.InvalidFormat;
-                const digest_str = iter.next() orelse return error.InvalidFormat;
-                const prefix_str = iter.next() orelse return error.InvalidFormat;
-                const file_path = iter.rest();
-
-                const stat_size = fmt.parseInt(u64, size, 10) catch return error.InvalidFormat;
-                const stat_inode = fmt.parseInt(fs.File.INode, inode, 10) catch return error.InvalidFormat;
-                const stat_mtime = fmt.parseInt(i64, mtime_nsec_str, 10) catch return error.InvalidFormat;
-                const file_bin_digest = b: {
-                    if (digest_str.len != hex_digest_len) return error.InvalidFormat;
-                    var bd: BinDigest = undefined;
-                    _ = fmt.hexToBytes(&bd, digest_str) catch return error.InvalidFormat;
-                    break :b bd;
-                };
+            return false;
+        }
+
+        if (self.want_shared_lock) {
+            self.downgradeToSharedLock() catch |err| {
+                self.diagnostic = .{ .manifest_lock = err };
+                return error.CacheCheckFailed;
+            };
+        }
 
-                const prefix = fmt.parseInt(u8, prefix_str, 10) catch return error.InvalidFormat;
-                if (prefix >= self.cache.prefixes_len) return error.InvalidFormat;
+        return true;
+    }
 
-                if (file_path.len == 0) return error.InvalidFormat;
+    /// Assumes that `self.hash.hasher` has been updated only with the original digest, that
+    /// `self.files` contains only the original input files, and that `self.manifest_file.?` is
+    /// seeked to the start of the file.
+    fn hitWithCurrentLock(self: *Manifest) HitError!union(enum) {
+        hit,
+        miss: struct {
+            file_digests_populated: usize,
+        },
+    } {
+        const gpa = self.cache.gpa;
+        const input_file_count = self.files.entries.len;
 
-                const cache_hash_file = f: {
-                    const prefixed_path: PrefixedPath = .{
-                        .prefix = prefix,
-                        .sub_path = file_path, // expires with file_contents
-                    };
-                    if (idx < input_file_count) {
-                        const file = &self.files.keys()[idx];
-                        if (!file.prefixed_path.eql(prefixed_path))
-                            return error.InvalidFormat;
+        const file_contents = self.manifest_file.?.reader().readAllAlloc(gpa, manifest_file_size_max) catch |err| switch (err) {
+            error.OutOfMemory => return error.OutOfMemory,
+            error.StreamTooLong => return error.OutOfMemory,
+            else => |e| {
+                self.diagnostic = .{ .manifest_read = e };
+                return error.CacheCheckFailed;
+            },
+        };
+        defer gpa.free(file_contents);
+
+        var any_file_changed = false;
+        var line_iter = mem.tokenizeScalar(u8, file_contents, '\n');
+        var idx: usize = 0;
+        const header_valid = valid: {
+            const line = line_iter.next() orelse break :valid false;
+            break :valid std.mem.eql(u8, line, manifest_header);
+        };
+        if (!header_valid) {
+            return .{ .miss = .{ .file_digests_populated = 0 } };
+        }
+        while (line_iter.next()) |line| {
+            defer idx += 1;
+
+            var iter = mem.tokenizeScalar(u8, line, ' ');
+            const size = iter.next() orelse return error.InvalidFormat;
+            const inode = iter.next() orelse return error.InvalidFormat;
+            const mtime_nsec_str = iter.next() orelse return error.InvalidFormat;
+            const digest_str = iter.next() orelse return error.InvalidFormat;
+            const prefix_str = iter.next() orelse return error.InvalidFormat;
+            const file_path = iter.rest();
+
+            const stat_size = fmt.parseInt(u64, size, 10) catch return error.InvalidFormat;
+            const stat_inode = fmt.parseInt(fs.File.INode, inode, 10) catch return error.InvalidFormat;
+            const stat_mtime = fmt.parseInt(i64, mtime_nsec_str, 10) catch return error.InvalidFormat;
+            const file_bin_digest = b: {
+                if (digest_str.len != hex_digest_len) return error.InvalidFormat;
+                var bd: BinDigest = undefined;
+                _ = fmt.hexToBytes(&bd, digest_str) catch return error.InvalidFormat;
+                break :b bd;
+            };
+
+            const prefix = fmt.parseInt(u8, prefix_str, 10) catch return error.InvalidFormat;
+            if (prefix >= self.cache.prefixes_len) return error.InvalidFormat;
+
+            if (file_path.len == 0) return error.InvalidFormat;
 
-                        file.stat = .{
+            const cache_hash_file = f: {
+                const prefixed_path: PrefixedPath = .{
+                    .prefix = prefix,
+                    .sub_path = file_path, // expires with file_contents
+                };
+                if (idx < input_file_count) {
+                    const file = &self.files.keys()[idx];
+                    if (!file.prefixed_path.eql(prefixed_path))
+                        return error.InvalidFormat;
+
+                    file.stat = .{
+                        .size = stat_size,
+                        .inode = stat_inode,
+                        .mtime = stat_mtime,
+                    };
+                    file.bin_digest = file_bin_digest;
+                    break :f file;
+                }
+                const gop = try self.files.getOrPutAdapted(gpa, prefixed_path, FilesAdapter{});
+                errdefer _ = self.files.pop();
+                if (!gop.found_existing) {
+                    gop.key_ptr.* = .{
+                        .prefixed_path = .{
+                            .prefix = prefix,
+                            .sub_path = try gpa.dupe(u8, file_path),
+                        },
+                        .contents = null,
+                        .max_file_size = null,
+                        .handle = null,
+                        .stat = .{
                             .size = stat_size,
                             .inode = stat_inode,
                             .mtime = stat_mtime,
-                        };
-                        file.bin_digest = file_bin_digest;
-                        break :f file;
-                    }
-                    const gop = try self.files.getOrPutAdapted(gpa, prefixed_path, FilesAdapter{});
-                    errdefer _ = self.files.pop();
-                    if (!gop.found_existing) {
-                        gop.key_ptr.* = .{
-                            .prefixed_path = .{
-                                .prefix = prefix,
-                                .sub_path = try gpa.dupe(u8, file_path),
-                            },
-                            .contents = null,
-                            .max_file_size = null,
-                            .handle = null,
-                            .stat = .{
-                                .size = stat_size,
-                                .inode = stat_inode,
-                                .mtime = stat_mtime,
-                            },
-                            .bin_digest = file_bin_digest,
-                        };
-                    }
-                    break :f gop.key_ptr;
-                };
-
-                const pp = cache_hash_file.prefixed_path;
-                const dir = self.cache.prefixes()[pp.prefix].handle;
-                const this_file = dir.openFile(pp.sub_path, .{ .mode = .read_only }) catch |err| switch (err) {
-                    error.FileNotFound => {
-                        if (try self.upgradeToExclusiveLock()) continue;
-                        return false;
-                    },
-                    else => |e| {
-                        self.diagnostic = .{ .file_open = .{
-                            .file_index = idx,
-                            .err = e,
-                        } };
-                        return error.CacheCheckFailed;
-                    },
-                };
-                defer this_file.close();
+                        },
+                        .bin_digest = file_bin_digest,
+                    };
+                }
+                break :f gop.key_ptr;
+            };
 
-                const actual_stat = this_file.stat() catch |err| {
-                    self.diagnostic = .{ .file_stat = .{
+            const pp = cache_hash_file.prefixed_path;
+            const dir = self.cache.prefixes()[pp.prefix].handle;
+            const this_file = dir.openFile(pp.sub_path, .{ .mode = .read_only }) catch |err| switch (err) {
+                error.FileNotFound => {
+                    // Every digest before this one has been populated successfully.
+                    return .{ .miss = .{ .file_digests_populated = idx } };
+                },
+                else => |e| {
+                    self.diagnostic = .{ .file_open = .{
                         .file_index = idx,
-                        .err = err,
+                        .err = e,
                     } };
                     return error.CacheCheckFailed;
-                };
-                const size_match = actual_stat.size == cache_hash_file.stat.size;
-                const mtime_match = actual_stat.mtime == cache_hash_file.stat.mtime;
-                const inode_match = actual_stat.inode == cache_hash_file.stat.inode;
-
-                if (!size_match or !mtime_match or !inode_match) {
-                    self.manifest_dirty = true;
-
-                    cache_hash_file.stat = .{
-                        .size = actual_stat.size,
-                        .mtime = actual_stat.mtime,
-                        .inode = actual_stat.inode,
-                    };
-
-                    if (self.isProblematicTimestamp(cache_hash_file.stat.mtime)) {
-                        // The actual file has an unreliable timestamp, force it to be hashed
-                        cache_hash_file.stat.mtime = 0;
-                        cache_hash_file.stat.inode = 0;
-                    }
-
-                    var actual_digest: BinDigest = undefined;
-                    hashFile(this_file, &actual_digest) catch |err| {
-                        self.diagnostic = .{ .file_read = .{
-                            .file_index = idx,
-                            .err = err,
-                        } };
-                        return error.CacheCheckFailed;
-                    };
+                },
+            };
+            defer this_file.close();
 
-                    if (!mem.eql(u8, &cache_hash_file.bin_digest, &actual_digest)) {
-                        cache_hash_file.bin_digest = actual_digest;
-                        // keep going until we have the input file digests
-                        any_file_changed = true;
-                    }
-                }
+            const actual_stat = this_file.stat() catch |err| {
+                self.diagnostic = .{ .file_stat = .{
+                    .file_index = idx,
+                    .err = err,
+                } };
+                return error.CacheCheckFailed;
+            };
+            const size_match = actual_stat.size == cache_hash_file.stat.size;
+            const mtime_match = actual_stat.mtime == cache_hash_file.stat.mtime;
+            const inode_match = actual_stat.inode == cache_hash_file.stat.inode;
+
+            if (!size_match or !mtime_match or !inode_match) {
+                cache_hash_file.stat = .{
+                    .size = actual_stat.size,
+                    .mtime = actual_stat.mtime,
+                    .inode = actual_stat.inode,
+                };
 
-                if (!any_file_changed) {
-                    self.hash.hasher.update(&cache_hash_file.bin_digest);
+                if (self.isProblematicTimestamp(cache_hash_file.stat.mtime)) {
+                    // The actual file has an unreliable timestamp, force it to be hashed
+                    cache_hash_file.stat.mtime = 0;
+                    cache_hash_file.stat.inode = 0;
                 }
-            }
 
-            if (any_file_changed) {
-                if (try self.upgradeToExclusiveLock()) continue;
-                // cache miss
-                // keep the manifest file open
-                self.unhit(bin_digest, input_file_count);
-                return false;
-            }
+                var actual_digest: BinDigest = undefined;
+                hashFile(this_file, &actual_digest) catch |err| {
+                    self.diagnostic = .{ .file_read = .{
+                        .file_index = idx,
+                        .err = err,
+                    } };
+                    return error.CacheCheckFailed;
+                };
 
-            if (idx < input_file_count) {
-                if (try self.upgradeToExclusiveLock()) continue;
-                self.manifest_dirty = true;
-                while (idx < input_file_count) : (idx += 1) {
-                    self.populateFileHash(&self.files.keys()[idx]) catch |err| {
-                        self.diagnostic = .{ .file_hash = .{
-                            .file_index = idx,
-                            .err = err,
-                        } };
-                        return error.CacheCheckFailed;
-                    };
+                if (!mem.eql(u8, &cache_hash_file.bin_digest, &actual_digest)) {
+                    cache_hash_file.bin_digest = actual_digest;
+                    // keep going until we have the input file digests
+                    any_file_changed = true;
                 }
-                return false;
             }
 
-            if (self.want_shared_lock) {
-                self.downgradeToSharedLock() catch |err| {
-                    self.diagnostic = .{ .manifest_lock = err };
-                    return error.CacheCheckFailed;
-                };
+            if (!any_file_changed) {
+                self.hash.hasher.update(&cache_hash_file.bin_digest);
             }
+        }
 
-            return true;
+        // If the manifest was somehow missing one of our input files, or if any file hash has changed,
+        // then this is a cache miss. However, we have successfully populated some or all of the file
+        // digests.
+        if (any_file_changed or idx < input_file_count) {
+            return .{ .miss = .{ .file_digests_populated = idx } };
         }
+
+        return .hit;
     }
 
+    /// Reset `self.hash.hasher` to the state it should be in after `hit` returns `false`.
+    /// The hasher contains the original input digest, and all original input file digests (i.e.
+    /// not including post files).
+    /// Assumes that `bin_digest` is populated for all files up to `input_file_count`. As such,
+    /// this is not necessarily safe to call within `hit`.
     pub fn unhit(self: *Manifest, bin_digest: BinDigest, input_file_count: usize) void {
         // Reset the hash.
         self.hash.hasher = hasher_init;
lib/std/Build/Step.zig
@@ -759,7 +759,7 @@ fn failWithCacheError(s: *Step, man: *const Build.Cache.Manifest, err: Build.Cac
     switch (err) {
         error.CacheCheckFailed => switch (man.diagnostic) {
             .none => unreachable,
-            .manifest_create, .manifest_read, .manifest_lock => |e| return s.fail("failed to check cache: {s} {s}", .{
+            .manifest_create, .manifest_read, .manifest_lock, .manifest_seek => |e| return s.fail("failed to check cache: {s} {s}", .{
                 @tagName(man.diagnostic), @errorName(e),
             }),
             .file_open, .file_stat, .file_read, .file_hash => |op| {
src/Compilation.zig
@@ -2143,7 +2143,7 @@ pub fn update(comp: *Compilation, main_progress_node: std.Progress.Node) !void {
             const is_hit = man.hit() catch |err| switch (err) {
                 error.CacheCheckFailed => switch (man.diagnostic) {
                     .none => unreachable,
-                    .manifest_create, .manifest_read, .manifest_lock => |e| return comp.setMiscFailure(
+                    .manifest_create, .manifest_read, .manifest_lock, .manifest_seek => |e| return comp.setMiscFailure(
                         .check_whole_cache,
                         "failed to check cache: {s} {s}",
                         .{ @tagName(man.diagnostic), @errorName(e) },