Commit d37ee79535

Andrew Kelley <andrew@ziglang.org>
2024-12-11 02:43:42
std.Build.Cache.hit: more discipline in error handling
Previous commits 2b0929929d67e222ca6a9523a3a594ed456c4a51 4ea2f441df36cec61e1017f4d795d4037326c98c had this text: > There are no dir components, so you would think that this was > unreachable, however we have observed on macOS two processes racing to > do openat() with O_CREAT manifest in ENOENT. This appears to have been a misunderstanding based on the issue report #12138 and corresponding PR #12139 in which the steps to reproduce removed the cache directory in a loop which also executed detached Zig compiler processes. There is no evidence for the macOS kernel bug however the ENOENT is easily explained by the removal of the cache directory. This commit reverts those commits, ultimately reporting the ENOENT as an error rather than repeating the create file operation. However this commit also adds an explicit error set to `std.Build.Cache.hit` as well as changing the `failed_file_index` to a proper diagnostic field that fully communicates what failed, leading to more informative error messages on failure to check the cache. The equivalent failure when occuring for AstGen performs a fatal process kill, reasoning being that the compiler has an invariant of the cache directory not being yanked out from underneath it while executing. This could be made a more granular error in the future but I suspect such thing is not valuable to pursue. Related to #18340 but does not solve it.
1 parent c172877
lib/std/Build/Cache.zig
@@ -40,7 +40,7 @@ pub fn addPrefix(cache: *Cache, directory: Directory) void {
 
 /// Be sure to call `Manifest.deinit` after successful initialization.
 pub fn obtain(cache: *Cache) Manifest {
-    return Manifest{
+    return .{
         .cache = cache,
         .hash = cache.hash,
         .manifest_file = null,
@@ -99,9 +99,9 @@ fn findPrefixResolved(cache: *const Cache, resolved_path: []u8) !PrefixedPath {
 }
 
 fn getPrefixSubpath(allocator: Allocator, prefix: []const u8, path: []u8) ![]u8 {
-    const relative = try std.fs.path.relative(allocator, prefix, path);
+    const relative = try fs.path.relative(allocator, prefix, path);
     errdefer allocator.free(relative);
-    var component_iterator = std.fs.path.NativeComponentIterator.init(relative) catch {
+    var component_iterator = fs.path.NativeComponentIterator.init(relative) catch {
         return error.NotASubPath;
     };
     if (component_iterator.root() != null) {
@@ -327,13 +327,27 @@ pub const Manifest = struct {
     want_refresh_timestamp: bool = true,
     files: Files = .{},
     hex_digest: HexDigest,
-    /// Populated when hit() returns an error because of one
-    /// of the files listed in the manifest.
-    failed_file_index: ?usize = null,
+    diagnostic: Diagnostic = .none,
     /// Keeps track of the last time we performed a file system write to observe
     /// what time the file system thinks it is, according to its own granularity.
     recent_problematic_timestamp: i128 = 0,
 
+    pub const Diagnostic = union(enum) {
+        none,
+        manifest_create: fs.File.OpenError,
+        manifest_read: fs.File.ReadError,
+        manifest_lock: fs.File.LockError,
+        file_open: FileOp,
+        file_stat: FileOp,
+        file_read: FileOp,
+        file_hash: FileOp,
+
+        pub const FileOp = struct {
+            file_index: usize,
+            err: anyerror,
+        };
+    };
+
     pub const Files = std.ArrayHashMapUnmanaged(File, void, FilesContext, false);
 
     pub const FilesContext = struct {
@@ -452,6 +466,15 @@ pub const Manifest = struct {
         return self.addDepFileMaybePost(dir, dep_file_basename);
     }
 
+    pub const HitError = error{
+        /// Unable to check the cache for a reason that has been recorded into
+        /// the `diagnostic` field.
+        CacheCheckFailed,
+        /// A cache manifest file exists however it could not be parsed.
+        InvalidFormat,
+        OutOfMemory,
+    };
+
     /// Check the cache to see if the input exists in it. If it exists, returns `true`.
     /// A hex encoding of its hash is available by calling `final`.
     ///
@@ -464,11 +487,11 @@ pub const Manifest = struct {
     /// The lock on the manifest file is released when `deinit` is called. As another
     /// option, one may call `toOwnedLock` to obtain a smaller object which can represent
     /// the lock. `deinit` is safe to call whether or not `toOwnedLock` has been called.
-    pub fn hit(self: *Manifest) !bool {
+    pub fn hit(self: *Manifest) HitError!bool {
         const gpa = self.cache.gpa;
         assert(self.manifest_file == null);
 
-        self.failed_file_index = null;
+        self.diagnostic = .none;
 
         const ext = ".txt";
         var manifest_file_path: [hex_digest_len + ext.len]u8 = undefined;
@@ -496,17 +519,19 @@ pub const Manifest = struct {
                 break;
             } else |err| switch (err) {
                 error.WouldBlock => {
-                    self.manifest_file = try self.cache.manifest_dir.openFile(&manifest_file_path, .{
+                    self.manifest_file = self.cache.manifest_dir.openFile(&manifest_file_path, .{
                         .mode = .read_write,
                         .lock = .shared,
-                    });
+                    }) catch |e| {
+                        self.diagnostic = .{ .manifest_create = e };
+                        return error.CacheCheckFailed;
+                    };
                     break;
                 },
-                // There are no dir components, so you would think that this was
-                // unreachable, however we have observed on macOS two processes racing
-                // to do openat() with O_CREAT manifest in ENOENT.
-                error.FileNotFound => continue,
-                else => |e| return e,
+                else => |e| {
+                    self.diagnostic = .{ .manifest_create = e };
+                    return error.CacheCheckFailed;
+                },
             }
         }
 
@@ -514,7 +539,14 @@ pub const Manifest = struct {
 
         const input_file_count = self.files.entries.len;
         while (true) : (self.unhit(bin_digest, input_file_count)) {
-            const file_contents = try self.manifest_file.?.reader().readAllAlloc(gpa, manifest_file_size_max);
+            const file_contents = self.manifest_file.?.reader().readAllAlloc(gpa, manifest_file_size_max) catch |err| switch (err) {
+                error.OutOfMemory => return error.OutOfMemory,
+                error.StreamTooLong => return error.OutOfMemory,
+                else => |e| {
+                    self.diagnostic = .{ .manifest_read = e };
+                    return error.CacheCheckFailed;
+                },
+            };
             defer gpa.free(file_contents);
 
             var any_file_changed = false;
@@ -526,8 +558,11 @@ pub const Manifest = struct {
                 while (idx < input_file_count) : (idx += 1) {
                     const ch_file = &self.files.keys()[idx];
                     self.populateFileHash(ch_file) catch |err| {
-                        self.failed_file_index = idx;
-                        return err;
+                        self.diagnostic = .{ .file_hash = .{
+                            .file_index = idx,
+                            .err = err,
+                        } };
+                        return error.CacheCheckFailed;
                     };
                 }
                 return false;
@@ -605,13 +640,22 @@ pub const Manifest = struct {
                         if (try self.upgradeToExclusiveLock()) continue;
                         return false;
                     },
-                    else => return error.CacheUnavailable,
+                    else => |e| {
+                        self.diagnostic = .{ .file_open = .{
+                            .file_index = idx,
+                            .err = e,
+                        } };
+                        return error.CacheCheckFailed;
+                    },
                 };
                 defer this_file.close();
 
                 const actual_stat = this_file.stat() catch |err| {
-                    self.failed_file_index = idx;
-                    return err;
+                    self.diagnostic = .{ .file_stat = .{
+                        .file_index = idx,
+                        .err = err,
+                    } };
+                    return error.CacheCheckFailed;
                 };
                 const size_match = actual_stat.size == cache_hash_file.stat.size;
                 const mtime_match = actual_stat.mtime == cache_hash_file.stat.mtime;
@@ -634,8 +678,11 @@ pub const Manifest = struct {
 
                     var actual_digest: BinDigest = undefined;
                     hashFile(this_file, &actual_digest) catch |err| {
-                        self.failed_file_index = idx;
-                        return err;
+                        self.diagnostic = .{ .file_read = .{
+                            .file_index = idx,
+                            .err = err,
+                        } };
+                        return error.CacheCheckFailed;
                     };
 
                     if (!mem.eql(u8, &cache_hash_file.bin_digest, &actual_digest)) {
@@ -662,17 +709,22 @@ pub const Manifest = struct {
                 if (try self.upgradeToExclusiveLock()) continue;
                 self.manifest_dirty = true;
                 while (idx < input_file_count) : (idx += 1) {
-                    const ch_file = &self.files.keys()[idx];
-                    self.populateFileHash(ch_file) catch |err| {
-                        self.failed_file_index = idx;
-                        return err;
+                    self.populateFileHash(&self.files.keys()[idx]) catch |err| {
+                        self.diagnostic = .{ .file_hash = .{
+                            .file_index = idx,
+                            .err = err,
+                        } };
+                        return error.CacheCheckFailed;
                     };
                 }
                 return false;
             }
 
             if (self.want_shared_lock) {
-                try self.downgradeToSharedLock();
+                self.downgradeToSharedLock() catch |err| {
+                    self.diagnostic = .{ .manifest_lock = err };
+                    return error.CacheCheckFailed;
+                };
             }
 
             return true;
@@ -1010,7 +1062,7 @@ pub const Manifest = struct {
         self.have_exclusive_lock = false;
     }
 
-    fn upgradeToExclusiveLock(self: *Manifest) !bool {
+    fn upgradeToExclusiveLock(self: *Manifest) error{CacheCheckFailed}!bool {
         if (self.have_exclusive_lock) return false;
         assert(self.manifest_file != null);
 
@@ -1022,7 +1074,10 @@ pub const Manifest = struct {
             // Here we intentionally have a period where the lock is released, in case there are
             // other processes holding a shared lock.
             manifest_file.unlock();
-            try manifest_file.lock(.exclusive);
+            manifest_file.lock(.exclusive) catch |err| {
+                self.diagnostic = .{ .manifest_lock = err };
+                return error.CacheCheckFailed;
+            };
         }
         self.have_exclusive_lock = true;
         return true;
@@ -1132,7 +1187,7 @@ pub fn writeSmallFile(dir: fs.Dir, sub_path: []const u8, data: []const u8) !void
     }
 }
 
-fn hashFile(file: fs.File, bin_digest: *[Hasher.mac_length]u8) !void {
+fn hashFile(file: fs.File, bin_digest: *[Hasher.mac_length]u8) fs.File.PReadError!void {
     var buf: [1024]u8 = undefined;
     var hasher = hasher_init;
     var off: u64 = 0;
lib/std/Build/Step.zig
@@ -754,11 +754,24 @@ pub fn cacheHitAndWatch(s: *Step, man: *Build.Cache.Manifest) !bool {
     return is_hit;
 }
 
-fn failWithCacheError(s: *Step, man: *const Build.Cache.Manifest, err: anyerror) anyerror {
-    const i = man.failed_file_index orelse return err;
-    const pp = man.files.keys()[i].prefixed_path;
-    const prefix = man.cache.prefixes()[pp.prefix].path orelse "";
-    return s.fail("{s}: {s}/{s}", .{ @errorName(err), prefix, pp.sub_path });
+fn failWithCacheError(s: *Step, man: *const Build.Cache.Manifest, err: Build.Cache.Manifest.HitError) error{ OutOfMemory, MakeFailed } {
+    switch (err) {
+        error.CacheCheckFailed => switch (man.diagnostic) {
+            .none => unreachable,
+            .manifest_create, .manifest_read, .manifest_lock => |e| return s.fail("failed to check cache: {s} {s}", .{
+                @tagName(man.diagnostic), @errorName(e),
+            }),
+            .file_open, .file_stat, .file_read, .file_hash => |op| {
+                const pp = man.files.keys()[op.file_index].prefixed_path;
+                const prefix = man.cache.prefixes()[pp.prefix].path orelse "";
+                return s.fail("failed to check cache: '{s}{s}' {s} {s}", .{
+                    prefix, pp.sub_path, @tagName(man.diagnostic), @errorName(op.err),
+                });
+            },
+        },
+        error.OutOfMemory => return error.OutOfMemory,
+        error.InvalidFormat => return s.fail("failed check cache: invalid manifest file format", .{}),
+    }
 }
 
 /// Prefer `writeManifestAndWatch` unless you already added watch inputs
lib/std/posix.zig
@@ -1537,6 +1537,10 @@ pub const OpenError = error{
     ProcessFdQuotaExceeded,
     SystemFdQuotaExceeded,
     NoDevice,
+    /// Either:
+    /// * One of the path components does not exist.
+    /// * Cwd was used, but cwd has been deleted.
+    /// * The path associated with the open directory handle has been deleted.
     FileNotFound,
 
     /// The path exceeded `max_path_bytes` bytes.
src/Zcu/PerThread.zig
@@ -135,10 +135,14 @@ pub fn astGenFile(
             error.PipeBusy => unreachable, // it's not a pipe
             error.NoDevice => unreachable, // it's not a pipe
             error.WouldBlock => unreachable, // not asking for non-blocking I/O
-            // There are no dir components, so you would think that this was
-            // unreachable, however we have observed on macOS two processes racing
-            // to do openat() with O_CREAT manifest in ENOENT.
-            error.FileNotFound => continue,
+            error.FileNotFound => {
+                // Since there are no dir components this could only occur if
+                // `zir_dir` is deleted after the compiler process obtains an
+                // open directory handle.
+                std.process.fatal("cache directory '{}' unexpectedly removed during compiler execution", .{
+                    cache_directory,
+                });
+            },
 
             else => |e| return e, // Retryable errors are handled at callsite.
         };
src/Compilation.zig
@@ -2048,15 +2048,30 @@ pub fn update(comp: *Compilation, main_progress_node: std.Progress.Node) !void {
             whole.cache_manifest = &man;
             try addNonIncrementalStuffToCacheManifest(comp, arena, &man);
 
-            const is_hit = man.hit() catch |err| {
-                const i = man.failed_file_index orelse return err;
-                const pp = man.files.keys()[i].prefixed_path;
-                const prefix = man.cache.prefixes()[pp.prefix];
-                return comp.setMiscFailure(
+            const is_hit = man.hit() catch |err| switch (err) {
+                error.CacheCheckFailed => switch (man.diagnostic) {
+                    .none => unreachable,
+                    .manifest_create, .manifest_read, .manifest_lock => |e| return comp.setMiscFailure(
+                        .check_whole_cache,
+                        "failed to check cache: {s} {s}",
+                        .{ @tagName(man.diagnostic), @errorName(e) },
+                    ),
+                    .file_open, .file_stat, .file_read, .file_hash => |op| {
+                        const pp = man.files.keys()[op.file_index].prefixed_path;
+                        const prefix = man.cache.prefixes()[pp.prefix];
+                        return comp.setMiscFailure(
+                            .check_whole_cache,
+                            "failed to check cache: '{}{s}' {s} {s}",
+                            .{ prefix, pp.sub_path, @tagName(man.diagnostic), @errorName(op.err) },
+                        );
+                    },
+                },
+                error.OutOfMemory => return error.OutOfMemory,
+                error.InvalidFormat => return comp.setMiscFailure(
                     .check_whole_cache,
-                    "unable to check cache: stat file '{}{s}' failed: {s}",
-                    .{ prefix, pp.sub_path, @errorName(err) },
-                );
+                    "failed check cache: invalid manifest file format",
+                    .{},
+                ),
             };
             if (is_hit) {
                 // In this case the cache hit contains the full set of file system inputs. Nice!
src/link.zig
@@ -768,7 +768,7 @@ pub const File = struct {
     /// TODO audit this error set. most of these should be collapsed into one error,
     /// and Diags.Flags should be updated to convey the meaning to the user.
     pub const FlushError = error{
-        CacheUnavailable,
+        CacheCheckFailed,
         CurrentWorkingDirectoryUnlinked,
         DivisionByZero,
         DllImportLibraryNotFound,