Commit 7ff42eff91

Andrew Kelley <andrew@ziglang.org>
2024-12-11 05:44:00
std.Build.Cache.hit: work around macOS kernel bug
The previous commit cast doubt upon the initial report about macOS kernel behavior, identifying another reason that ENOENT could be returned from file creation. However, it is demonstrable that ENOENT can be returned for both cases: 1. create file race 2. handle refers to deleted directory This commit re-introduces the workaround for the file creation race on macOS however it does not unconditionally retry - it first tries again with O_EXCL to disambiguate the error condition that has occurred.
1 parent d37ee79
Changed files (5)
lib/std/Build/Cache.zig
@@ -528,6 +528,43 @@ pub const Manifest = struct {
                     };
                     break;
                 },
+                error.FileNotFound => {
+                    // There are no dir components, so the only possibility
+                    // should be that the directory behind the handle has been
+                    // deleted, however we have observed on macOS two processes
+                    // racing to do openat() with O_CREAT manifest in ENOENT.
+                    //
+                    // As a workaround, we retry with exclusive=true which
+                    // disambiguates by returning EEXIST, indicating original
+                    // failure was a race, or ENOENT, indicating deletion of
+                    // the directory of our open handle.
+                    if (builtin.os.tag != .macos) {
+                        self.diagnostic = .{ .manifest_create = error.FileNotFound };
+                        return error.CacheCheckFailed;
+                    }
+
+                    if (self.cache.manifest_dir.createFile(&manifest_file_path, .{
+                        .read = true,
+                        .truncate = false,
+                        .lock = .exclusive,
+                        .lock_nonblocking = self.want_shared_lock,
+                        .exclusive = true,
+                    })) |manifest_file| {
+                        self.manifest_file = manifest_file;
+                        self.have_exclusive_lock = true;
+                        break;
+                    } else |excl_err| switch (excl_err) {
+                        error.WouldBlock, error.PathAlreadyExists => continue,
+                        error.FileNotFound => {
+                            self.diagnostic = .{ .manifest_create = error.FileNotFound };
+                            return error.CacheCheckFailed;
+                        },
+                        else => |e| {
+                            self.diagnostic = .{ .manifest_create = e };
+                            return error.CacheCheckFailed;
+                        },
+                    }
+                },
                 else => |e| {
                     self.diagnostic = .{ .manifest_create = e };
                     return error.CacheCheckFailed;
lib/std/Build/Step.zig
@@ -770,7 +770,7 @@ fn failWithCacheError(s: *Step, man: *const Build.Cache.Manifest, err: Build.Cac
             },
         },
         error.OutOfMemory => return error.OutOfMemory,
-        error.InvalidFormat => return s.fail("failed check cache: invalid manifest file format", .{}),
+        error.InvalidFormat => return s.fail("failed to check cache: invalid manifest file format", .{}),
     }
 }
 
lib/std/posix.zig
@@ -1541,6 +1541,8 @@ pub const OpenError = error{
     /// * One of the path components does not exist.
     /// * Cwd was used, but cwd has been deleted.
     /// * The path associated with the open directory handle has been deleted.
+    /// * On macOS, multiple processes or threads raced to create the same file
+    ///   with `O.EXCL` set to `false`.
     FileNotFound,
 
     /// The path exceeded `max_path_bytes` bytes.
src/Zcu/PerThread.zig
@@ -136,12 +136,34 @@ pub fn astGenFile(
             error.NoDevice => unreachable, // it's not a pipe
             error.WouldBlock => unreachable, // not asking for non-blocking I/O
             error.FileNotFound => {
-                // Since there are no dir components this could only occur if
-                // `zir_dir` is deleted after the compiler process obtains an
-                // open directory handle.
-                std.process.fatal("cache directory '{}' unexpectedly removed during compiler execution", .{
-                    cache_directory,
-                });
+                // There are no dir components, so the only possibility should
+                // be that the directory behind the handle has been deleted,
+                // however we have observed on macOS two processes racing to do
+                // openat() with O_CREAT manifest in ENOENT.
+                //
+                // As a workaround, we retry with exclusive=true which
+                // disambiguates by returning EEXIST, indicating original
+                // failure was a race, or ENOENT, indicating deletion of the
+                // directory of our open handle.
+                if (builtin.os.tag != .macos) {
+                    std.process.fatal("cache directory '{}' unexpectedly removed during compiler execution", .{
+                        cache_directory,
+                    });
+                }
+                break zir_dir.createFile(&hex_digest, .{
+                    .read = true,
+                    .truncate = false,
+                    .lock = lock,
+                    .exclusive = true,
+                }) catch |excl_err| switch (excl_err) {
+                    error.PathAlreadyExists => continue,
+                    error.FileNotFound => {
+                        std.process.fatal("cache directory '{}' unexpectedly removed during compiler execution", .{
+                            cache_directory,
+                        });
+                    },
+                    else => |e| return e,
+                };
             },
 
             else => |e| return e, // Retryable errors are handled at callsite.
src/Compilation.zig
@@ -2069,7 +2069,7 @@ pub fn update(comp: *Compilation, main_progress_node: std.Progress.Node) !void {
                 error.OutOfMemory => return error.OutOfMemory,
                 error.InvalidFormat => return comp.setMiscFailure(
                     .check_whole_cache,
-                    "failed check cache: invalid manifest file format",
+                    "failed to check cache: invalid manifest file format",
                     .{},
                 ),
             };