Commit 048e85f27e

mlugg <mlugg@mlugg.co.uk>
2025-01-10 11:27:32
std.process.Child: add `waitForSpawn`
`std.Build.Step.Run` makes the very reasonable assumption that `error.InvalidExe` will be reported on `spawn` if it will happen. However, this property does not currently hold on POSIX targets. This is, through a slightly convoluted series of events, partially responsible for the sporadic `BrokenPipe` errors we've been seeing more and more in CI runs. Making `spawn` wait for the child to exec in the POSIX path introduces a block of up to 400us. So, instead of doing that, we add a new API for this particular case: `waitForSpawn`. This function is a nop on Windows, but on POSIX it blocks until the child successfully (or otherwise) calls `execvpe`, and reports the error if necessary. `std.Build.Step.Run` calls this function, so that it can get `error.InvalidExe` when it wants it. I'm not convinced that this API is optimal. However, I think this entire API needs to be either heavily refactored or straight-up redesigned (related: #22504), so I'm not too worried about hitting the perfect API: I'd rather just fix this bug for now, and figure out the long-term goal a bit later.
1 parent b8e5685
Changed files (2)
lib
std
Build
Step
process
lib/std/Build/Step/Run.zig
@@ -1354,6 +1354,9 @@ fn spawnChildAndCollect(
             _ = child.kill() catch {};
         }
 
+        // We need to report `error.InvalidExe` *now* if applicable.
+        try child.waitForSpawn();
+
         var timer = try std.time.Timer.start();
 
         const result = if (run.stdio == .zig_test)
lib/std/process/Child.zig
@@ -73,7 +73,7 @@ cwd: ?[]const u8,
 /// Once that is done, `cwd` will be deprecated in favor of this field.
 cwd_dir: ?fs.Dir = null,
 
-err_pipe: ?if (native_os == .windows) void else [2]posix.fd_t,
+err_pipe: if (native_os == .windows) void else ?posix.fd_t,
 
 expand_arg0: Arg0Expand,
 
@@ -211,7 +211,7 @@ pub fn init(argv: []const []const u8, allocator: mem.Allocator) ChildProcess {
         .argv = argv,
         .id = undefined,
         .thread_handle = undefined,
-        .err_pipe = null,
+        .err_pipe = if (native_os == .windows) {} else null,
         .term = null,
         .env_map = null,
         .cwd = null,
@@ -293,17 +293,49 @@ pub fn killPosix(self: *ChildProcess) !Term {
         error.ProcessNotFound => return error.AlreadyTerminated,
         else => return err,
     };
-    self.waitUnwrapped();
+    self.waitUnwrappedPosix();
     return self.term.?;
 }
 
 pub const WaitError = SpawnError || std.os.windows.GetProcessMemoryInfoError;
 
+/// On some targets, `spawn` may not report all spawn errors, such as `error.InvalidExe`.
+/// This function will block until any spawn errors can be reported, and return them.
+pub fn waitForSpawn(self: *ChildProcess) SpawnError!void {
+    if (native_os == .windows) return; // `spawn` reports everything
+    if (self.term) |term| {
+        _ = term catch |spawn_err| return spawn_err;
+        return;
+    }
+
+    const err_pipe = self.err_pipe orelse return;
+    self.err_pipe = null;
+
+    // Wait for the child to report any errors in or before `execvpe`.
+    if (readIntFd(err_pipe)) |child_err_int| {
+        posix.close(err_pipe);
+        const child_err: SpawnError = @errorCast(@errorFromInt(child_err_int));
+        self.term = child_err;
+        return child_err;
+    } else |_| {
+        // Write end closed by CLOEXEC at the time of the `execvpe` call, indicating success!
+        posix.close(err_pipe);
+    }
+}
+
 /// Blocks until child process terminates and then cleans up all resources.
 pub fn wait(self: *ChildProcess) WaitError!Term {
-    const term = if (native_os == .windows) try self.waitWindows() else self.waitPosix();
+    try self.waitForSpawn(); // report spawn errors
+    if (self.term) |term| {
+        self.cleanupStreams();
+        return term;
+    }
+    switch (native_os) {
+        .windows => try self.waitUnwrappedWindows(),
+        else => self.waitUnwrappedPosix(),
+    }
     self.id = undefined;
-    return term;
+    return self.term.?;
 }
 
 pub const RunResult = struct {
@@ -405,26 +437,6 @@ pub fn run(args: struct {
     };
 }
 
-fn waitWindows(self: *ChildProcess) WaitError!Term {
-    if (self.term) |term| {
-        self.cleanupStreams();
-        return term;
-    }
-
-    try self.waitUnwrappedWindows();
-    return self.term.?;
-}
-
-fn waitPosix(self: *ChildProcess) SpawnError!Term {
-    if (self.term) |term| {
-        self.cleanupStreams();
-        return term;
-    }
-
-    self.waitUnwrapped();
-    return self.term.?;
-}
-
 fn waitUnwrappedWindows(self: *ChildProcess) WaitError!void {
     const result = windows.WaitForSingleObjectEx(self.id, windows.INFINITE, false);
 
@@ -447,7 +459,7 @@ fn waitUnwrappedWindows(self: *ChildProcess) WaitError!void {
     return result;
 }
 
-fn waitUnwrapped(self: *ChildProcess) void {
+fn waitUnwrappedPosix(self: *ChildProcess) void {
     const res: posix.WaitPidResult = res: {
         if (self.request_resource_usage_statistics) {
             switch (native_os) {
@@ -469,7 +481,7 @@ fn waitUnwrapped(self: *ChildProcess) void {
 }
 
 fn handleWaitResult(self: *ChildProcess, status: u32) void {
-    self.term = self.cleanupAfterWait(status);
+    self.term = statusToTerm(status);
 }
 
 fn cleanupStreams(self: *ChildProcess) void {
@@ -487,46 +499,6 @@ fn cleanupStreams(self: *ChildProcess) void {
     }
 }
 
-fn cleanupAfterWait(self: *ChildProcess, status: u32) !Term {
-    if (self.err_pipe) |err_pipe| {
-        defer destroyPipe(err_pipe);
-
-        if (native_os == .linux) {
-            var fd = [1]posix.pollfd{posix.pollfd{
-                .fd = err_pipe[0],
-                .events = posix.POLL.IN,
-                .revents = undefined,
-            }};
-
-            // Check if the eventfd buffer stores a non-zero value by polling
-            // it, that's the error code returned by the child process.
-            _ = posix.poll(&fd, 0) catch unreachable;
-
-            // According to eventfd(2) the descriptor is readable if the counter
-            // has a value greater than 0
-            if ((fd[0].revents & posix.POLL.IN) != 0) {
-                const err_int = try readIntFd(err_pipe[0]);
-                return @as(SpawnError, @errorCast(@errorFromInt(err_int)));
-            }
-        } else {
-            // Write maxInt(ErrInt) to the write end of the err_pipe. This is after
-            // waitpid, so this write is guaranteed to be after the child
-            // pid potentially wrote an error. This way we can do a blocking
-            // read on the error pipe and either get maxInt(ErrInt) (no error) or
-            // an error code.
-            try writeIntFd(err_pipe[1], maxInt(ErrInt));
-            const err_int = try readIntFd(err_pipe[0]);
-            // Here we potentially return the fork child's error from the parent
-            // pid.
-            if (err_int != maxInt(ErrInt)) {
-                return @as(SpawnError, @errorCast(@errorFromInt(err_int)));
-            }
-        }
-    }
-
-    return statusToTerm(status);
-}
-
 fn statusToTerm(status: u32) Term {
     return if (posix.W.IFEXITED(status))
         Term{ .Exited = posix.W.EXITSTATUS(status) }
@@ -636,18 +608,9 @@ fn spawnPosix(self: *ChildProcess) SpawnError!void {
         }
     };
 
-    // This pipe is used to communicate errors between the time of fork
-    // and execve from the child process to the parent process.
-    const err_pipe = blk: {
-        if (native_os == .linux) {
-            const fd = try posix.eventfd(0, linux.EFD.CLOEXEC);
-            // There's no distinction between the readable and the writeable
-            // end with eventfd
-            break :blk [2]posix.fd_t{ fd, fd };
-        } else {
-            break :blk try posix.pipe2(.{ .CLOEXEC = true });
-        }
-    };
+    // This pipe communicates to the parent errors in the child between `fork` and `execvpe`.
+    // It is closed by the child (via CLOEXEC) without writing if `execvpe` succeeds.
+    const err_pipe: [2]posix.fd_t = try posix.pipe2(.{ .CLOEXEC = true });
     errdefer destroyPipe(err_pipe);
 
     const pid_result = try posix.fork();
@@ -687,6 +650,11 @@ fn spawnPosix(self: *ChildProcess) SpawnError!void {
     }
 
     // we are the parent
+    errdefer comptime unreachable; // The child is forked; we must not error from now on
+
+    posix.close(err_pipe[1]); // make sure only the child holds the write end open
+    self.err_pipe = err_pipe[0];
+
     const pid: i32 = @intCast(pid_result);
     if (self.stdin_behavior == .Pipe) {
         self.stdin = .{ .handle = stdin_pipe[1] };
@@ -705,7 +673,6 @@ fn spawnPosix(self: *ChildProcess) SpawnError!void {
     }
 
     self.id = pid;
-    self.err_pipe = err_pipe;
     self.term = null;
 
     if (self.stdin_behavior == .Pipe) {