Commit 1a99c99ee9

mlugg <mlugg@mlugg.co.uk>
2024-11-27 01:10:33
std.Build: gracefully handle child stdin closing when running tests
We have deduced that it seems the sporadic BrokenPipe failures happening on the CI runners (e.g. https://github.com/ziglang/zig/actions/runs/12035916948/job/33555963190) are likely caused by the test runner's stdin pipe abnormally closing, likely due to the process crashing. Here, we introduce error handling for this case, so that if these writes fail, the step is marked as failed correctly, and we still collect the child's stderr to report. This won't fix the CI issues, but it should promote them to proper error messages including child stderr, which -- at least in theory -- should allow us to ultimately track down where the errors come from. Note that this change is desirable regardless of bugs in the test runner or similar, since the child process could terminate abnormally for any number of reasons (e.g. a crashing test), and such cases should be correctly reported by the build runner.
1 parent 3ce6de8
Changed files (1)
lib
std
Build
Step
lib/std/Build/Step/Run.zig
@@ -1413,12 +1413,23 @@ fn evalZigTest(
     });
     defer poller.deinit();
 
-    if (fuzz_context) |fuzz| {
-        try sendRunTestMessage(child.stdin.?, .start_fuzzing, fuzz.unit_test_index);
-    } else {
+    // If this is `true`, we avoid ever entering the polling loop below, because the stdin pipe has
+    // somehow already closed; instead, we go straight to capturing stderr in case it has anything
+    // useful.
+    const first_write_failed = if (fuzz_context) |fuzz| failed: {
+        sendRunTestMessage(child.stdin.?, .start_fuzzing, fuzz.unit_test_index) catch |err| {
+            try run.step.addError("unable to write stdin: {s}", .{@errorName(err)});
+            break :failed true;
+        };
+        break :failed false;
+    } else failed: {
         run.fuzz_tests.clearRetainingCapacity();
-        try sendMessage(child.stdin.?, .query_test_metadata);
-    }
+        sendMessage(child.stdin.?, .query_test_metadata) catch |err| {
+            try run.step.addError("unable to write stdin: {s}", .{@errorName(err)});
+            break :failed true;
+        };
+        break :failed false;
+    };
 
     const Header = std.zig.Server.Message.Header;
 
@@ -1437,13 +1448,13 @@ fn evalZigTest(
     var sub_prog_node: ?std.Progress.Node = null;
     defer if (sub_prog_node) |n| n.end();
 
-    poll: while (true) {
+    const any_write_failed = first_write_failed or poll: while (true) {
         while (stdout.readableLength() < @sizeOf(Header)) {
-            if (!(try poller.poll())) break :poll;
+            if (!(try poller.poll())) break :poll false;
         }
         const header = stdout.reader().readStruct(Header) catch unreachable;
         while (stdout.readableLength() < header.bytes_len) {
-            if (!(try poller.poll())) break :poll;
+            if (!(try poller.poll())) break :poll false;
         }
         const body = stdout.readableSliceOfLen(header.bytes_len);
 
@@ -1483,7 +1494,10 @@ fn evalZigTest(
                     .prog_node = prog_node,
                 };
 
-                try requestNextTest(child.stdin.?, &metadata.?, &sub_prog_node);
+                requestNextTest(child.stdin.?, &metadata.?, &sub_prog_node) catch |err| {
+                    try run.step.addError("unable to write stdin: {s}", .{@errorName(err)});
+                    break :poll true;
+                };
             },
             .test_results => {
                 assert(fuzz_context == null);
@@ -1518,7 +1532,10 @@ fn evalZigTest(
                     }
                 }
 
-                try requestNextTest(child.stdin.?, &metadata.?, &sub_prog_node);
+                requestNextTest(child.stdin.?, &metadata.?, &sub_prog_node) catch |err| {
+                    try run.step.addError("unable to write stdin: {s}", .{@errorName(err)});
+                    break :poll true;
+                };
             },
             .coverage_id => {
                 const web_server = fuzz_context.?.web_server;
@@ -1552,6 +1569,12 @@ fn evalZigTest(
         }
 
         stdout.discard(body.len);
+    };
+
+    if (any_write_failed) {
+        // The compiler unexpectedly closed stdin; something is very wrong and has probably crashed.
+        // We want to make sure we've captured all of stderr so that it's logged below.
+        while (try poller.poll()) {}
     }
 
     if (stderr.readableLength() > 0) {