Commit cf4bccf765

Andrew Kelley <andrew@ziglang.org>
2019-09-12 02:22:49
improvements targeted at improving async functions
* Reuse bytes of async function frames when non-async functions make
  `noasync` calls. This prevents explosive stack growth.

* Zig now passes a stack size argument to the linker when linking ELF
  binaries. Linux ignores this value, but it is available as a program
  header called GNU_STACK. I prototyped some code that memory maps extra
  space to the stack using this program header, but there was still a
  problem when accessing stack memory very far down. Stack probing is
  needed or not working or something. I also prototyped using
  `@newStackCall` to call main and that does work around the issue but it
  also brings its own issues. That code is commented out for now in
  std/special/start.zig. I'm on a plane with no Internet, but I plan to
  consult with the musl community for advice when I get a chance.

* Added `noasync` to a bunch of function calls in std.debug. It's very
  messy but it's a workaround that makes stack traces functional with
  evented I/O enabled. Eventually these will be cleaned up as the root
  bugs are found and fixed. Programs built in blocking mode are
  unaffected.

* Lowered the default stack size of std.io.InStream (for the async
  version) to 1 MiB instead of 4. Until we figure out how to get choosing
  a stack size working (see 2nd bullet point above), 4 MiB tends to cause
  segfaults due to stack size running out, or usage of stack memory too
  far apart, or something like that.

* Default thread stack size is bumped from 8 MiB to 16 to match the size
  we give for the main thread. It's planned to eventually remove this
  hard coded value and have Zig able to determine this value during
  semantic analysis, with call graph analysis and function pointer
  annotations and extern function annotations.
1 parent 68b49f7
Changed files (7)
src/codegen.cpp
@@ -7184,6 +7184,9 @@ static void do_code_gen(CodeGen *g) {
 
         if (!is_async) {
             // allocate async frames for noasync calls & awaits to async functions
+            ZigType *largest_call_frame_type = nullptr;
+            IrInstruction *all_calls_alloca = ir_create_alloca(g, &fn_table_entry->fndef_scope->base,
+                    fn_table_entry->body_node, fn_table_entry, g->builtin_types.entry_void, "@async_call_frame");
             for (size_t i = 0; i < fn_table_entry->call_list.length; i += 1) {
                 IrInstructionCallGen *call = fn_table_entry->call_list.at(i);
                 if (call->fn_entry == nullptr)
@@ -7195,8 +7198,15 @@ static void do_code_gen(CodeGen *g) {
                 if (call->frame_result_loc != nullptr)
                     continue;
                 ZigType *callee_frame_type = get_fn_frame_type(g, call->fn_entry);
-                call->frame_result_loc = ir_create_alloca(g, call->base.scope, call->base.source_node,
-                        fn_table_entry, callee_frame_type, "");
+                if (largest_call_frame_type == nullptr ||
+                    callee_frame_type->abi_size > largest_call_frame_type->abi_size)
+                {
+                    largest_call_frame_type = callee_frame_type;
+                }
+                call->frame_result_loc = all_calls_alloca;
+            }
+            if (largest_call_frame_type != nullptr) {
+                all_calls_alloca->value.type = get_pointer_to_type(g, largest_call_frame_type, false);
             }
             // allocate temporary stack data
             for (size_t alloca_i = 0; alloca_i < fn_table_entry->alloca_gen_list.length; alloca_i += 1) {
src/link.cpp
@@ -1615,6 +1615,11 @@ static void construct_linker_job_elf(LinkJob *lj) {
 
     lj->args.append("-error-limit=0");
 
+    if (g->out_type == OutTypeExe) {
+        lj->args.append("-z");
+        lj->args.append("stack-size=16777216"); // default to 16 MiB
+    }
+
     if (g->linker_script) {
         lj->args.append("-T");
         lj->args.append(g->linker_script);
std/io/in_stream.zig
@@ -6,7 +6,7 @@ const assert = std.debug.assert;
 const mem = std.mem;
 const Buffer = std.Buffer;
 
-pub const default_stack_size = 4 * 1024 * 1024;
+pub const default_stack_size = 1 * 1024 * 1024;
 pub const stack_size: usize = if (@hasDecl(root, "stack_size_std_io_InStream"))
     root.stack_size_std_io_InStream
 else
std/os/linux/tls.zig
@@ -125,7 +125,7 @@ pub fn setThreadPointer(addr: usize) void {
     }
 }
 
-pub fn initTLS() void {
+pub fn initTLS() ?*elf.Phdr {
     var tls_phdr: ?*elf.Phdr = null;
     var img_base: usize = 0;
 
@@ -152,10 +152,13 @@ pub fn initTLS() void {
     // Search the TLS section
     const phdrs = (@intToPtr([*]elf.Phdr, at_phdr))[0..at_phnum];
 
+    var gnu_stack: ?*elf.Phdr = null;
+
     for (phdrs) |*phdr| {
         switch (phdr.p_type) {
             elf.PT_PHDR => img_base = at_phdr - phdr.p_vaddr,
             elf.PT_TLS => tls_phdr = phdr,
+            elf.PT_GNU_STACK => gnu_stack = phdr,
             else => continue,
         }
     }
@@ -217,6 +220,8 @@ pub fn initTLS() void {
             .data_offset = data_offset,
         };
     }
+
+    return gnu_stack;
 }
 
 pub fn copyTLS(addr: usize) usize {
std/special/start.zig
@@ -5,7 +5,7 @@ const std = @import("std");
 const builtin = @import("builtin");
 const assert = std.debug.assert;
 
-var argc_ptr: [*]usize = undefined;
+var starting_stack_ptr: [*]usize = undefined;
 
 const is_wasm = switch (builtin.arch) {
     .wasm32, .wasm64 => true,
@@ -35,17 +35,17 @@ nakedcc fn _start() noreturn {
 
     switch (builtin.arch) {
         .x86_64 => {
-            argc_ptr = asm (""
+            starting_stack_ptr = asm (""
                 : [argc] "={rsp}" (-> [*]usize)
             );
         },
         .i386 => {
-            argc_ptr = asm (""
+            starting_stack_ptr = asm (""
                 : [argc] "={esp}" (-> [*]usize)
             );
         },
         .aarch64, .aarch64_be, .arm => {
-            argc_ptr = asm ("mov %[argc], sp"
+            starting_stack_ptr = asm ("mov %[argc], sp"
                 : [argc] "=r" (-> [*]usize)
             );
         },
@@ -72,8 +72,8 @@ fn posixCallMainAndExit() noreturn {
     if (builtin.os == builtin.Os.freebsd) {
         @setAlignStack(16);
     }
-    const argc = argc_ptr[0];
-    const argv = @ptrCast([*][*]u8, argc_ptr + 1);
+    const argc = starting_stack_ptr[0];
+    const argv = @ptrCast([*][*]u8, starting_stack_ptr + 1);
 
     const envp_optional = @ptrCast([*]?[*]u8, argv + argc + 1);
     var envp_count: usize = 0;
@@ -85,21 +85,40 @@ fn posixCallMainAndExit() noreturn {
         const auxv = @ptrCast([*]std.elf.Auxv, envp.ptr + envp_count + 1);
         std.os.linux.elf_aux_maybe = auxv;
         // Initialize the TLS area
-        std.os.linux.tls.initTLS();
+        const gnu_stack_phdr = std.os.linux.tls.initTLS() orelse @panic("ELF missing stack size");
 
         if (std.os.linux.tls.tls_image) |tls_img| {
             const tls_addr = std.os.linux.tls.allocateTLS(tls_img.alloc_size);
             const tp = std.os.linux.tls.copyTLS(tls_addr);
             std.os.linux.tls.setThreadPointer(tp);
         }
+
+        // TODO This is disabled because what should we do when linking libc and this code
+        // does not execute? And also it's causing a test failure in stack traces in release modes.
+
+        //// Linux ignores the stack size from the ELF file, and instead always does 8 MiB. A further
+        //// problem is that it uses PROT_GROWSDOWN which prevents stores to addresses too far down
+        //// the stack and requires "probing". So here we allocate our own stack.
+        //const wanted_stack_size = gnu_stack_phdr.p_memsz;
+        //assert(wanted_stack_size % std.mem.page_size == 0);
+        //// Allocate an extra page as the guard page.
+        //const total_size = wanted_stack_size + std.mem.page_size;
+        //const new_stack = std.os.mmap(
+        //    null,
+        //    total_size,
+        //    std.os.PROT_READ | std.os.PROT_WRITE,
+        //    std.os.MAP_PRIVATE | std.os.MAP_ANONYMOUS,
+        //    -1,
+        //    0,
+        //) catch @panic("out of memory");
+        //std.os.mprotect(new_stack[0..std.mem.page_size], std.os.PROT_NONE) catch {};
+        //std.os.exit(@newStackCall(new_stack, callMainWithArgs, argc, argv, envp));
     }
 
-    std.os.exit(callMainWithArgs(argc, argv, envp));
+    std.os.exit(@inlineCall(callMainWithArgs, argc, argv, envp));
 }
 
-// This is marked inline because for some reason LLVM in release mode fails to inline it,
-// and we want fewer call frames in stack traces.
-inline fn callMainWithArgs(argc: usize, argv: [*][*]u8, envp: [][*]u8) u8 {
+fn callMainWithArgs(argc: usize, argv: [*][*]u8, envp: [][*]u8) u8 {
     std.os.argv = argv[0..argc];
     std.os.environ = envp;
 
@@ -112,7 +131,7 @@ extern fn main(c_argc: i32, c_argv: [*][*]u8, c_envp: [*]?[*]u8) i32 {
     var env_count: usize = 0;
     while (c_envp[env_count] != null) : (env_count += 1) {}
     const envp = @ptrCast([*][*]u8, c_envp)[0..env_count];
-    return callMainWithArgs(@intCast(usize, c_argc), c_argv, envp);
+    return @inlineCall(callMainWithArgs, @intCast(usize, c_argc), c_argv, envp);
 }
 
 // General error message for a malformed return type
std/debug.zig
@@ -1478,10 +1478,11 @@ const LineNumberProgram = struct {
     }
 };
 
+// TODO the noasyncs here are workarounds
 fn readStringRaw(allocator: *mem.Allocator, in_stream: var) ![]u8 {
     var buf = ArrayList(u8).init(allocator);
     while (true) {
-        const byte = try in_stream.readByte();
+        const byte = try noasync in_stream.readByte();
         if (byte == 0) break;
         try buf.append(byte);
     }
@@ -1494,10 +1495,11 @@ fn getString(di: *DwarfInfo, offset: u64) ![]u8 {
     return di.readString();
 }
 
+// TODO the noasyncs here are workarounds
 fn readAllocBytes(allocator: *mem.Allocator, in_stream: var, size: usize) ![]u8 {
     const buf = try allocator.alloc(u8, size);
     errdefer allocator.free(buf);
-    if ((try in_stream.read(buf)) < size) return error.EndOfFile;
+    if ((try noasync in_stream.read(buf)) < size) return error.EndOfFile;
     return buf;
 }
 
@@ -1506,8 +1508,9 @@ fn parseFormValueBlockLen(allocator: *mem.Allocator, in_stream: var, size: usize
     return FormValue{ .Block = buf };
 }
 
+// TODO the noasyncs here are workarounds
 fn parseFormValueBlock(allocator: *mem.Allocator, in_stream: var, size: usize) !FormValue {
-    const block_len = try in_stream.readVarInt(usize, builtin.Endian.Little, size);
+    const block_len = try noasync in_stream.readVarInt(usize, builtin.Endian.Little, size);
     return parseFormValueBlockLen(allocator, in_stream, block_len);
 }
 
@@ -1537,27 +1540,37 @@ fn parseFormValueConstant(allocator: *mem.Allocator, in_stream: var, signed: boo
     };
 }
 
+// TODO the noasyncs here are workarounds
 fn parseFormValueDwarfOffsetSize(in_stream: var, is_64: bool) !u64 {
-    return if (is_64) try in_stream.readIntLittle(u64) else u64(try in_stream.readIntLittle(u32));
+    return if (is_64) try noasync in_stream.readIntLittle(u64) else u64(try noasync in_stream.readIntLittle(u32));
 }
 
+// TODO the noasyncs here are workarounds
 fn parseFormValueTargetAddrSize(in_stream: var) !u64 {
-    return if (@sizeOf(usize) == 4) u64(try in_stream.readIntLittle(u32)) else if (@sizeOf(usize) == 8) try in_stream.readIntLittle(u64) else unreachable;
+    if (@sizeOf(usize) == 4) {
+        return u64(try noasync in_stream.readIntLittle(u32));
+    } else if (@sizeOf(usize) == 8) {
+        return noasync in_stream.readIntLittle(u64);
+    } else {
+        unreachable;
+    }
 }
 
+// TODO the noasyncs here are workarounds
 fn parseFormValueRef(allocator: *mem.Allocator, in_stream: var, size: i32) !FormValue {
     return FormValue{
         .Ref = switch (size) {
-            1 => try in_stream.readIntLittle(u8),
-            2 => try in_stream.readIntLittle(u16),
-            4 => try in_stream.readIntLittle(u32),
-            8 => try in_stream.readIntLittle(u64),
-            -1 => try leb.readULEB128(u64, in_stream),
+            1 => try noasync in_stream.readIntLittle(u8),
+            2 => try noasync in_stream.readIntLittle(u16),
+            4 => try noasync in_stream.readIntLittle(u32),
+            8 => try noasync in_stream.readIntLittle(u64),
+            -1 => try noasync leb.readULEB128(u64, in_stream),
             else => unreachable,
         },
     };
 }
 
+// TODO the noasyncs here are workarounds
 fn parseFormValue(allocator: *mem.Allocator, in_stream: var, form_id: u64, is_64: bool) anyerror!FormValue {
     return switch (form_id) {
         DW.FORM_addr => FormValue{ .Address = try parseFormValueTargetAddrSize(in_stream) },
@@ -1565,7 +1578,7 @@ fn parseFormValue(allocator: *mem.Allocator, in_stream: var, form_id: u64, is_64
         DW.FORM_block2 => parseFormValueBlock(allocator, in_stream, 2),
         DW.FORM_block4 => parseFormValueBlock(allocator, in_stream, 4),
         DW.FORM_block => x: {
-            const block_len = try leb.readULEB128(usize, in_stream);
+            const block_len = try noasync leb.readULEB128(usize, in_stream);
             return parseFormValueBlockLen(allocator, in_stream, block_len);
         },
         DW.FORM_data1 => parseFormValueConstant(allocator, in_stream, false, 1),
@@ -1577,11 +1590,11 @@ fn parseFormValue(allocator: *mem.Allocator, in_stream: var, form_id: u64, is_64
             return parseFormValueConstant(allocator, in_stream, signed, -1);
         },
         DW.FORM_exprloc => {
-            const size = try leb.readULEB128(usize, in_stream);
+            const size = try noasync leb.readULEB128(usize, in_stream);
             const buf = try readAllocBytes(allocator, in_stream, size);
             return FormValue{ .ExprLoc = buf };
         },
-        DW.FORM_flag => FormValue{ .Flag = (try in_stream.readByte()) != 0 },
+        DW.FORM_flag => FormValue{ .Flag = (try noasync in_stream.readByte()) != 0 },
         DW.FORM_flag_present => FormValue{ .Flag = true },
         DW.FORM_sec_offset => FormValue{ .SecOffset = try parseFormValueDwarfOffsetSize(in_stream, is_64) },
 
@@ -1592,12 +1605,12 @@ fn parseFormValue(allocator: *mem.Allocator, in_stream: var, form_id: u64, is_64
         DW.FORM_ref_udata => parseFormValueRef(allocator, in_stream, -1),
 
         DW.FORM_ref_addr => FormValue{ .RefAddr = try parseFormValueDwarfOffsetSize(in_stream, is_64) },
-        DW.FORM_ref_sig8 => FormValue{ .Ref = try in_stream.readIntLittle(u64) },
+        DW.FORM_ref_sig8 => FormValue{ .Ref = try noasync in_stream.readIntLittle(u64) },
 
         DW.FORM_string => FormValue{ .String = try readStringRaw(allocator, in_stream) },
         DW.FORM_strp => FormValue{ .StrPtr = try parseFormValueDwarfOffsetSize(in_stream, is_64) },
         DW.FORM_indirect => {
-            const child_form_id = try leb.readULEB128(u64, in_stream);
+            const child_form_id = try noasync leb.readULEB128(u64, in_stream);
             const F = @typeOf(async parseFormValue(allocator, in_stream, child_form_id, is_64));
             var frame = try allocator.create(F);
             defer allocator.destroy(frame);
@@ -2400,3 +2413,9 @@ stdcallcc fn handleSegfaultWindows(info: *windows.EXCEPTION_POINTERS) c_long {
         else => return windows.EXCEPTION_CONTINUE_SEARCH,
     }
 }
+
+pub fn dumpStackPointerAddr(prefix: []const u8) void {
+    const sp = asm ("" : [argc] "={rsp}" (-> usize));
+    std.debug.warn("{} sp = 0x{x}\n", prefix, sp);
+}
+
std/thread.zig
@@ -145,7 +145,7 @@ pub const Thread = struct {
         if (builtin.single_threaded) @compileError("cannot spawn thread when building in single-threaded mode");
         // TODO compile-time call graph analysis to determine stack upper bound
         // https://github.com/ziglang/zig/issues/157
-        const default_stack_size = 8 * 1024 * 1024;
+        const default_stack_size = 16 * 1024 * 1024;
 
         const Context = @typeOf(context);
         comptime assert(@ArgType(@typeOf(startFn), 0) == Context);