Commit d788b0cd8b

LemonBoy <thatlemon@gmail.com>
2020-03-25 12:08:50
std: Minor changes to TLS handling
* Always allocate an info block per-thread so that libc can store important stuff there.
* Respect ABI-mandated alignment in more places.
* Nicer code: use slices/pointers instead of raw addresses whenever possible.
1 parent 12e1c6e
Changed files (4)
lib/std/os/linux/tls.zig
@@ -1,8 +1,9 @@
 const std = @import("std");
+const builtin = std.builtin;
 const os = std.os;
 const mem = std.mem;
 const elf = std.elf;
-const builtin = @import("builtin");
+const math = std.math;
 const assert = std.debug.assert;
 
 // This file implements the two TLS variants [1] used by ELF-based systems.
@@ -57,28 +58,16 @@ const tls_tcb_size = switch (builtin.arch) {
     // ARM EABI mandates enough space for two pointers: the first one points to
     // the DTV while the second one is unspecified but reserved
     .arm, .armeb, .aarch64, .aarch64_be => 2 * @sizeOf(usize),
+    // One pointer-sized word that points either to the DTV or the TCB itself
     else => @sizeOf(usize),
 };
 
-// Controls if the TCB should be aligned according to the TLS segment p_align
-const tls_tcb_align_size = switch (builtin.arch) {
-    .arm, .armeb, .aarch64, .aarch64_be => true,
-    else => false,
-};
-
 // Controls if the TP points to the end of the TCB instead of its beginning
 const tls_tp_points_past_tcb = switch (builtin.arch) {
     .riscv32, .riscv64, .mipsel, .powerpc64, .powerpc64le => true,
     else => false,
 };
 
-// Check if the architecture-specific parameters look correct
-comptime {
-    if (tls_tcb_align_size and tls_variant != TLSVariant.VariantI) {
-        @compileError("tls_tcb_align_size is only meaningful for variant I TLS");
-    }
-}
-
 // Some architectures add some offset to the tp and dtv addresses in order to
 // make the generated code more efficient
 
@@ -94,32 +83,36 @@ const tls_dtv_offset = switch (builtin.arch) {
 };
 
 // Per-thread storage for Zig's use
-const CustomData = packed struct {};
+const CustomData = struct {
+    dummy: usize,
+};
 
 // Dynamic Thread Vector
-const DTV = packed struct {
+const DTV = extern struct {
     entries: usize,
-    tls_block: [1]usize,
+    tls_block: [1][*]u8,
 };
 
 // Holds all the information about the process TLS image
 const TLSImage = struct {
-    data_src: []u8,
+    init_data: []const u8,
     alloc_size: usize,
+    alloc_align: usize,
     tcb_offset: usize,
     dtv_offset: usize,
     data_offset: usize,
+    data_size: usize,
     // Only used on the i386 architecture
     gdt_entry_number: usize,
 };
 
-pub var tls_image: ?TLSImage = null;
+pub var tls_image: TLSImage = undefined;
 
 pub fn setThreadPointer(addr: usize) void {
     switch (builtin.arch) {
         .i386 => {
             var user_desc = std.os.linux.user_desc{
-                .entry_number = tls_image.?.gdt_entry_number,
+                .entry_number = tls_image.gdt_entry_number,
                 .base_addr = addr,
                 .limit = 0xfffff,
                 .seg_32bit = 1,
@@ -134,7 +127,7 @@ pub fn setThreadPointer(addr: usize) void {
 
             const gdt_entry_number = user_desc.entry_number;
             // We have to keep track of our slot as it's also needed for clone()
-            tls_image.?.gdt_entry_number = gdt_entry_number;
+            tls_image.gdt_entry_number = gdt_entry_number;
             // Update the %gs selector
             asm volatile ("movl %[gs_val], %%gs"
                 :
@@ -171,7 +164,7 @@ pub fn setThreadPointer(addr: usize) void {
     }
 }
 
-pub fn initTLS() ?*elf.Phdr {
+fn initTLS() void {
     var tls_phdr: ?*elf.Phdr = null;
     var img_base: usize = 0;
 
@@ -195,124 +188,149 @@ pub fn initTLS() ?*elf.Phdr {
     // Sanity check
     assert(at_phent == @sizeOf(elf.Phdr));
 
-    // Search the TLS section
+    // Find the TLS section
     const phdrs = (@intToPtr([*]elf.Phdr, at_phdr))[0..at_phnum];
 
-    var gnu_stack: ?*elf.Phdr = null;
-
     for (phdrs) |*phdr| {
         switch (phdr.p_type) {
             elf.PT_PHDR => img_base = at_phdr - phdr.p_vaddr,
             elf.PT_TLS => tls_phdr = phdr,
-            elf.PT_GNU_STACK => gnu_stack = phdr,
-            else => continue,
+            else => {},
         }
     }
 
-    if (tls_phdr) |phdr| {
-        // If the cpu is arm-based, check if it supports the TLS register
-        if (builtin.arch == .arm and at_hwcap & std.os.linux.HWCAP_TLS == 0) {
-            // If the CPU does not support TLS via a coprocessor register,
-            // a kernel helper function can be used instead on certain linux kernels.
-            // See linux/arch/arm/include/asm/tls.h and musl/src/thread/arm/__set_thread_area.c.
-            @panic("TODO: Implement ARM fallback TLS functionality");
-        }
+    // If the cpu is ARM-based, check if it supports the TLS register
+    if (comptime builtin.arch.isARM() and at_hwcap & std.os.linux.HWCAP_TLS == 0) {
+        // If the CPU does not support TLS via a coprocessor register,
+        // a kernel helper function can be used instead on certain linux kernels.
+        // See linux/arch/arm/include/asm/tls.h and musl/src/thread/arm/__set_thread_area.c.
+        @panic("TODO: Implement ARM fallback TLS functionality");
+    }
 
-        // Offsets into the allocated TLS area
-        var tcb_offset: usize = undefined;
-        var dtv_offset: usize = undefined;
-        var data_offset: usize = undefined;
-        var thread_data_offset: usize = undefined;
-        // Compute the total size of the ABI-specific data plus our own control
-        // structures
-        const alloc_size = switch (tls_variant) {
-            .VariantI => blk: {
-                var l: usize = 0;
-                dtv_offset = l;
-                l += @sizeOf(DTV);
-                thread_data_offset = l;
-                l += @sizeOf(CustomData);
-                l = mem.alignForward(l, phdr.p_align);
-                tcb_offset = l;
-                if (tls_tcb_align_size) {
-                    l += mem.alignForward(tls_tcb_size, phdr.p_align);
-                } else {
-                    l += tls_tcb_size;
-                }
-                data_offset = l;
-                l += phdr.p_memsz;
-                break :blk l;
-            },
-            .VariantII => blk: {
-                var l: usize = 0;
-                data_offset = l;
-                l += phdr.p_memsz;
-                l = mem.alignForward(l, phdr.p_align);
-                tcb_offset = l;
-                l += tls_tcb_size;
-                thread_data_offset = l;
-                l += @sizeOf(CustomData);
-                dtv_offset = l;
-                l += @sizeOf(DTV);
-                break :blk l;
-            },
-        };
-
-        tls_image = TLSImage{
-            .data_src = @intToPtr([*]u8, phdr.p_vaddr + img_base)[0..phdr.p_filesz],
-            .alloc_size = alloc_size,
-            .tcb_offset = tcb_offset,
-            .dtv_offset = dtv_offset,
-            .data_offset = data_offset,
-            .gdt_entry_number = @bitCast(usize, @as(isize, -1)),
-        };
+    var tls_align_factor: usize = undefined;
+    var tls_data: []const u8 = undefined;
+    var tls_data_alloc_size: usize = undefined;
+    if (tls_phdr) |phdr| {
+        // The effective size in memory is represented by p_memsz, the length of
+        // the data stored in the PT_TLS segment is p_filesz and may be less
+        // than the former
+        tls_align_factor = phdr.p_align;
+        tls_data = @intToPtr([*]u8, img_base + phdr.p_vaddr)[0..phdr.p_filesz];
+        tls_data_alloc_size = phdr.p_memsz;
+    } else {
+        tls_align_factor = @alignOf(*usize);
+        tls_data = &[_]u8{};
+        tls_data_alloc_size = 0;
     }
 
-    return gnu_stack;
+    // Offsets into the allocated TLS area
+    var tcb_offset: usize = undefined;
+    var dtv_offset: usize = undefined;
+    var data_offset: usize = undefined;
+    // Compute the total size of the ABI-specific data plus our own control
+    // structures. All the offset calculated here assume a well-aligned base
+    // address.
+    const alloc_size = switch (tls_variant) {
+        .VariantI => blk: {
+            var l: usize = 0;
+            dtv_offset = l;
+            l += @sizeOf(DTV);
+            // Add some padding here so that the thread pointer (tcb_offset) is
+            // aligned to p_align and the CustomData structure can be found by
+            // simply subtracting its @sizeOf from the tp value
+            const delta = (l + @sizeOf(CustomData)) & (tls_align_factor - 1);
+            if (delta > 0)
+                l += tls_align_factor - delta;
+            l += @sizeOf(CustomData);
+            tcb_offset = l;
+            l += mem.alignForward(tls_tcb_size, tls_align_factor);
+            data_offset = l;
+            l += tls_data_alloc_size;
+            break :blk l;
+        },
+        .VariantII => blk: {
+            var l: usize = 0;
+            data_offset = l;
+            l += mem.alignForward(tls_data_alloc_size, tls_align_factor);
+            // The thread pointer is aligned to p_align
+            tcb_offset = l;
+            l += tls_tcb_size;
+            // The CustomData structure is right after the TCB with no padding
+            // in between so it can be easily found
+            l += @sizeOf(CustomData);
+            l = mem.alignForward(l, @alignOf(DTV));
+            dtv_offset = l;
+            l += @sizeOf(DTV);
+            break :blk l;
+        },
+    };
+
+    tls_image = TLSImage{
+        .init_data = tls_data,
+        .alloc_size = alloc_size,
+        .alloc_align = tls_align_factor,
+        .tcb_offset = tcb_offset,
+        .dtv_offset = dtv_offset,
+        .data_offset = data_offset,
+        .data_size = tls_data_alloc_size,
+        .gdt_entry_number = @bitCast(usize, @as(isize, -1)),
+    };
 }
 
-pub fn copyTLS(addr: usize) usize {
-    const tls_img = tls_image.?;
+inline fn alignPtrCast(comptime T: type, ptr: [*]u8) *T {
+    return @ptrCast(*T, @alignCast(@alignOf(*T), ptr));
+}
 
-    // Be paranoid, clear the area we're going to use
-    @memset(@intToPtr([*]u8, addr), 0, tls_img.alloc_size);
+/// Initializes all the fields of the static TLS area and returns the computed
+/// architecture-specific value of the thread-pointer register
+pub fn prepareTLS(area: []u8) usize {
+    // Clear the area we're going to use, just to be safe
+    mem.set(u8, area, 0);
     // Prepare the DTV
-    const dtv = @intToPtr(*DTV, addr + tls_img.dtv_offset);
+    const dtv = alignPtrCast(DTV, area.ptr + tls_image.dtv_offset);
     dtv.entries = 1;
-    dtv.tls_block[0] = addr + tls_img.data_offset + tls_dtv_offset;
-    // Set-up the TCB
-    // Force the alignment to 1 byte as the TCB may start from a non-aligned
-    // address under the variant II model
-    const tcb_ptr = @intToPtr(*align(1) usize, addr + tls_img.tcb_offset);
-    if (tls_variant == TLSVariant.VariantI) {
-        tcb_ptr.* = addr + tls_img.dtv_offset;
-    } else {
-        tcb_ptr.* = addr + tls_img.tcb_offset;
-    }
+    dtv.tls_block[0] = area.ptr + tls_dtv_offset + tls_image.data_offset;
+    // Prepare the TCB
+    const tcb_ptr = alignPtrCast([*]u8, area.ptr + tls_image.tcb_offset);
+    tcb_ptr.* = switch (tls_variant) {
+        .VariantI => area.ptr + tls_image.dtv_offset,
+        .VariantII => area.ptr + tls_image.tcb_offset,
+    };
     // Copy the data
-    @memcpy(@intToPtr([*]u8, addr + tls_img.data_offset), tls_img.data_src.ptr, tls_img.data_src.len);
+    mem.copy(u8, area[tls_image.data_offset..], tls_image.init_data);
 
     // Return the corrected (if needed) value for the tp register
-    return addr + tls_tp_offset +
-        if (tls_tp_points_past_tcb) tls_img.data_offset else tls_img.tcb_offset;
+    return @ptrToInt(area.ptr) + tls_tp_offset +
+        if (tls_tp_points_past_tcb) tls_image.data_offset else tls_image.tcb_offset;
 }
 
-var main_thread_tls_buffer: [256]u8 align(32) = undefined;
+var main_thread_tls_buffer: [256]u8 = undefined;
 
-pub fn allocateTLS(size: usize) usize {
-    // Small TLS allocation, use our local buffer
-    if (size < main_thread_tls_buffer.len) {
-        return @ptrToInt(&main_thread_tls_buffer);
-    }
+pub fn initStaticTLS() void {
+    initTLS();
+
+    const alloc_tls_area: []u8 = blk: {
+        const full_alloc_size = tls_image.alloc_size + tls_image.alloc_align - 1;
+
+        // Fast path for the common case where the TLS data is really small,
+        // avoid an allocation and use our local buffer
+        if (full_alloc_size < main_thread_tls_buffer.len)
+            break :blk main_thread_tls_buffer[0..];
+
+        break :blk os.mmap(
+            null,
+            full_alloc_size,
+            os.PROT_READ | os.PROT_WRITE,
+            os.MAP_PRIVATE | os.MAP_ANONYMOUS,
+            -1,
+            0,
+        ) catch os.abort();
+    };
 
-    const slice = os.mmap(
-        null,
-        size,
-        os.PROT_READ | os.PROT_WRITE,
-        os.MAP_PRIVATE | os.MAP_ANONYMOUS,
-        -1,
-        0,
-    ) catch @panic("out of memory");
+    // Make sure the slice is correctly aligned
+    const start = @ptrToInt(alloc_tls_area.ptr) & (tls_image.alloc_align - 1);
+    const tls_area = alloc_tls_area[start .. start + tls_image.alloc_size];
 
-    return @ptrToInt(slice.ptr);
+    const tp_value = prepareTLS(tls_area);
+    setThreadPointer(tp_value);
 }
lib/std/start.zig
@@ -152,13 +152,7 @@ fn posixCallMainAndExit() noreturn {
         const auxv = @ptrCast([*]std.elf.Auxv, @alignCast(@alignOf(usize), envp.ptr + envp_count + 1));
         std.os.linux.elf_aux_maybe = auxv;
         // Initialize the TLS area
-        const gnu_stack_phdr = std.os.linux.tls.initTLS() orelse @panic("ELF missing stack size");
-
-        if (std.os.linux.tls.tls_image) |tls_img| {
-            const tls_addr = std.os.linux.tls.allocateTLS(tls_img.alloc_size);
-            const tp = std.os.linux.tls.copyTLS(tls_addr);
-            std.os.linux.tls.setThreadPointer(tp);
-        }
+        std.os.linux.tls.initStaticTLS();
 
         // TODO This is disabled because what should we do when linking libc and this code
         // does not execute? And also it's causing a test failure in stack traces in release modes.
lib/std/thread.zig
@@ -286,11 +286,9 @@ pub const Thread = struct {
             }
             // Finally, the Thread Local Storage, if any.
             if (!Thread.use_pthreads) {
-                if (os.linux.tls.tls_image) |tls_img| {
-                    l = mem.alignForward(l, @alignOf(usize));
-                    tls_start_offset = l;
-                    l += tls_img.alloc_size;
-                }
+                l = mem.alignForward(l, os.linux.tls.tls_image.alloc_align);
+                tls_start_offset = l;
+                l += os.linux.tls.tls_image.alloc_size;
             }
             // Round the size to the page size.
             break :blk mem.alignForward(l, mem.page_size);
@@ -396,18 +394,21 @@ pub const Thread = struct {
                 else => return os.unexpectedErrno(@intCast(usize, err)),
             }
         } else if (std.Target.current.os.tag == .linux) {
-            var flags: u32 = os.CLONE_VM | os.CLONE_FS | os.CLONE_FILES | os.CLONE_SIGHAND |
-                os.CLONE_THREAD | os.CLONE_SYSVSEM | os.CLONE_PARENT_SETTID | os.CLONE_CHILD_CLEARTID |
-                os.CLONE_DETACHED;
-            var newtls: usize = undefined;
+            const flags: u32 = os.CLONE_VM | os.CLONE_FS | os.CLONE_FILES |
+                os.CLONE_SIGHAND | os.CLONE_THREAD | os.CLONE_SYSVSEM |
+                os.CLONE_PARENT_SETTID | os.CLONE_CHILD_CLEARTID |
+                os.CLONE_DETACHED | os.CLONE_SETTLS;
             // This structure is only needed when targeting i386
             var user_desc: if (std.Target.current.cpu.arch == .i386) os.linux.user_desc else void = undefined;
 
-            if (os.linux.tls.tls_image) |tls_img| {
+            const tls_area = mmap_slice[tls_start_offset..];
+            const tp_value = os.linux.tls.prepareTLS(tls_area);
+
+            const newtls = blk: {
                 if (std.Target.current.cpu.arch == .i386) {
                     user_desc = os.linux.user_desc{
-                        .entry_number = tls_img.gdt_entry_number,
-                        .base_addr = os.linux.tls.copyTLS(mmap_addr + tls_start_offset),
+                        .entry_number = os.linux.tls.tls_image.gdt_entry_number,
+                        .base_addr = tp_value,
                         .limit = 0xfffff,
                         .seg_32bit = 1,
                         .contents = 0, // Data
@@ -416,12 +417,11 @@ pub const Thread = struct {
                         .seg_not_present = 0,
                         .useable = 1,
                     };
-                    newtls = @ptrToInt(&user_desc);
+                    break :blk @ptrToInt(&user_desc);
                 } else {
-                    newtls = os.linux.tls.copyTLS(mmap_addr + tls_start_offset);
+                    break :blk tp_value;
                 }
-                flags |= os.CLONE_SETTLS;
-            }
+            };
 
             const rc = os.linux.clone(
                 MainFuncs.linuxThreadMain,
test/stack_traces.zig
@@ -282,7 +282,7 @@ pub fn addCases(cases: *tests.StackTracesContext) void {
                     \\source.zig:10:8: [address] in main (test)
                     \\    foo();
                     \\       ^
-                    \\start.zig:256:29: [address] in std.start.posixCallMainAndExit (test)
+                    \\start.zig:250:29: [address] in std.start.posixCallMainAndExit (test)
                     \\            return root.main();
                     \\                            ^
                     \\start.zig:123:5: [address] in std.start._start (test)