Commit d8ab301aa8

LemonBoy <thatlemon@gmail.com>
2019-05-04 12:02:55
std: Implement TLS support for Linux
Tested on x86_64, i386, ARM, AARCH64
1 parent 7432fb0
Changed files (5)
std/os/linux/tls.zig
@@ -0,0 +1,242 @@
+const std = @import("std");
+const mem = std.mem;
+const posix = std.posix;
+const elf = std.elf;
+const builtin = @import("builtin");
+const assert = std.debug.assert;
+
+// This file implements the two TLS variants [1] used by ELF-based systems.
+//
+// Variant I has the following layout in memory:
+// -------------------------------------------------------
+// |   DTV   |     Zig     |   DTV   | Alignment |  TLS  |
+// | storage | thread data | pointer |           | block |
+// ------------------------^------------------------------
+//                         `-- The thread pointer register points here
+//
+// In this case we allocate additional space, placed _before_ the DTV pointer,
+// that holds both the DTV and our own control structure.
+//
+// NOTE: Some systems such as powerpc64 or mips use this variant with a twist:
+// the alignment is not present and the tp and DTV addresses are offset by a
+// constant.
+//
+// Variant II, on the other hand, has the following layout in memory:
+// ---------------------------------------
+// |  TLS  | TCB |     Zig     |   DTV   |
+// | block |     | thread data | storage |
+// --------^------------------------------
+//         `-- The thread pointer register points here
+//
+// The structure of the TCB is not defined by the ABI so we reserve enough space
+// for a single pointer as some architectures such as i386 and x86_64 need a
+// pointer to the TCB block itself at the address pointed to by the tp.
+//
+// In this case the control structure and DTV are placed one after another right
+// after the TLS block data.
+//
+// At the moment the DTV is very simple since we only support static TLS, all we
+// need is a two word vector to hold the number of entries (1) and the address
+// of the first TLS block.
+//
+// [1] https://www.akkadia.org/drepper/tls.pdf
+
+const TLSVariant = enum {
+    VariantI, // DTV and Zig data come first, then the TCB (where tp points), then the TLS block
+    VariantII, // TLS block comes first, then the TCB (where tp points), then Zig data and the DTV
+};
+
+// Compile-time selection of the TLS layout used by the target architecture.
+// Architectures that are not listed here are rejected with a compile error.
+const tls_variant = switch (builtin.arch) {
+    .i386, .x86_64 => TLSVariant.VariantII,
+    .aarch64, .aarch64_be, .arm, .armeb => TLSVariant.VariantI,
+    else => @compileError("undefined tls_variant for this architecture"),
+};
+
+// Number of bytes reserved for the Thread Control Block (TCB)
+const tls_tcb_size = switch (builtin.arch) {
+    // Room for a single pointer
+    .x86_64, .i386 => @sizeOf(usize),
+    // ARM EABI mandates enough space for two pointers: the first one points to
+    // the DTV while the second one is unspecified but reserved
+    .aarch64, .aarch64_be, .arm, .armeb => @sizeOf(usize) * 2,
+    else => 0,
+};
+
+// Controls if the size of the TCB should be rounded up to the TLS segment p_align
+const tls_tcb_align_size = switch (builtin.arch) {
+    .arm, .armeb, .aarch64, .aarch64_be => true,
+    else => false,
+};
+
+// Compile-time check that the architecture-specific parameters are consistent
+comptime {
+    if (tls_tcb_align_size and tls_variant != TLSVariant.VariantI) { // only the variant I layout computation reads tls_tcb_align_size
+        @compileError("tls_tcb_align_size is only meaningful for variant I TLS");
+    }
+}
+
+// Some architectures add some offset to the tp and dtv addresses in order to
+// make the generated code more efficient; every architecture currently gets 0
+
+const tls_tp_offset = switch (builtin.arch) { // added to the thread pointer value returned by copyTLS
+    else => 0,
+};
+
+const tls_dtv_offset = switch (builtin.arch) { // added to the TLS block address stored in the DTV
+    else => 0,
+};
+
+// Per-thread storage for Zig's use; it currently has no fields
+const CustomData = packed struct {
+};
+
+// Dynamic Thread Vector
+const DTV = packed struct {
+    entries: usize, // number of TLS blocks tracked; copyTLS always stores 1
+    tls_block: [1]usize, // address of the thread's TLS block plus tls_dtv_offset
+};
+
+// Holds all the information about the process TLS image
+const TLSImage = struct {
+    data_src: []u8, // initialization image: the p_filesz bytes of the PT_TLS segment
+    alloc_size: usize, // total number of bytes needed for one thread's TLS area
+    tcb_offset: usize, // offset of the TCB from the start of the area
+    dtv_offset: usize, // offset of the DTV from the start of the area
+    data_offset: usize, // offset of the TLS block from the start of the area
+};
+
+pub var tls_image: ?TLSImage = null; // filled in by initTLS; stays null when no PT_TLS segment is found
+
+// Installs `addr` as the thread pointer of the calling thread.
+// Only x86_64 (ARCH_SET_FS through the arch_prctl syscall) and aarch64 (a
+// write to tpidr_el0) are handled; any other target fails to compile.
+pub fn setThreadPointer(addr: usize) void {
+    switch (builtin.arch) {
+        .aarch64 => {
+            asm volatile (
+                \\ msr tpidr_el0, %[addr]
+                : : [addr] "r" (addr)
+            );
+        },
+        .x86_64 => {
+            const ARCH_SET_FS = 0x1002;
+            const status = std.os.linux.syscall2(std.os.linux.SYS_arch_prctl, ARCH_SET_FS, addr);
+            // The call is expected to always succeed
+            assert(status == 0);
+        },
+        else => @compileError("Unsupported architecture"),
+    }
+}
+
+// Locates the PT_TLS program header of the running executable through the
+// auxiliary vector and, if there is one, computes the layout of a thread's TLS
+// area and stores it in `tls_image`. When no PT_TLS segment is found
+// `tls_image` is left untouched.
+// Panics if the auxiliary vector is not available or if it does not describe
+// any program header.
+pub fn initTLS() void {
+    const auxv = std.os.linux_elf_aux_maybe orelse @panic("no auxv vector available!");
+
+    // Start from zero so that an entry missing from the auxv is detected below
+    // instead of being read back as an undefined value
+    var at_phent: usize = 0;
+    var at_phnum: usize = 0;
+    var at_phdr: usize = 0;
+
+    var i: usize = 0;
+    while (auxv[i].a_type != elf.AT_NULL) : (i += 1) {
+        switch (auxv[i].a_type) {
+            elf.AT_PHENT => at_phent = auxv[i].a_un.a_val,
+            elf.AT_PHNUM => at_phnum = auxv[i].a_un.a_val,
+            elf.AT_PHDR => at_phdr = auxv[i].a_un.a_val,
+            else => continue,
+        }
+    }
+
+    if (at_phdr == 0 or at_phnum == 0) {
+        @panic("no program headers found in the auxv vector!");
+    }
+
+    // Sanity check
+    assert(at_phent == @sizeOf(elf.Phdr));
+
+    var tls_phdr: ?*elf.Phdr = null;
+    var img_base: usize = 0;
+
+    // Search for the TLS segment and work out where the image has been loaded
+    const phdrs = (@intToPtr([*]elf.Phdr, at_phdr))[0..at_phnum];
+
+    for (phdrs) |*phdr| {
+        switch (phdr.p_type) {
+            elf.PT_PHDR => img_base = at_phdr - phdr.p_vaddr,
+            elf.PT_TLS => tls_phdr = phdr,
+            else => continue,
+        }
+    }
+
+    if (tls_phdr) |phdr| {
+        // Offsets into the allocated TLS area
+        var tcb_offset: usize = undefined;
+        var dtv_offset: usize = undefined;
+        var data_offset: usize = undefined;
+        // Computed for completeness, it is not stored in the TLSImage
+        var thread_data_offset: usize = undefined;
+        // Compute the total size of the ABI-specific data plus our own control
+        // structures
+        const alloc_size = switch (tls_variant) {
+            // DTV | CustomData | padding | TCB | TLS block
+            .VariantI => blk: {
+                var l: usize = 0;
+                dtv_offset = l;
+                l += @sizeOf(DTV);
+                thread_data_offset = l;
+                l += @sizeOf(CustomData);
+                l = mem.alignForward(l, phdr.p_align);
+                tcb_offset = l;
+                if (tls_tcb_align_size) {
+                    l += mem.alignForward(tls_tcb_size, phdr.p_align);
+                } else {
+                    l += tls_tcb_size;
+                }
+                data_offset = l;
+                l += phdr.p_memsz;
+                break :blk l;
+            },
+            // TLS block | padding | TCB | CustomData | DTV
+            .VariantII => blk: {
+                var l: usize = 0;
+                data_offset = l;
+                l += phdr.p_memsz;
+                l = mem.alignForward(l, phdr.p_align);
+                tcb_offset = l;
+                l += tls_tcb_size;
+                thread_data_offset = l;
+                l += @sizeOf(CustomData);
+                dtv_offset = l;
+                l += @sizeOf(DTV);
+                break :blk l;
+            },
+        };
+
+        tls_image = TLSImage{
+            // Only the first p_filesz bytes are backed by the file
+            .data_src = @intToPtr([*]u8, phdr.p_vaddr + img_base)[0..phdr.p_filesz],
+            .alloc_size = alloc_size,
+            .tcb_offset = tcb_offset,
+            .dtv_offset = dtv_offset,
+            .data_offset = data_offset,
+        };
+    }
+}
+
+// Builds a thread's TLS area in the `tls_image.alloc_size` bytes starting at
+// `addr`: the area is zeroed, the DTV and the TCB are filled in and the TLS
+// initialization image is copied into place.
+// Returns the value to load into the thread pointer register.
+// Panics if `tls_image` has not been set.
+pub fn copyTLS(addr: usize) usize {
+    const img = tls_image orelse @panic("copyTLS called with no TLS section!");
+
+    const tcb_addr = addr + img.tcb_offset;
+    const dtv_addr = addr + img.dtv_offset;
+    const data_addr = addr + img.data_offset;
+
+    // Start from a zeroed area; the part of the TLS block that is not covered
+    // by the initialization image stays zero
+    @memset(@intToPtr([*]u8, addr), 0, img.alloc_size);
+
+    // A single-entry DTV pointing at the TLS block
+    const dtv = @intToPtr(*DTV, dtv_addr);
+    dtv.entries = 1;
+    dtv.tls_block[0] = data_addr + tls_dtv_offset;
+
+    // The first word of the TCB holds the DTV address for the variant I and
+    // the address of the TCB itself for the variant II
+    const tcb_ptr = @intToPtr(*usize, tcb_addr);
+    tcb_ptr.* = switch (tls_variant) {
+        .VariantI => dtv_addr,
+        .VariantII => tcb_addr,
+    };
+
+    // Initialized TLS data
+    @memcpy(@intToPtr([*]u8, data_addr), img.data_src.ptr, img.data_src.len);
+
+    // The tp points at the TCB, corrected by the per-architecture offset
+    return tcb_addr + tls_tp_offset;
+}
+
+// Statically reserved storage for the TLS area of the main thread.
+// NOTE(review): the 32-byte alignment is not checked against the p_align of
+// the TLS segment here - confirm that it is sufficient for the caller.
+var main_thread_tls_buffer: [64]u8 align(32) = undefined;
+
+// Returns the address of a buffer able to hold the `size` bytes of the main
+// thread TLS area. The same static buffer is returned on every call.
+// Panics if `size` exceeds the capacity of the buffer.
+pub fn allocateTLS(size: usize) usize {
+    // A request that fills the buffer exactly still fits. The check is an
+    // explicit panic so that it is enforced in every build mode.
+    if (size > main_thread_tls_buffer.len) {
+        @panic("main thread TLS buffer is too small!");
+    }
+    return @ptrToInt(&main_thread_tls_buffer);
+}
std/os/linux.zig
@@ -3,6 +3,7 @@ const assert = std.debug.assert;
 const builtin = @import("builtin");
 const maxInt = std.math.maxInt;
 const elf = std.elf;
+pub const tls = @import("linux/tls.zig");
 const vdso = @import("linux/vdso.zig");
 const dl = @import("../dynamic_library.zig");
 pub use switch (builtin.arch) {
std/special/bootstrap.zig
@@ -67,24 +67,19 @@ fn posixCallMainAndExit() noreturn {
     var envp_count: usize = 0;
     while (envp_optional[envp_count]) |_| : (envp_count += 1) {}
     const envp = @ptrCast([*][*]u8, envp_optional)[0..envp_count];
+
     if (builtin.os == builtin.Os.linux) {
-        // Scan auxiliary vector.
         const auxv = @ptrCast([*]std.elf.Auxv, envp.ptr + envp_count + 1);
         std.os.linux_elf_aux_maybe = auxv;
-        var i: usize = 0;
-        var at_phdr: usize = 0;
-        var at_phnum: usize = 0;
-        var at_phent: usize = 0;
-        while (auxv[i].a_un.a_val != 0) : (i += 1) {
-            switch (auxv[i].a_type) {
-                std.elf.AT_PAGESZ => assert(auxv[i].a_un.a_val == std.os.page_size),
-                std.elf.AT_PHDR => at_phdr = auxv[i].a_un.a_val,
-                std.elf.AT_PHNUM => at_phnum = auxv[i].a_un.a_val,
-                std.elf.AT_PHENT => at_phent = auxv[i].a_un.a_val,
-                else => {},
+
+        std.os.linux.tls.initTLS();
+        if (!builtin.single_threaded) {
+            if (std.os.linux.tls.tls_image) |tls_img| {
+                const tls_addr = std.os.linux.tls.allocateTLS(tls_img.alloc_size);
+                const tp = std.os.linux.tls.copyTLS(tls_addr);
+                std.os.linux.tls.setThreadPointer(tp);
             }
         }
-        if (!builtin.single_threaded) linuxInitializeThreadLocalStorage(at_phdr, at_phnum, at_phent);
     }
 
     std.os.posix.exit(callMainWithArgs(argc, argv, envp));
@@ -140,50 +135,3 @@ inline fn callMain() u8 {
 
 const main_thread_tls_align = 32;
 var main_thread_tls_bytes: [64]u8 align(main_thread_tls_align) = [1]u8{0} ** 64;
-
-fn linuxInitializeThreadLocalStorage(at_phdr: usize, at_phnum: usize, at_phent: usize) void {
-    var phdr_addr = at_phdr;
-    var n = at_phnum;
-    var base: usize = 0;
-    while (n != 0) : ({
-        n -= 1;
-        phdr_addr += at_phent;
-    }) {
-        const phdr = @intToPtr(*std.elf.Phdr, phdr_addr);
-        // TODO look for PT_DYNAMIC when we have https://github.com/ziglang/zig/issues/1917
-        switch (phdr.p_type) {
-            std.elf.PT_PHDR => base = at_phdr - phdr.p_vaddr,
-            std.elf.PT_TLS => std.os.linux_tls_phdr = phdr,
-            else => continue,
-        }
-    }
-    const tls_phdr = std.os.linux_tls_phdr orelse return;
-    std.os.linux_tls_img_src = @intToPtr([*]const u8, base + tls_phdr.p_vaddr);
-    const end_addr = @ptrToInt(&main_thread_tls_bytes) + tls_phdr.p_memsz;
-    const max_end_addr = @ptrToInt(&main_thread_tls_bytes) + main_thread_tls_bytes.len;
-    assert(max_end_addr >= end_addr + @sizeOf(usize)); // not enough preallocated Thread Local Storage
-    assert(main_thread_tls_align >= tls_phdr.p_align); // preallocated Thread Local Storage not aligned enough
-    @memcpy(&main_thread_tls_bytes, std.os.linux_tls_img_src, tls_phdr.p_filesz);
-    const end_ptr = @intToPtr(*usize, end_addr);
-    end_ptr.* = end_addr;
-    linuxSetThreadArea(end_addr);
-}
-
-fn linuxSetThreadArea(addr: usize) void {
-    switch (builtin.arch) {
-        builtin.Arch.x86_64 => {
-            const ARCH_SET_FS = 0x1002;
-            const rc = std.os.linux.syscall2(std.os.linux.SYS_arch_prctl, ARCH_SET_FS, addr);
-            // acrh_prctl is documented to never fail
-            assert(rc == 0);
-        },
-        builtin.Arch.aarch64 => {
-            asm volatile (
-                \\        msr tpidr_el0,x0
-                \\        mov w0,#0
-                \\        ret
-            );
-        },
-        else => @compileError("Unsupported architecture"),
-    }
-}
std/os.zig
@@ -3126,9 +3126,6 @@ pub const SpawnThreadError = error{
     Unexpected,
 };
 
-pub var linux_tls_phdr: ?*std.elf.Phdr = null;
-pub var linux_tls_img_src: [*]const u8 = undefined; // defined if linux_tls_phdr is
-
 /// caller must call wait on the returned thread
 /// fn startFn(@typeOf(context)) T
 /// where T is u8, noreturn, void, or !void
@@ -3238,12 +3235,10 @@ pub fn spawnThread(context: var, comptime startFn: var) SpawnThreadError!*Thread
         }
         // Finally, the Thread Local Storage, if any.
         if (!Thread.use_pthreads) {
-            if (linux_tls_phdr) |tls_phdr| {
-                l = mem.alignForward(l, tls_phdr.p_align);
+            if (linux.tls.tls_image) |tls_img| {
+                l = mem.alignForward(l, @alignOf(usize));
                 tls_start_offset = l;
-                l += tls_phdr.p_memsz;
-                // the fs register address
-                l += @sizeOf(usize);
+                l += tls_img.alloc_size;
             }
         }
         break :blk l;
@@ -3284,10 +3279,8 @@ pub fn spawnThread(context: var, comptime startFn: var) SpawnThreadError!*Thread
             posix.CLONE_THREAD | posix.CLONE_SYSVSEM | posix.CLONE_PARENT_SETTID | posix.CLONE_CHILD_CLEARTID |
             posix.CLONE_DETACHED;
         var newtls: usize = undefined;
-        if (linux_tls_phdr) |tls_phdr| {
-            @memcpy(@intToPtr([*]u8, mmap_addr + tls_start_offset), linux_tls_img_src, tls_phdr.p_filesz);
-            newtls = mmap_addr + mmap_len - @sizeOf(usize);
-            @intToPtr(*usize, newtls).* = newtls;
+        if (linux.tls.tls_image) |tls_img| {
+            newtls = linux.tls.copyTLS(mmap_addr + tls_start_offset);
             flags |= posix.CLONE_SETTLS;
         }
         const rc = posix.clone(MainFuncs.linuxThreadMain, mmap_addr + stack_end_offset, flags, arg, &thread_ptr.data.handle, newtls, &thread_ptr.data.handle);
CMakeLists.txt
@@ -611,6 +611,7 @@ set(ZIG_STD_FILES
     "os/linux.zig"
     "os/linux/arm64.zig"
     "os/linux/errno.zig"
+    "os/linux/tls.zig"
     "os/linux/vdso.zig"
     "os/linux/x86_64.zig"
     "os/netbsd.zig"