Commit b1775ca168

Andrew Kelley <andrew@ziglang.org>
2019-02-06 19:48:04
thread local storage working for linux x86_64
1 parent 8c6fa98
src/all_types.hpp
@@ -544,12 +544,7 @@ struct AstNodeDefer {
 };
 
 struct AstNodeVariableDeclaration {
-    VisibMod visib_mod;
     Buf *symbol;
-    bool is_const;
-    bool is_comptime;
-    bool is_export;
-    bool is_extern;
     // one or both of type and expr will be non null
     AstNode *type;
     AstNode *expr;
@@ -559,6 +554,13 @@ struct AstNodeVariableDeclaration {
     AstNode *align_expr;
     // populated if the "section(S)" is present
     AstNode *section_expr;
+    Token *threadlocal_tok;
+
+    VisibMod visib_mod;
+    bool is_const;
+    bool is_comptime;
+    bool is_export;
+    bool is_extern;
 };
 
 struct AstNodeTestDecl {
@@ -1873,6 +1875,7 @@ struct ZigVar {
     bool shadowable;
     bool src_is_const;
     bool gen_is_const;
+    bool is_thread_local;
 };
 
 struct ErrorTableEntry {
src/analyze.cpp
@@ -28,28 +28,10 @@ static Error ATTRIBUTE_MUST_USE resolve_enum_zero_bits(CodeGen *g, ZigType *enum
 static Error ATTRIBUTE_MUST_USE resolve_union_zero_bits(CodeGen *g, ZigType *union_type);
 static void analyze_fn_body(CodeGen *g, ZigFn *fn_table_entry);
 
-ErrorMsg *add_node_error(CodeGen *g, AstNode *node, Buf *msg) {
-    if (node->owner->c_import_node != nullptr) {
-        // if this happens, then translate_c generated code that
-        // failed semantic analysis, which isn't supposed to happen
-        ErrorMsg *err = add_node_error(g, node->owner->c_import_node,
-            buf_sprintf("compiler bug: @cImport generated invalid zig code"));
-
-        add_error_note(g, err, node, msg);
-
-        g->errors.append(err);
-        return err;
-    }
-
-    ErrorMsg *err = err_msg_create_with_line(node->owner->path, node->line, node->column,
-            node->owner->source_code, node->owner->line_offsets, msg);
-
-    g->errors.append(err);
-    return err;
-}
-
-ErrorMsg *add_error_note(CodeGen *g, ErrorMsg *parent_msg, AstNode *node, Buf *msg) {
-    if (node->owner->c_import_node != nullptr) {
+static ErrorMsg *add_error_note_token(CodeGen *g, ErrorMsg *parent_msg, ImportTableEntry *owner, Token *token,
+        Buf *msg)
+{
+    if (owner->c_import_node != nullptr) {
         // if this happens, then translate_c generated code that
         // failed semantic analysis, which isn't supposed to happen
 
@@ -64,13 +46,46 @@ ErrorMsg *add_error_note(CodeGen *g, ErrorMsg *parent_msg, AstNode *node, Buf *m
         return note;
     }
 
-    ErrorMsg *err = err_msg_create_with_line(node->owner->path, node->line, node->column,
-            node->owner->source_code, node->owner->line_offsets, msg);
+    ErrorMsg *err = err_msg_create_with_line(owner->path, token->start_line, token->start_column,
+            owner->source_code, owner->line_offsets, msg);
 
     err_msg_add_note(parent_msg, err);
     return err;
 }
 
+ErrorMsg *add_token_error(CodeGen *g, ImportTableEntry *owner, Token *token, Buf *msg) {
+    if (owner->c_import_node != nullptr) {
+        // If this happens, then translate_c generated code that failed semantic analysis,
+        // which isn't supposed to happen. Report it on the @cImport node, with msg as a note.
+        ErrorMsg *err = add_node_error(g, owner->c_import_node,
+            buf_sprintf("compiler bug: @cImport generated invalid zig code"));
+
+        add_error_note_token(g, err, owner, token, msg);
+
+        // add_node_error already appended err to g->errors; appending again would duplicate it.
+        return err;
+    }
+    ErrorMsg *err = err_msg_create_with_line(owner->path, token->start_line, token->start_column,
+            owner->source_code, owner->line_offsets, msg);
+
+    g->errors.append(err);
+    return err;
+}
+
+ErrorMsg *add_node_error(CodeGen *g, AstNode *node, Buf *msg) {
+    Token node_pos; // stand-in token: only the position fields below are filled in
+    node_pos.start_column = node->column;
+    node_pos.start_line = node->line;
+    return add_token_error(g, node->owner, &node_pos, msg);
+}
+
+ErrorMsg *add_error_note(CodeGen *g, ErrorMsg *parent_msg, AstNode *node, Buf *msg) {
+    Token node_pos; // stand-in token: only the position fields below are filled in
+    node_pos.start_column = node->column;
+    node_pos.start_line = node->line;
+    return add_error_note_token(g, parent_msg, node->owner, &node_pos, msg);
+}
+
 ZigType *new_type_table_entry(ZigTypeId id) {
     ZigType *entry = allocate<ZigType>(1);
     entry->id = id;
@@ -3668,6 +3683,7 @@ static void resolve_decl_var(CodeGen *g, TldVar *tld_var) {
     bool is_const = var_decl->is_const;
     bool is_extern = var_decl->is_extern;
     bool is_export = var_decl->is_export;
+    bool is_thread_local = var_decl->threadlocal_tok != nullptr;
 
     ZigType *explicit_type = nullptr;
     if (var_decl->type) {
@@ -3727,6 +3743,7 @@ static void resolve_decl_var(CodeGen *g, TldVar *tld_var) {
     tld_var->var = add_variable(g, source_node, tld_var->base.parent_scope, var_decl->symbol,
             is_const, init_val, &tld_var->base, type);
     tld_var->var->linkage = linkage;
+    tld_var->var->is_thread_local = is_thread_local;
 
     if (implicit_type != nullptr && type_is_invalid(implicit_type)) {
         tld_var->var->var_type = g->builtin_types.entry_invalid;
@@ -3747,6 +3764,10 @@ static void resolve_decl_var(CodeGen *g, TldVar *tld_var) {
         }
     }
 
+    if (is_thread_local && is_const) {
+        add_node_error(g, source_node, buf_sprintf("threadlocal variable cannot be constant"));
+    }
+
     g->global_vars.append(tld_var);
 }
 
src/analyze.hpp
@@ -12,6 +12,7 @@
 
 void semantic_analyze(CodeGen *g);
 ErrorMsg *add_node_error(CodeGen *g, AstNode *node, Buf *msg);
+ErrorMsg *add_token_error(CodeGen *g, ImportTableEntry *owner, Token *token, Buf *msg);
 ErrorMsg *add_error_note(CodeGen *g, ErrorMsg *parent_msg, AstNode *node, Buf *msg);
 ZigType *new_type_table_entry(ZigTypeId id);
 ZigType *get_pointer_to_type(CodeGen *g, ZigType *child_type, bool is_const);
src/ast_render.cpp
@@ -132,6 +132,10 @@ static const char *const_or_var_string(bool is_const) {
     return is_const ? "const" : "var";
 }
 
+static const char *thread_local_string(Token *tok) { // text emitted before const/var when rendering
+    return (tok != nullptr) ? "threadlocal " : "";
+}
+
 const char *container_string(ContainerKind kind) {
     switch (kind) {
         case ContainerKindEnum: return "enum";
@@ -554,8 +558,9 @@ static void render_node_extra(AstRender *ar, AstNode *node, bool grouped) {
             {
                 const char *pub_str = visib_mod_string(node->data.variable_declaration.visib_mod);
                 const char *extern_str = extern_string(node->data.variable_declaration.is_extern);
+                const char *thread_local_str = thread_local_string(node->data.variable_declaration.threadlocal_tok);
                 const char *const_or_var = const_or_var_string(node->data.variable_declaration.is_const);
-                fprintf(ar->f, "%s%s%s ", pub_str, extern_str, const_or_var);
+                fprintf(ar->f, "%s%s%s%s ", pub_str, extern_str, thread_local_str, const_or_var);
                 print_symbol(ar, node->data.variable_declaration.symbol);
 
                 if (node->data.variable_declaration.type) {
src/codegen.cpp
@@ -6445,6 +6445,9 @@ static void do_code_gen(CodeGen *g) {
                 maybe_import_dll(g, global_value, GlobalLinkageIdStrong);
                 LLVMSetAlignment(global_value, var->align_bytes);
                 LLVMSetGlobalConstant(global_value, var->gen_is_const);
+                if (var->is_thread_local && !g->is_single_threaded) {
+                    LLVMSetThreadLocalMode(global_value, LLVMGeneralDynamicTLSModel);
+                }
             }
         } else {
             bool exported = (var->linkage == VarLinkageExport);
@@ -6470,6 +6473,9 @@ static void do_code_gen(CodeGen *g) {
             }
 
             LLVMSetGlobalConstant(global_value, var->gen_is_const);
+            if (var->is_thread_local && !g->is_single_threaded) {
+                LLVMSetThreadLocalMode(global_value, LLVMGeneralDynamicTLSModel);
+            }
         }
 
         var->value_ref = global_value;
@@ -7520,6 +7526,7 @@ static Error define_builtin_compile_vars(CodeGen *g) {
     g->compile_var_package = new_package(buf_ptr(this_dir), builtin_zig_basename);
     g->root_package->package_table.put(buf_create_from_str("builtin"), g->compile_var_package);
     g->std_package->package_table.put(buf_create_from_str("builtin"), g->compile_var_package);
+    g->std_package->package_table.put(buf_create_from_str("std"), g->std_package);
     g->compile_var_import = add_source_file(g, g->compile_var_package, builtin_zig_path, contents);
     scan_import(g, g->compile_var_import);
 
src/ir.cpp
@@ -5204,6 +5204,10 @@ static IrInstruction *ir_gen_var_decl(IrBuilder *irb, Scope *scope, AstNode *nod
         add_node_error(irb->codegen, variable_declaration->section_expr,
             buf_sprintf("cannot set section of local variable '%s'", buf_ptr(variable_declaration->symbol)));
     }
+    if (variable_declaration->threadlocal_tok != nullptr) {
+        add_token_error(irb->codegen, node->owner, variable_declaration->threadlocal_tok,
+            buf_sprintf("function-local variable '%s' cannot be threadlocal", buf_ptr(variable_declaration->symbol)));
+    }
 
     // Temporarily set the name of the IrExecutable to the VariableDeclaration
     // so that the struct or enum from the init expression inherits the name.
src/parser.cpp
@@ -844,12 +844,17 @@ static AstNode *ast_parse_fn_proto(ParseContext *pc) {
 
 // VarDecl <- (KEYWORD_const / KEYWORD_var) IDENTIFIER (COLON TypeExpr)? ByteAlign? LinkSection? (EQUAL Expr)? SEMICOLON
 static AstNode *ast_parse_var_decl(ParseContext *pc) {
-    Token *first = eat_token_if(pc, TokenIdKeywordConst);
-    if (first == nullptr)
-        first = eat_token_if(pc, TokenIdKeywordVar);
-    if (first == nullptr)
-        return nullptr;
-
+    Token *thread_local_kw = eat_token_if(pc, TokenIdKeywordThreadLocal);
+    Token *mut_kw = eat_token_if(pc, TokenIdKeywordConst);
+    if (mut_kw == nullptr)
+        mut_kw = eat_token_if(pc, TokenIdKeywordVar);
+    if (mut_kw == nullptr) {
+        if (thread_local_kw == nullptr) {
+            return nullptr;
+        } else {
+            ast_invalid_token_error(pc, peek_token(pc));
+        }
+    }
     Token *identifier = expect_token(pc, TokenIdSymbol);
     AstNode *type_expr = nullptr;
     if (eat_token_if(pc, TokenIdColon) != nullptr)
@@ -863,8 +868,9 @@ static AstNode *ast_parse_var_decl(ParseContext *pc) {
 
     expect_token(pc, TokenIdSemicolon);
 
-    AstNode *res = ast_create_node(pc, NodeTypeVariableDeclaration, first);
-    res->data.variable_declaration.is_const = first->id == TokenIdKeywordConst;
+    AstNode *res = ast_create_node(pc, NodeTypeVariableDeclaration, mut_kw);
+    res->data.variable_declaration.threadlocal_tok = thread_local_kw;
+    res->data.variable_declaration.is_const = mut_kw->id == TokenIdKeywordConst;
     res->data.variable_declaration.symbol = token_buf(identifier);
     res->data.variable_declaration.type = type_expr;
     res->data.variable_declaration.align_expr = align_expr;
src/tokenizer.cpp
@@ -146,6 +146,7 @@ static const struct ZigKeyword zig_keywords[] = {
     {"suspend", TokenIdKeywordSuspend},
     {"switch", TokenIdKeywordSwitch},
     {"test", TokenIdKeywordTest},
+    {"threadlocal", TokenIdKeywordThreadLocal},
     {"true", TokenIdKeywordTrue},
     {"try", TokenIdKeywordTry},
     {"undefined", TokenIdKeywordUndefined},
@@ -1586,6 +1587,7 @@ const char * token_name(TokenId id) {
         case TokenIdKeywordStruct: return "struct";
         case TokenIdKeywordSwitch: return "switch";
         case TokenIdKeywordTest: return "test";
+        case TokenIdKeywordThreadLocal: return "threadlocal";
         case TokenIdKeywordTrue: return "true";
         case TokenIdKeywordTry: return "try";
         case TokenIdKeywordUndefined: return "undefined";
src/tokenizer.hpp
@@ -88,6 +88,7 @@ enum TokenId {
     TokenIdKeywordSuspend,
     TokenIdKeywordSwitch,
     TokenIdKeywordTest,
+    TokenIdKeywordThreadLocal,
     TokenIdKeywordTrue,
     TokenIdKeywordTry,
     TokenIdKeywordUndefined,
std/debug/index.zig
@@ -37,7 +37,6 @@ const Module = struct {
 var stderr_file: os.File = undefined;
 var stderr_file_out_stream: os.File.OutStream = undefined;
 
-/// TODO multithreaded awareness
 var stderr_stream: ?*io.OutStream(os.File.WriteError) = null;
 var stderr_mutex = std.Mutex.init();
 pub fn warn(comptime fmt: []const u8, args: ...) void {
std/os/index.zig
@@ -8,6 +8,9 @@ const is_posix = switch (builtin.os) {
 };
 const os = @This();
 
+// See the comment in startup.zig for why this does not use the `std` global above.
+const startup = @import("std").startup;
+
 test "std.os" {
     _ = @import("child_process.zig");
     _ = @import("darwin.zig");
@@ -667,14 +670,11 @@ fn posixExecveErrnoToErr(err: usize) PosixExecveError {
     }
 }
 
-pub var linux_elf_aux_maybe: ?[*]std.elf.Auxv = null;
-pub var posix_environ_raw: [][*]u8 = undefined;
-
 /// See std.elf for the constants.
 pub fn linuxGetAuxVal(index: usize) usize {
     if (builtin.link_libc) {
         return usize(std.c.getauxval(index));
-    } else if (linux_elf_aux_maybe) |auxv| {
+    } else if (startup.linux_elf_aux_maybe) |auxv| {
         var i: usize = 0;
         while (auxv[i].a_type != std.elf.AT_NULL) : (i += 1) {
             if (auxv[i].a_type == index)
@@ -692,12 +692,7 @@ pub fn getBaseAddress() usize {
                 return base;
             }
             const phdr = linuxGetAuxVal(std.elf.AT_PHDR);
-            const ElfHeader = switch (@sizeOf(usize)) {
-                4 => std.elf.Elf32_Ehdr,
-                8 => std.elf.Elf64_Ehdr,
-                else => @compileError("Unsupported architecture"),
-            };
-            return phdr - @sizeOf(ElfHeader);
+            return phdr - @sizeOf(std.elf.Ehdr);
         },
         builtin.Os.macosx, builtin.Os.freebsd => return @ptrToInt(&std.c._mh_execute_header),
         builtin.Os.windows => return @ptrToInt(windows.GetModuleHandleW(null)),
@@ -739,7 +734,7 @@ pub fn getEnvMap(allocator: *Allocator) !BufMap {
             try result.setMove(key, value);
         }
     } else {
-        for (posix_environ_raw) |ptr| {
+        for (startup.posix_environ_raw) |ptr| {
             var line_i: usize = 0;
             while (ptr[line_i] != 0 and ptr[line_i] != '=') : (line_i += 1) {}
             const key = ptr[0..line_i];
@@ -761,7 +756,7 @@ test "os.getEnvMap" {
 
 /// TODO make this go through libc when we have it
 pub fn getEnvPosix(key: []const u8) ?[]const u8 {
-    for (posix_environ_raw) |ptr| {
+    for (startup.posix_environ_raw) |ptr| {
         var line_i: usize = 0;
         while (ptr[line_i] != 0 and ptr[line_i] != '=') : (line_i += 1) {}
         const this_key = ptr[0..line_i];
@@ -1942,14 +1937,14 @@ pub const ArgIteratorPosix = struct {
     pub fn init() ArgIteratorPosix {
         return ArgIteratorPosix{
             .index = 0,
-            .count = raw.len,
+            .count = startup.posix_argv_raw.len,
         };
     }
 
     pub fn next(self: *ArgIteratorPosix) ?[]const u8 {
         if (self.index == self.count) return null;
 
-        const s = raw[self.index];
+        const s = startup.posix_argv_raw[self.index];
         self.index += 1;
         return cstr.toSlice(s);
     }
@@ -1960,10 +1955,6 @@ pub const ArgIteratorPosix = struct {
         self.index += 1;
         return true;
     }
-
-    /// This is marked as public but actually it's only meant to be used
-    /// internally by zig's startup code.
-    pub var raw: [][*]u8 = undefined;
 };
 
 pub const ArgIteratorWindows = struct {
@@ -2908,14 +2899,15 @@ pub const Thread = struct {
     pub const Data = if (use_pthreads)
         struct {
             handle: Thread.Handle,
-            stack_addr: usize,
-            stack_len: usize,
+            mmap_addr: usize,
+            mmap_len: usize,
         }
     else switch (builtin.os) {
         builtin.Os.linux => struct {
             handle: Thread.Handle,
-            stack_addr: usize,
-            stack_len: usize,
+            mmap_addr: usize,
+            mmap_len: usize,
+            tls_end_addr: usize,
         },
         builtin.Os.windows => struct {
             handle: Thread.Handle,
@@ -2955,7 +2947,7 @@ pub const Thread = struct {
                 posix.EDEADLK => unreachable,
                 else => unreachable,
             }
-            assert(posix.munmap(self.data.stack_addr, self.data.stack_len) == 0);
+            assert(posix.munmap(self.data.mmap_addr, self.data.mmap_len) == 0);
         } else switch (builtin.os) {
             builtin.Os.linux => {
                 while (true) {
@@ -2969,7 +2961,7 @@ pub const Thread = struct {
                         else => unreachable,
                     }
                 }
-                assert(posix.munmap(self.data.stack_addr, self.data.stack_len) == 0);
+                assert(posix.munmap(self.data.mmap_addr, self.data.mmap_len) == 0);
             },
             builtin.Os.windows => {
                 assert(windows.WaitForSingleObject(self.data.handle, windows.INFINITE) == windows.WAIT_OBJECT_0);
@@ -3097,42 +3089,56 @@ pub fn spawnThread(context: var, comptime startFn: var) SpawnThreadError!*Thread
 
     const MAP_GROWSDOWN = if (builtin.os == builtin.Os.linux) linux.MAP_GROWSDOWN else 0;
 
-    const mmap_len = default_stack_size;
-    const stack_addr = posix.mmap(null, mmap_len, posix.PROT_READ | posix.PROT_WRITE, posix.MAP_PRIVATE | posix.MAP_ANONYMOUS | MAP_GROWSDOWN, -1, 0);
-    if (stack_addr == posix.MAP_FAILED) return error.OutOfMemory;
-    errdefer assert(posix.munmap(stack_addr, mmap_len) == 0);
+    var stack_end_offset: usize = undefined;
+    var thread_start_offset: usize = undefined;
+    var context_start_offset: usize = undefined;
+    var tls_start_offset: usize = undefined;
+    const mmap_len = blk: {
+        // First in memory will be the stack, which grows downwards.
+        var l: usize = mem.alignForward(default_stack_size, os.page_size);
+        stack_end_offset = l;
+        // Above the stack, so that it can be in the same mmap call, put the Thread object.
+        l = mem.alignForward(l, @alignOf(Thread));
+        thread_start_offset = l;
+        l += @sizeOf(Thread);
+        // Next, the Context object.
+        if (@sizeOf(Context) != 0) {
+            l = mem.alignForward(l, @alignOf(Context));
+            context_start_offset = l;
+            l += @sizeOf(Context);
+        }
+        // Finally, the Thread Local Storage, if any.
+        if (!Thread.use_pthreads) {
+            if (startup.linux_tls_phdr) |tls_phdr| {
+                l = mem.alignForward(l, tls_phdr.p_align);
+                tls_start_offset = l;
+                l += tls_phdr.p_memsz;
+            }
+        }
+        break :blk l;
+    };
+    const mmap_addr = posix.mmap(null, mmap_len, posix.PROT_READ | posix.PROT_WRITE, posix.MAP_PRIVATE | posix.MAP_ANONYMOUS | MAP_GROWSDOWN, -1, 0);
+    if (mmap_addr == posix.MAP_FAILED) return error.OutOfMemory;
+    errdefer assert(posix.munmap(mmap_addr, mmap_len) == 0);
+
+    const thread_ptr = @alignCast(@alignOf(Thread), @intToPtr(*Thread, mmap_addr + thread_start_offset));
+    thread_ptr.data.mmap_addr = mmap_addr;
+    thread_ptr.data.mmap_len = mmap_len;
 
-    var stack_end: usize = stack_addr + mmap_len;
     var arg: usize = undefined;
     if (@sizeOf(Context) != 0) {
-        stack_end -= @sizeOf(Context);
-        stack_end -= stack_end % @alignOf(Context);
-        assert(stack_end >= stack_addr);
-        const context_ptr = @alignCast(@alignOf(Context), @intToPtr(*Context, stack_end));
+        arg = mmap_addr + context_start_offset;
+        const context_ptr = @alignCast(@alignOf(Context), @intToPtr(*Context, arg));
         context_ptr.* = context;
-        arg = stack_end;
     }
 
-    stack_end -= @sizeOf(Thread);
-    stack_end -= stack_end % @alignOf(Thread);
-    assert(stack_end >= stack_addr);
-    const thread_ptr = @alignCast(@alignOf(Thread), @intToPtr(*Thread, stack_end));
-
-    thread_ptr.data.stack_addr = stack_addr;
-    thread_ptr.data.stack_len = mmap_len;
-
-    if (builtin.os == builtin.Os.windows) {
-        // use windows API directly
-        @compileError("TODO support spawnThread for Windows");
-    } else if (Thread.use_pthreads) {
+    if (Thread.use_pthreads) {
         // use pthreads
         var attr: c.pthread_attr_t = undefined;
         if (c.pthread_attr_init(&attr) != 0) return SpawnThreadError.SystemResources;
         defer assert(c.pthread_attr_destroy(&attr) == 0);
 
-        // align to page
-        stack_end -= stack_end % os.page_size;
-        assert(c.pthread_attr_setstack(&attr, @intToPtr(*c_void, stack_addr), stack_end - stack_addr) == 0);
+        assert(c.pthread_attr_setstack(&attr, @intToPtr(*c_void, mmap_addr), stack_end_offset) == 0);
 
         const err = c.pthread_create(&thread_ptr.data.handle, &attr, MainFuncs.posixThreadMain, @intToPtr(*c_void, arg));
         switch (err) {
@@ -3143,10 +3149,17 @@ pub fn spawnThread(context: var, comptime startFn: var) SpawnThreadError!*Thread
             else => return unexpectedErrorPosix(@intCast(usize, err)),
         }
     } else if (builtin.os == builtin.Os.linux) {
-        // use linux API directly.  TODO use posix.CLONE_SETTLS and initialize thread local storage correctly
-        const flags = posix.CLONE_VM | posix.CLONE_FS | posix.CLONE_FILES | posix.CLONE_SIGHAND | posix.CLONE_THREAD | posix.CLONE_SYSVSEM | posix.CLONE_PARENT_SETTID | posix.CLONE_CHILD_CLEARTID | posix.CLONE_DETACHED;
-        const newtls: usize = 0;
-        const rc = posix.clone(MainFuncs.linuxThreadMain, stack_end, flags, arg, &thread_ptr.data.handle, newtls, &thread_ptr.data.handle);
+        var flags: u32 = posix.CLONE_VM | posix.CLONE_FS | posix.CLONE_FILES | posix.CLONE_SIGHAND |
+            posix.CLONE_THREAD | posix.CLONE_SYSVSEM | posix.CLONE_PARENT_SETTID | posix.CLONE_CHILD_CLEARTID |
+            posix.CLONE_DETACHED;
+        var newtls: usize = undefined;
+        if (startup.linux_tls_phdr) |tls_phdr| {
+            @memcpy(@intToPtr([*]u8, mmap_addr + tls_start_offset), startup.linux_tls_img_src, tls_phdr.p_filesz);
+            thread_ptr.data.tls_end_addr = mmap_addr + mmap_len;
+            newtls = @ptrToInt(&thread_ptr.data.tls_end_addr);
+            flags |= posix.CLONE_SETTLS;
+        }
+        const rc = posix.clone(MainFuncs.linuxThreadMain, mmap_addr + stack_end_offset, flags, arg, &thread_ptr.data.handle, newtls, &thread_ptr.data.handle);
         const err = posix.getErrno(rc);
         switch (err) {
             0 => return thread_ptr,
std/os/startup.zig
@@ -0,0 +1,26 @@
+// This file contains global variables that are initialized on startup from
+// std/special/bootstrap.zig. There are a few things to be aware of here.
+//
+// First, when building an object or library, and no entry point is defined
+// (such as pub fn main), std/special/bootstrap.zig is not included in the
+// compilation. And so these global variables will remain set to the values
+// you see here.
+//
+// Second, when using `zig test` to test the standard library, note that
+// `zig test` is self-hosted. This means that it uses std/special/bootstrap.zig
+// and an @import("std") from the install directory, which is distinct from
+// the standard library files that we are directly testing with `zig test`.
+// This means that these global variables would not get set. So the workaround
+// here is that references to these globals from the standard library must
+// use `@import("std").startup` rather than
+// `@import("path/to/std/index.zig").startup` (and rather than the file path of
+// this file directly). We also put "std" as a reference to itself in the
+// standard library package so that this can work.
+
+const std = @import("../index.zig");
+
+pub var linux_tls_phdr: ?*std.elf.Phdr = null; // PT_TLS program header, if the bootstrap code found one
+pub var linux_tls_img_src: [*]const u8 = undefined; // TLS initialization image; defined when linux_tls_phdr is non-null
+pub var linux_elf_aux_maybe: ?[*]std.elf.Auxv = null; // ELF auxiliary vector, set by the bootstrap code on Linux
+pub var posix_environ_raw: [][*]u8 = undefined; // envp entries, set by the bootstrap code before main is called
+pub var posix_argv_raw: [][*]u8 = undefined; // argv entries, set by the bootstrap code before main is called
std/os/test.zig
@@ -105,3 +105,19 @@ test "AtomicFile" {
 
     try os.deleteFile(test_out_file);
 }
+
+test "thread local storage" {
+    if (builtin.single_threaded) return error.SkipZigTest;
+    const thread1 = try std.os.spawnThread({}, testTls);
+    const thread2 = try std.os.spawnThread({}, testTls);
+    testTls({});
+    thread1.wait();
+    thread2.wait();
+}
+
+threadlocal var x: i32 = 1234;
+fn testTls(context: void) void {
+    if (x != 1234) @panic("bad start value");
+    x += 1;
+    if (x != 1235) @panic("bad end value");
+}
std/special/bootstrap.zig
@@ -4,6 +4,7 @@
 const root = @import("@root");
 const std = @import("std");
 const builtin = @import("builtin");
+const assert = std.debug.assert;
 
 var argc_ptr: [*]usize = undefined;
 
@@ -61,9 +62,23 @@ fn posixCallMainAndExit() noreturn {
     while (envp_optional[envp_count]) |_| : (envp_count += 1) {}
     const envp = @ptrCast([*][*]u8, envp_optional)[0..envp_count];
     if (builtin.os == builtin.Os.linux) {
-        const auxv = @ptrCast([*]usize, envp.ptr + envp_count + 1);
-        std.os.linux_elf_aux_maybe = @ptrCast([*]std.elf.Auxv, auxv);
-        std.debug.assert(std.os.linuxGetAuxVal(std.elf.AT_PAGESZ) == std.os.page_size);
+        // Scan auxiliary vector. It is terminated by an entry of type AT_NULL; values of 0 are legitimate.
+        const auxv = @ptrCast([*]std.elf.Auxv, envp.ptr + envp_count + 1);
+        std.startup.linux_elf_aux_maybe = auxv;
+        var i: usize = 0;
+        var at_phdr: usize = 0;
+        var at_phnum: usize = 0;
+        var at_phent: usize = 0;
+        while (auxv[i].a_type != std.elf.AT_NULL) : (i += 1) {
+            switch (auxv[i].a_type) {
+                std.elf.AT_PAGESZ => assert(auxv[i].a_un.a_val == std.os.page_size),
+                std.elf.AT_PHDR => at_phdr = auxv[i].a_un.a_val,
+                std.elf.AT_PHNUM => at_phnum = auxv[i].a_un.a_val,
+                std.elf.AT_PHENT => at_phent = auxv[i].a_un.a_val,
+                else => {},
+            }
+        }
+        if (!builtin.single_threaded) linuxInitializeThreadLocalStorage(at_phdr, at_phnum, at_phent);
     }
 
     std.os.posix.exit(callMainWithArgs(argc, argv, envp));
@@ -72,8 +87,8 @@ fn posixCallMainAndExit() noreturn {
 // This is marked inline because for some reason LLVM in release mode fails to inline it,
 // and we want fewer call frames in stack traces.
 inline fn callMainWithArgs(argc: usize, argv: [*][*]u8, envp: [][*]u8) u8 {
-    std.os.ArgIteratorPosix.raw = argv[0..argc];
-    std.os.posix_environ_raw = envp;
+    std.startup.posix_argv_raw = argv[0..argc];
+    std.startup.posix_environ_raw = envp;
     return callMain();
 }
 
@@ -116,3 +131,41 @@ inline fn callMain() u8 {
         else => @compileError("expected return type of main to be 'u8', 'noreturn', 'void', or '!void'"),
     }
 }
+
+var tls_end_addr: usize = undefined;
+const main_thread_tls_align = 32;
+var main_thread_tls_bytes: [64]u8 align(main_thread_tls_align) = [1]u8{0} ** 64;
+
+fn linuxInitializeThreadLocalStorage(at_phdr: usize, at_phnum: usize, at_phent: usize) void {
+    var phdr_addr = at_phdr;
+    var n = at_phnum;
+    var base: usize = 0; // difference between runtime addresses and the p_vaddr values in the headers
+    while (n != 0) : ({n -= 1; phdr_addr += at_phent;}) { // visit at_phnum headers, at_phent bytes apart
+        const phdr = @intToPtr(*std.elf.Phdr, phdr_addr);
+        // TODO look for PT_DYNAMIC when we have https://github.com/ziglang/zig/issues/1917
+        switch (phdr.p_type) {
+            std.elf.PT_PHDR => base = at_phdr - phdr.p_vaddr,
+            std.elf.PT_TLS => std.startup.linux_tls_phdr = phdr,
+            else => continue,
+        }
+    }
+    const tls_phdr = std.startup.linux_tls_phdr orelse return; // no PT_TLS header: nothing to set up
+    std.startup.linux_tls_img_src = @intToPtr([*]const u8, base + tls_phdr.p_vaddr);
+    assert(main_thread_tls_bytes.len >= tls_phdr.p_memsz); // not enough preallocated Thread Local Storage
+    assert(main_thread_tls_align >= tls_phdr.p_align); // preallocated Thread Local Storage not aligned enough
+    @memcpy(&main_thread_tls_bytes, std.startup.linux_tls_img_src, tls_phdr.p_filesz); // rest of the buffer is already zero
+    tls_end_addr = @ptrToInt(&main_thread_tls_bytes) + tls_phdr.p_memsz;
+    linuxSetThreadArea(@ptrToInt(&tls_end_addr)); // the thread area is a word holding the end address of the TLS block
+}
+
+fn linuxSetThreadArea(addr: usize) void {
+    switch (builtin.arch) {
+        builtin.Arch.x86_64 => {
+            const ARCH_SET_FS = 0x1002; // arch_prctl code that sets the FS segment base
+            const rc = std.os.linux.syscall2(std.os.linux.SYS_arch_prctl, ARCH_SET_FS, addr);
+            // arch_prctl(ARCH_SET_FS) is not expected to fail here, so a nonzero result is treated as a bug
+            assert(rc == 0);
+        },
+        else => @compileError("Unsupported architecture"),
+    }
+}
std/heap.zig
@@ -106,9 +106,7 @@ pub const DirectAllocator = struct {
                 };
                 const ptr = os.windows.HeapAlloc(heap_handle, 0, amt) orelse return error.OutOfMemory;
                 const root_addr = @ptrToInt(ptr);
-                const rem = @rem(root_addr, alignment);
-                const march_forward_bytes = if (rem == 0) 0 else (alignment - rem);
-                const adjusted_addr = root_addr + march_forward_bytes;
+                const adjusted_addr = mem.alignForward(root_addr, alignment);
                 const record_addr = adjusted_addr + n;
                 @intToPtr(*align(1) usize, record_addr).* = root_addr;
                 return @intToPtr([*]u8, adjusted_addr)[0..n];
@@ -126,8 +124,7 @@ pub const DirectAllocator = struct {
                     const base_addr = @ptrToInt(old_mem.ptr);
                     const old_addr_end = base_addr + old_mem.len;
                     const new_addr_end = base_addr + new_size;
-                    const rem = @rem(new_addr_end, os.page_size);
-                    const new_addr_end_rounded = new_addr_end + if (rem == 0) 0 else (os.page_size - rem);
+                    const new_addr_end_rounded = mem.alignForward(new_addr_end, os.page_size);
                     if (old_addr_end > new_addr_end_rounded) {
                         _ = os.posix.munmap(new_addr_end_rounded, old_addr_end - new_addr_end_rounded);
                     }
std/index.zig
@@ -33,8 +33,8 @@ pub const io = @import("io.zig");
 pub const json = @import("json.zig");
 pub const macho = @import("macho.zig");
 pub const math = @import("math/index.zig");
-pub const meta = @import("meta/index.zig");
 pub const mem = @import("mem.zig");
+pub const meta = @import("meta/index.zig");
 pub const net = @import("net.zig");
 pub const os = @import("os/index.zig");
 pub const pdb = @import("pdb.zig");
@@ -45,6 +45,7 @@ pub const unicode = @import("unicode.zig");
 pub const zig = @import("zig/index.zig");
 
 pub const lazyInit = @import("lazy_init.zig").lazyInit;
+pub const startup = @import("os/startup.zig");
 
 test "std" {
     // run tests from these
std/mem.zig
@@ -1366,3 +1366,23 @@ test "std.mem.subArrayPtr" {
     sub2[1] = 'X';
     debug.assert(std.mem.eql(u8, a2, "abcXef"));
 }
+
+/// Round an address up to the nearest aligned address. `alignment` must be a power of two.
+pub fn alignForward(addr: usize, alignment: usize) usize {
+    return (addr + alignment - 1) & ~(alignment - 1); // mask arithmetic: only correct for power-of-two alignment
+}
+
+test "std.mem.alignForward" {
+    debug.assertOrPanic(alignForward(1, 1) == 1);
+    debug.assertOrPanic(alignForward(2, 1) == 2);
+    debug.assertOrPanic(alignForward(1, 2) == 2);
+    debug.assertOrPanic(alignForward(2, 2) == 2);
+    debug.assertOrPanic(alignForward(3, 2) == 4);
+    debug.assertOrPanic(alignForward(4, 2) == 4);
+    debug.assertOrPanic(alignForward(7, 8) == 8);
+    debug.assertOrPanic(alignForward(8, 8) == 8);
+    debug.assertOrPanic(alignForward(9, 8) == 16);
+    debug.assertOrPanic(alignForward(15, 8) == 16);
+    debug.assertOrPanic(alignForward(16, 8) == 16);
+    debug.assertOrPanic(alignForward(17, 8) == 24);
+}
test/stage1/behavior/misc.zig
@@ -685,3 +685,11 @@ test "fn call returning scalar optional in equality expression" {
 fn getNull() ?*i32 {
     return null;
 }
+
+test "thread local variable" {
+    const S = struct {
+        threadlocal var t: i32 = 1234;
+    };
+    S.t += 1;
+    assertOrPanic(S.t == 1235);
+}
test/compile_errors.zig
@@ -1,6 +1,25 @@
 const tests = @import("tests.zig");
 
 pub fn addCases(cases: *tests.CompileErrorContext) void {
+    cases.add(
+        "threadlocal qualifier on const",
+        \\threadlocal const x: i32 = 1234;
+        \\export fn entry() i32 {
+        \\    return x;
+        \\}
+    ,
+        ".tmp_source.zig:1:13: error: threadlocal variable cannot be constant",
+    );
+
+    cases.add(
+        "threadlocal qualifier on local variable",
+        \\export fn entry() void {
+        \\    threadlocal var x: i32 = 1234;
+        \\}
+    ,
+        ".tmp_source.zig:2:5: error: function-local variable 'x' cannot be threadlocal",
+    );
+
     cases.add(
         "@bitCast same size but bit count mismatch",
         \\export fn entry(byte: u8) void {
CMakeLists.txt
@@ -587,6 +587,7 @@ set(ZIG_STD_FILES
     "os/linux/vdso.zig"
     "os/linux/x86_64.zig"
     "os/path.zig"
+    "os/startup.zig"
     "os/time.zig"
     "os/uefi.zig"
     "os/windows/advapi32.zig"