master
  1//! This file implements the two TLS variants [1] used by ELF-based systems. Note that, in reality,
  2//! Variant I has two sub-variants.
  3//!
  4//! It is important to understand that the term TCB (Thread Control Block) is overloaded here.
  5//! Official ABI documentation uses it simply to mean the ABI TCB, i.e. a small area of ABI-defined
  6//! data, usually one or two words (see the `AbiTcb` type below). People will also often use TCB to
  7//! refer to the libc TCB, which can be any size and contain anything. (One could even omit it!) We
  8//! refer to the latter as the Zig TCB; see the `ZigTcb` type below.
  9//!
 10//! [1] https://www.akkadia.org/drepper/tls.pdf
 11
 12const std = @import("std");
 13const mem = std.mem;
 14const elf = std.elf;
 15const math = std.math;
 16const assert = std.debug.assert;
 17const native_arch = @import("builtin").cpu.arch;
 18const linux = std.os.linux;
 19const page_size_min = std.heap.page_size_min;
 20
/// Represents an ELF TLS variant.
///
/// In all variants, the TP and the TLS blocks must be aligned to the `p_align` value in the
/// `PT_TLS` ELF program header. Everything else has natural alignment.
///
/// The location of the DTV does not actually matter. For simplicity, we put it in the TLS area, but
/// there is no actual ABI requirement that it reside there.
///
/// See Drepper's TLS document (linked in the file header) for the full background on the variants.
const Variant = enum {
    /// The original Variant I:
    ///
    /// ----------------------------------------
    /// | DTV | Zig TCB | ABI TCB | TLS Blocks |
    /// ----------------^-----------------------
    ///                 `-- The TP register points here.
    ///
    /// The layout in this variant necessitates separate alignment of both the TP and the TLS
    /// blocks.
    ///
    /// The first word in the ABI TCB points to the DTV. For some architectures, there may be a
    /// second word with an unspecified meaning.
    I_original,
    /// The modified Variant I:
    ///
    /// ---------------------------------------------------
    /// | DTV | Zig TCB | ABI TCB | [Offset] | TLS Blocks |
    /// -------------------------------------^-------------
    ///                                      `-- The TP register points here.
    ///
    /// The offset (which can be zero) is applied to the TP only; there is never a physical gap
    /// between the ABI TCB and the TLS blocks. This implies that we only need to align the TP.
    ///
    /// The first (and only) word in the ABI TCB points to the DTV.
    I_modified,
    /// Variant II:
    ///
    /// ----------------------------------------
    /// | TLS Blocks | ABI TCB | Zig TCB | DTV |
    /// -------------^--------------------------
    ///              `-- The TP register points here.
    ///
    /// The first (and only) word in the ABI TCB points to the ABI TCB itself.
    II,
};
 64
/// The TLS variant mandated by the target architecture's ABI.
const current_variant: Variant = switch (native_arch) {
    .aarch64,
    .aarch64_be,
    .alpha,
    .arc,
    .arceb,
    .arm,
    .armeb,
    .csky,
    .hppa,
    .microblaze,
    .microblazeel,
    .sh,
    .sheb,
    .thumb,
    .thumbeb,
    => .I_original,
    .loongarch32,
    .loongarch64,
    .m68k,
    .mips,
    .mipsel,
    .mips64,
    .mips64el,
    .or1k,
    .powerpc,
    .powerpcle,
    .powerpc64,
    .powerpc64le,
    .riscv32,
    .riscv64,
    => .I_modified,
    .hexagon,
    .s390x,
    .sparc,
    .sparc64,
    .x86,
    .x86_64,
    => .II,
    else => @compileError("undefined TLS variant for this architecture"),
};
106
/// The Offset value for the modified Variant I (zero everywhere else). Some ABIs bias the TP so
/// that more of the TLS area is reachable with small signed immediate displacements; `prepareArea`
/// adds this value (with wrapping semantics) to the computed TP.
const current_tp_offset = switch (native_arch) {
    .m68k,
    .mips,
    .mipsel,
    .mips64,
    .mips64el,
    .powerpc,
    .powerpcle,
    .powerpc64,
    .powerpc64le,
    => 0x7000,
    else => 0,
};
121
/// Bias added to the TLS block pointer stored in the DTV (see `prepareArea`). Usually only used by
/// the modified Variant I.
const current_dtv_offset = switch (native_arch) {
    .m68k,
    .mips,
    .mipsel,
    .mips64,
    .mips64el,
    .powerpc,
    .powerpcle,
    .powerpc64,
    .powerpc64le,
    => 0x8000,
    .riscv32,
    .riscv64,
    => 0x800,
    else => 0,
};
139
/// Per-thread storage for the ELF TLS ABI.
///
/// For Variant I the layout (one or two words) is dictated by the architecture's ABI; the first
/// word always holds the (possibly biased) DTV pointer. For Variant II it holds a self-pointer.
const AbiTcb = switch (current_variant) {
    .I_original, .I_modified => switch (native_arch) {
        .aarch64,
        .aarch64_be,
        .alpha,
        .arm,
        .armeb,
        .hppa,
        .microblaze,
        .microblazeel,
        .sh,
        .sheb,
        .thumb,
        .thumbeb,
        => extern struct {
            /// This is offset by `current_dtv_offset`.
            dtv: usize,
            /// Second ABI TCB word; its meaning is unspecified (see `Variant.I_original`).
            _reserved: ?*anyopaque,
        },
        else => extern struct {
            /// This is offset by `current_dtv_offset`.
            dtv: usize,
        },
    },
    .II => extern struct {
        /// This is self-referential.
        self: *AbiTcb,
    },
};
170
/// Per-thread storage for Zig's use. Currently unused, but the layout math in `computeAreaDesc`
/// reserves space for it at a fixed offset from the TP so it can be located later.
const ZigTcb = struct {
    // Placeholder field so the reserved space has a nonzero, well-defined size.
    dummy: usize,
};
175
/// Dynamic Thread Vector as specified in the ELF TLS ABI. Ordinarily, there is a block pointer per
/// dynamically-loaded module, but since we only support static TLS, we only need one block pointer.
const Dtv = extern struct {
    /// Number of entries; always 1 here since only static TLS is supported.
    len: usize = 1,
    /// Pointer to the single static TLS block, pre-biased by `current_dtv_offset` (see
    /// `prepareArea`).
    tls_block: [*]u8,
};
182
/// Describes a process's TLS area. The area encompasses the DTV, both TCBs, and the TLS block, with
/// the exact layout of these being dependent primarily on `current_variant`.
const AreaDesc = struct {
    /// Total size in bytes of the TLS area.
    size: usize,
    /// Required alignment of the area's base address: the `p_align` of the `PT_TLS` segment, or
    /// `@alignOf(usize)` when no such segment exists.
    alignment: usize,

    dtv: struct {
        /// Offset into the TLS area.
        offset: usize,
    },

    abi_tcb: struct {
        /// Offset into the TLS area.
        offset: usize,
    },

    block: struct {
        /// The initial data to be copied into the TLS block. Note that this may be smaller than
        /// `size`, in which case any remaining data in the TLS block is simply left uninitialized.
        init: []const u8,
        /// Offset into the TLS area.
        offset: usize,
        /// This is the effective size of the TLS block, which may be greater than `init.len`.
        size: usize,
    },

    /// Only used on the 32-bit x86 architecture (not x86_64, nor x32). Initialized to -1 (as a
    /// `usize`); `setThreadPointer` overwrites it with the GDT slot chosen by `set_thread_area`.
    gdt_entry_number: usize,
};
212
/// Filled in by `computeAreaDesc` during `initStatic`; read by `prepareArea` and (on x86, for the
/// GDT slot) by `setThreadPointer`. Undefined until then.
pub var area_desc: AreaDesc = undefined;
214
/// Installs `addr` (as computed by `prepareArea`) as the thread pointer, using whatever mechanism
/// — dedicated register write, syscall, or both — the target architecture requires.
///
/// Asserts that any syscall involved succeeds.
pub fn setThreadPointer(addr: usize) void {
    @setRuntimeSafety(false);
    @disableInstrumentation();

    switch (native_arch) {
        .x86 => {
            // x86 has no directly writable TP register. Instead, install a GDT segment with
            // `addr` as its base covering the full 4 GiB address space, then load %gs with the
            // resulting selector.
            var user_desc: linux.user_desc = .{
                .entry_number = area_desc.gdt_entry_number,
                .base_addr = addr,
                .limit = 0xfffff,
                .flags = .{
                    .seg_32bit = 1,
                    .contents = 0, // Data
                    .read_exec_only = 0,
                    .limit_in_pages = 1,
                    .seg_not_present = 0,
                    .useable = 1,
                },
            };
            const rc = @call(.always_inline, linux.syscall1, .{ .set_thread_area, @intFromPtr(&user_desc) });
            assert(rc == 0);

            // The kernel writes the allocated slot back into `user_desc.entry_number`.
            const gdt_entry_number = user_desc.entry_number;
            // We have to keep track of our slot as it's also needed for clone()
            area_desc.gdt_entry_number = gdt_entry_number;
            // Update the %gs selector: (index << 3 | 3) encodes the GDT index with RPL 3.
            asm volatile ("movl %[gs_val], %%gs"
                :
                : [gs_val] "r" (gdt_entry_number << 3 | 3),
            );
        },
        .x86_64 => {
            // The FS segment base is set directly via arch_prctl(ARCH_SET_FS).
            const rc = @call(.always_inline, linux.syscall2, .{ .arch_prctl, linux.ARCH.SET_FS, addr });
            assert(rc == 0);
        },
        .aarch64, .aarch64_be => {
            // TPIDR_EL0 is the userspace-writable TP register.
            asm volatile (
                \\ msr tpidr_el0, %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .alpha => {
            // NOTE(review): `lda a0, %[addr]` with an "r"-constrained operand looks suspect —
            // `lda` expects a displacement(base) form, not a bare register. Confirm this
            // assembles and moves `addr` into a0 before the wruniq PALcall.
            asm volatile (
                \\ lda a0, %[addr]
                \\ wruniq
                :
                : [addr] "r" (addr),
            );
        },
        .arc, .arceb => {
            // We apparently need to both set r25 (TP) *and* inform the kernel...
            asm volatile (
                \\ mov r25, %[addr]
                :
                : [addr] "r" (addr),
            );
            const rc = @call(.always_inline, linux.syscall1, .{ .arc_settls, addr });
            assert(rc == 0);
        },
        .arm, .armeb, .thumb, .thumbeb => {
            // The kernel maintains the TP; it is set via the set_tls syscall.
            const rc = @call(.always_inline, linux.syscall1, .{ .set_tls, addr });
            assert(rc == 0);
        },
        .m68k => {
            // NOTE(review): unlike the other syscall paths in this function, this call is not
            // forced inline with `@call(.always_inline, ...)` — confirm a regular call is safe
            // here given that TLS is not yet set up.
            const rc = linux.syscall1(.set_thread_area, addr);
            assert(rc == 0);
        },
        .hexagon => {
            // UGP serves as the TP.
            asm volatile (
                \\ ugp = %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .hppa => {
            // NOTE(review): the operand appears in the *input* list but uses the output
            // constraint syntax `"={r26}"` — confirm this is intended and assembles; an input
            // pinned to r26 would normally be written `"{r26}"`.
            asm volatile (
                \\ ble 0xe0(%%sr2, %%r0)
                :
                : [addr] "={r26}" (addr),
                : .{ .r29 = true });
        },
        .loongarch32, .loongarch64 => {
            // $tp is directly writable from userspace.
            asm volatile (
                \\ move $tp, %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .riscv32, .riscv64 => {
            // tp is directly writable from userspace.
            asm volatile (
                \\ mv tp, %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .csky, .mips, .mipsel, .mips64, .mips64el => {
            // The kernel maintains the TP; it is set via the set_thread_area syscall.
            const rc = @call(.always_inline, linux.syscall1, .{ .set_thread_area, addr });
            assert(rc == 0);
        },
        .microblaze, .microblazeel => {
            // r21 holds the TP; `ori rd, ra, 0` is a register move.
            asm volatile (
                \\ ori r21, %[addr], 0
                :
                : [addr] "r" (addr),
            );
        },
        .or1k => {
            // r10 holds the TP; `l.ori rd, ra, 0` is a register move.
            asm volatile (
                \\ l.ori r10, %[addr], 0
                :
                : [addr] "r" (addr),
            );
        },
        .powerpc, .powerpcle => {
            // r2 holds the TP on 32-bit PowerPC.
            asm volatile (
                \\ mr 2, %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .powerpc64, .powerpc64le => {
            // r13 holds the TP on 64-bit PowerPC.
            asm volatile (
                \\ mr 13, %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .s390x => {
            // The TP is split across access registers: a0 gets the high 32 bits, a1 the low 32.
            asm volatile (
                \\ lgr %%r0, %[addr]
                \\ sar %%a1, %%r0
                \\ srlg %%r0, %%r0, 32
                \\ sar %%a0, %%r0
                :
                : [addr] "r" (addr),
                : .{ .r0 = true });
        },
        .sh, .sheb => {
            // GBR serves as the TP.
            asm volatile (
                \\ ldc gbr, %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .sparc, .sparc64 => {
            // g7 holds the TP.
            asm volatile (
                \\ mov %[addr], %%g7
                :
                : [addr] "r" (addr),
            );
        },
        else => @compileError("Unsupported architecture"),
    }
}
370
/// Scans `phdrs` for the `PT_TLS` segment and fills in the global `area_desc` with the size,
/// alignment, and internal offsets of the TLS area according to `current_variant`.
///
/// If the executable has no `PT_TLS` segment, an empty TLS block is described (the TCBs and DTV
/// are still laid out).
fn computeAreaDesc(phdrs: []elf.Phdr) void {
    @setRuntimeSafety(false);
    @disableInstrumentation();

    var tls_phdr: ?*elf.Phdr = null;
    var img_base: usize = 0;

    for (phdrs) |*phdr| {
        switch (phdr.p_type) {
            // PT_PHDR gives the headers' link-time vaddr, from which we derive the load bias.
            elf.PT_PHDR => img_base = @intFromPtr(phdrs.ptr) - phdr.p_vaddr,
            elf.PT_TLS => tls_phdr = phdr,
            else => {},
        }
    }

    var align_factor: usize = undefined;
    var block_init: []const u8 = undefined;
    var block_size: usize = undefined;

    if (tls_phdr) |phdr| {
        align_factor = phdr.p_align;

        // The effective size in memory is represented by `p_memsz`; the length of the data stored
        // in the `PT_TLS` segment is `p_filesz` and may be less than the former.
        block_init = @as([*]u8, @ptrFromInt(img_base + phdr.p_vaddr))[0..phdr.p_filesz];
        block_size = phdr.p_memsz;
    } else {
        align_factor = @alignOf(usize);

        block_init = &[_]u8{};
        block_size = 0;
    }

    // Offsets into the allocated TLS area.
    var dtv_offset: usize = undefined;
    var abi_tcb_offset: usize = undefined;
    var block_offset: usize = undefined;

    // Compute the total size of the ABI-specific data plus our own `ZigTcb` structure. All the
    // offsets calculated here assume a well-aligned base address. The mask arithmetic below also
    // assumes `align_factor` is a power of two, as ELF requires of `p_align`.
    const area_size = switch (current_variant) {
        .I_original => blk: {
            var l: usize = 0;
            dtv_offset = l;
            l += @sizeOf(Dtv);
            // Add some padding here so that the TP (`abi_tcb_offset`) is aligned to `align_factor`
            // and the `ZigTcb` structure can be found by simply subtracting `@sizeOf(ZigTcb)` from
            // the TP.
            const delta = (l + @sizeOf(ZigTcb)) & (align_factor - 1);
            if (delta > 0)
                l += align_factor - delta;
            l += @sizeOf(ZigTcb);
            abi_tcb_offset = l;
            // The TLS blocks must also be aligned in this variant, so round the ABI TCB's size up.
            l += alignForward(@sizeOf(AbiTcb), align_factor);
            block_offset = l;
            l += block_size;
            break :blk l;
        },
        .I_modified => blk: {
            var l: usize = 0;
            dtv_offset = l;
            l += @sizeOf(Dtv);
            // In this variant, the TLS blocks must begin immediately after the end of the ABI TCB,
            // with the TP pointing to the beginning of the TLS blocks. Add padding so that the TP
            // (`abi_tcb_offset`) is aligned to `align_factor` and the `ZigTcb` structure can be
            // found by subtracting `@sizeOf(AbiTcb) + @sizeOf(ZigTcb)` from the TP.
            const delta = (l + @sizeOf(ZigTcb) + @sizeOf(AbiTcb)) & (align_factor - 1);
            if (delta > 0)
                l += align_factor - delta;
            l += @sizeOf(ZigTcb);
            abi_tcb_offset = l;
            l += @sizeOf(AbiTcb);
            block_offset = l;
            l += block_size;
            break :blk l;
        },
        .II => blk: {
            var l: usize = 0;
            block_offset = l;
            l += alignForward(block_size, align_factor);
            // The TP is aligned to `align_factor`.
            abi_tcb_offset = l;
            l += @sizeOf(AbiTcb);
            // The `ZigTcb` structure is right after the `AbiTcb` with no padding in between so it
            // can be easily found.
            l += @sizeOf(ZigTcb);
            // It doesn't really matter where we put the DTV, so give it natural alignment.
            l = alignForward(l, @alignOf(Dtv));
            dtv_offset = l;
            l += @sizeOf(Dtv);
            break :blk l;
        },
    };

    area_desc = .{
        .size = area_size,
        .alignment = align_factor,

        .dtv = .{
            .offset = dtv_offset,
        },

        .abi_tcb = .{
            .offset = abi_tcb_offset,
        },

        .block = .{
            .init = block_init,
            .offset = block_offset,
            .size = block_size,
        },

        // -1 sentinel: per set_thread_area(2), passing -1 asks the kernel to pick a free GDT
        // slot; `setThreadPointer` records the chosen slot here on x86.
        .gdt_entry_number = @as(usize, @bitCast(@as(isize, -1))),
    };
}
486
/// Rounds `addr` up to the nearest multiple of `alignment`, which must be a power of two.
/// Inline because TLS is not set up yet.
inline fn alignForward(addr: usize, alignment: usize) usize {
    const mask = alignment - 1;
    return (addr + mask) & ~mask;
}

/// Rounds `addr` down to the nearest multiple of `alignment`, which must be a power of two.
/// Inline because TLS is not set up yet.
inline fn alignBackward(addr: usize, alignment: usize) usize {
    const mask = alignment - 1;
    return addr & ~mask;
}
496
/// Reinterprets `ptr` as a `*T`, asserting that it satisfies `T`'s alignment.
/// Inline because TLS is not set up yet.
inline fn alignPtrCast(comptime T: type, ptr: [*]u8) *T {
    const aligned: [*]align(@alignOf(T)) u8 = @alignCast(ptr);
    return @ptrCast(aligned);
}
501
/// Initializes all the fields of the static TLS area and returns the computed architecture-specific
/// value of the TP register.
pub fn prepareArea(area: []u8) usize {
    @setRuntimeSafety(false);
    @disableInstrumentation();

    // Zero the whole area up front; the memcpy below only covers `init.len` bytes of the block.
    @memset(area, 0);

    const base = area.ptr;

    // Set up the ABI TCB: Variant I stores a (biased) pointer to the DTV, Variant II stores a
    // pointer to the TCB itself.
    const abi_tcb = alignPtrCast(AbiTcb, base + area_desc.abi_tcb.offset);
    switch (current_variant) {
        .I_original, .I_modified => abi_tcb.dtv = @intFromPtr(base + area_desc.dtv.offset),
        .II => abi_tcb.self = abi_tcb,
    }

    // Set up the DTV for our single static TLS block, pre-biasing the block pointer.
    const dtv = alignPtrCast(Dtv, base + area_desc.dtv.offset);
    dtv.len = 1;
    dtv.tls_block = base + current_dtv_offset + area_desc.block.offset;

    // Copy in the PT_TLS initialization image.
    @memcpy(area[area_desc.block.offset..][0..area_desc.block.init.len], area_desc.block.init);

    // Return the corrected value (if needed) for the TP register. Overflow here is not a problem;
    // the pointer arithmetic involving the TP is done with wrapping semantics.
    return @intFromPtr(base) +% switch (current_variant) {
        .I_original, .II => area_desc.abi_tcb.offset,
        .I_modified => area_desc.block.offset +% current_tp_offset,
    };
}
533
/// Static buffer for the main thread's TLS area, used by `initStatic` when the computed area is
/// small enough to avoid an `mmap` allocation.
///
/// The main motivation for the size chosen here is that this is how much ends up being requested for
/// the thread-local variables of the `std.crypto.random` implementation. I'm not sure why it ends up
/// being so much; the struct itself is only 64 bytes. I think it has to do with being page-aligned
/// and LLVM or LLD is not smart enough to lay out the TLS data in a space-conserving way. Anyway, I
/// think it's fine because it's less than 3 pages of memory, and putting it in the ELF like this is
/// equivalent to moving the `mmap` call below into the kernel, avoiding syscall overhead.
var main_thread_area_buffer: [0x2100]u8 align(page_size_min) = undefined;
541
/// Computes the layout of the static TLS area, allocates the area, initializes all of its fields,
/// and assigns the architecture-specific value to the TP register.
pub fn initStatic(phdrs: []elf.Phdr) void {
    @setRuntimeSafety(false);
    @disableInstrumentation();

    computeAreaDesc(phdrs);

    const area = area: {
        // Fast path: when the TLS data is small and needs no stricter-than-page alignment, use
        // the static buffer and skip the mmap entirely.
        if (area_desc.alignment <= page_size_min and area_desc.size <= main_thread_area_buffer.len) {
            break :area main_thread_area_buffer[0..area_desc.size];
        }

        // Over-allocate so the start can be slid forward to the required alignment.
        const map_addr = mmap_tls(area_desc.size + area_desc.alignment - 1);
        if (@call(.always_inline, linux.errno, .{map_addr}) != .SUCCESS) @trap();

        const mapping: [*]align(page_size_min) u8 = @ptrFromInt(map_addr);
        const aligned_addr = alignForward(map_addr, area_desc.alignment);
        break :area mapping[aligned_addr - map_addr ..][0..area_desc.size];
    };

    setThreadPointer(prepareArea(area));
}
571
/// Maps an anonymous, private, read-write region of `length` bytes for the TLS area. Returns the
/// raw syscall result; the caller is responsible for checking it with `linux.errno`.
inline fn mmap_tls(length: usize) usize {
    const prot = linux.PROT.READ | linux.PROT.WRITE;
    const flags: linux.MAP = .{ .TYPE = .PRIVATE, .ANONYMOUS = true };
    const no_fd: usize = @bitCast(@as(isize, -1));

    if (@hasField(linux.SYS, "mmap2")) {
        return @call(.always_inline, linux.syscall6, .{
            .mmap2,
            0,
            length,
            prot,
            @as(u32, @bitCast(flags)),
            no_fd,
            0,
        });
    } else if (native_arch == .s390x) {
        // The s390x mmap() syscall existed before Linux supported syscalls with 5+ parameters, so
        // it takes a single pointer to an array of arguments instead.
        const args = [_]usize{
            0,
            length,
            prot,
            @as(u32, @bitCast(flags)),
            no_fd,
            0,
        };
        return @call(.always_inline, linux.syscall1, .{ .mmap, @intFromPtr(&args) });
    } else {
        return @call(.always_inline, linux.syscall6, .{
            .mmap,
            0,
            length,
            prot,
            @as(u32, @bitCast(flags)),
            no_fd,
            0,
        });
    }
}