master
1//! This file implements the two TLS variants [1] used by ELF-based systems. Note that, in reality,
2//! Variant I has two sub-variants.
3//!
4//! It is important to understand that the term TCB (Thread Control Block) is overloaded here.
5//! Official ABI documentation uses it simply to mean the ABI TCB, i.e. a small area of ABI-defined
6//! data, usually one or two words (see the `AbiTcb` type below). People will also often use TCB to
7//! refer to the libc TCB, which can be any size and contain anything. (One could even omit it!) We
8//! refer to the latter as the Zig TCB; see the `ZigTcb` type below.
9//!
10//! [1] https://www.akkadia.org/drepper/tls.pdf
11
12const std = @import("std");
13const mem = std.mem;
14const elf = std.elf;
15const math = std.math;
16const assert = std.debug.assert;
17const native_arch = @import("builtin").cpu.arch;
18const linux = std.os.linux;
19const page_size_min = std.heap.page_size_min;
20
/// Represents an ELF TLS variant.
///
/// In all variants, the TP and the TLS blocks must be aligned to the `p_align` value in the
/// `PT_TLS` ELF program header. Everything else has natural alignment.
///
/// The location of the DTV does not actually matter. For simplicity, we put it in the TLS area, but
/// there is no actual ABI requirement that it reside there.
///
/// The diagrams below show the TLS area from low addresses (left) to high addresses (right).
const Variant = enum {
    /// The original Variant I:
    ///
    /// ----------------------------------------
    /// | DTV | Zig TCB | ABI TCB | TLS Blocks |
    /// ----------------^-----------------------
    ///                 `-- The TP register points here.
    ///
    /// The layout in this variant necessitates separate alignment of both the TP and the TLS
    /// blocks.
    ///
    /// The first word in the ABI TCB points to the DTV. For some architectures, there may be a
    /// second word with an unspecified meaning.
    I_original,
    /// The modified Variant I:
    ///
    /// ---------------------------------------------------
    /// | DTV | Zig TCB | ABI TCB | [Offset] | TLS Blocks |
    /// -------------------------------------^-------------
    ///                                      `-- The TP register points here.
    ///
    /// The offset (which can be zero) is applied to the TP only; there is never a physical gap
    /// between the ABI TCB and the TLS blocks. This implies that we only need to align the TP.
    ///
    /// The first (and only) word in the ABI TCB points to the DTV.
    I_modified,
    /// Variant II:
    ///
    /// ----------------------------------------
    /// | TLS Blocks | ABI TCB | Zig TCB | DTV |
    /// -------------^--------------------------
    ///              `-- The TP register points here.
    ///
    /// The first (and only) word in the ABI TCB points to the ABI TCB itself.
    II,
};
64
/// The TLS variant used by the architecture we are compiling for. Architectures that are not
/// listed here are rejected at compile time.
const current_variant: Variant = switch (native_arch) {
    // TLS blocks below the TP.
    .x86,
    .x86_64,
    .sparc,
    .sparc64,
    .s390x,
    .hexagon,
    => .II,

    // TLS blocks directly above the ABI TCB, with the TP pointing at (or past) the blocks.
    .riscv32,
    .riscv64,
    .powerpc,
    .powerpcle,
    .powerpc64,
    .powerpc64le,
    .or1k,
    .mips,
    .mipsel,
    .mips64,
    .mips64el,
    .m68k,
    .loongarch32,
    .loongarch64,
    => .I_modified,

    // TP pointing at the ABI TCB, TLS blocks above it.
    .thumb,
    .thumbeb,
    .sh,
    .sheb,
    .microblaze,
    .microblazeel,
    .hppa,
    .csky,
    .arm,
    .armeb,
    .arc,
    .arceb,
    .alpha,
    .aarch64,
    .aarch64_be,
    => .I_original,

    else => @compileError("undefined TLS variant for this architecture"),
};
106
/// The `[Offset]` that the modified Variant I adds to the TP for the current architecture. This is
/// zero for every architecture that is not listed explicitly.
const current_tp_offset = switch (native_arch) {
    .powerpc64le,
    .powerpc64,
    .powerpcle,
    .powerpc,
    .mips64el,
    .mips64,
    .mipsel,
    .mips,
    .m68k,
    => 0x7000,
    else => 0,
};
121
/// The bias added to the TLS block pointer stored in the DTV for the current architecture. It is
/// usually only non-zero for architectures using the modified Variant I.
const current_dtv_offset = switch (native_arch) {
    .riscv64,
    .riscv32,
    => 0x800,
    .powerpc64le,
    .powerpc64,
    .powerpcle,
    .powerpc,
    .mips64el,
    .mips64,
    .mipsel,
    .mips,
    .m68k,
    => 0x8000,
    else => 0,
};
139
/// Per-thread storage for the ELF TLS ABI. The shape of this structure depends on the TLS variant
/// and, for Variant I, on the architecture.
const AbiTcb = switch (current_variant) {
    .I_original, .I_modified => switch (native_arch) {
        // These architectures get a two-word ABI TCB; only the first word is written by
        // `prepareArea`.
        .aarch64,
        .aarch64_be,
        .alpha,
        .arm,
        .armeb,
        .hppa,
        .microblaze,
        .microblazeel,
        .sh,
        .sheb,
        .thumb,
        .thumbeb,
        => extern struct {
            /// Address of the DTV. `prepareArea` stores the DTV's address here as-is; the
            /// `current_dtv_offset` bias is applied to `Dtv.tls_block`, not to this field.
            dtv: usize,
            /// Second ABI word; never written here other than by zeroing the whole area.
            _reserved: ?*anyopaque,
        },
        // Everything else gets a single-word ABI TCB.
        else => extern struct {
            /// Address of the DTV. `prepareArea` stores the DTV's address here as-is; the
            /// `current_dtv_offset` bias is applied to `Dtv.tls_block`, not to this field.
            dtv: usize,
        },
    },
    .II => extern struct {
        /// This is self-referential.
        self: *AbiTcb,
    },
};
170
171/// Per-thread storage for Zig's use. Currently unused.
172const ZigTcb = struct {
173 dummy: usize,
174};
175
176/// Dynamic Thread Vector as specified in the ELF TLS ABI. Ordinarily, there is a block pointer per
177/// dynamically-loaded module, but since we only support static TLS, we only need one block pointer.
178const Dtv = extern struct {
179 len: usize = 1,
180 tls_block: [*]u8,
181};
182
183/// Describes a process's TLS area. The area encompasses the DTV, both TCBs, and the TLS block, with
184/// the exact layout of these being dependent primarily on `current_variant`.
185const AreaDesc = struct {
186 size: usize,
187 alignment: usize,
188
189 dtv: struct {
190 /// Offset into the TLS area.
191 offset: usize,
192 },
193
194 abi_tcb: struct {
195 /// Offset into the TLS area.
196 offset: usize,
197 },
198
199 block: struct {
200 /// The initial data to be copied into the TLS block. Note that this may be smaller than
201 /// `size`, in which case any remaining data in the TLS block is simply left uninitialized.
202 init: []const u8,
203 /// Offset into the TLS area.
204 offset: usize,
205 /// This is the effective size of the TLS block, which may be greater than `init.len`.
206 size: usize,
207 },
208
209 /// Only used on the 32-bit x86 architecture (not x86_64, nor x32).
210 gdt_entry_number: usize,
211};
212
/// The process-wide description of the TLS area. It is undefined until `computeAreaDesc` has run
/// (which `initStatic` takes care of); `prepareArea` and `setThreadPointer` read from it.
pub var area_desc: AreaDesc = undefined;
214
/// Installs `addr` as the thread pointer (TP) of the calling thread.
///
/// Depending on the architecture this is either a plain write to the register that holds the TP, or
/// a system call on architectures where the kernel has to be involved. On x86, the GDT entry number
/// found in `user_desc` after the system call is additionally recorded in
/// `area_desc.gdt_entry_number`.
///
/// The result of every system call issued here is only checked with `assert`. All system calls are
/// made through `@call(.always_inline, ...)`, consistently across architectures.
pub fn setThreadPointer(addr: usize) void {
    @setRuntimeSafety(false);
    @disableInstrumentation();

    switch (native_arch) {
        .x86 => {
            var user_desc: linux.user_desc = .{
                .entry_number = area_desc.gdt_entry_number,
                .base_addr = addr,
                .limit = 0xfffff,
                .flags = .{
                    .seg_32bit = 1,
                    .contents = 0, // Data
                    .read_exec_only = 0,
                    .limit_in_pages = 1,
                    .seg_not_present = 0,
                    .useable = 1,
                },
            };
            const rc = @call(.always_inline, linux.syscall1, .{ .set_thread_area, @intFromPtr(&user_desc) });
            assert(rc == 0);

            const gdt_entry_number = user_desc.entry_number;
            // We have to keep track of our slot as it's also needed for clone()
            area_desc.gdt_entry_number = gdt_entry_number;
            // Update the %gs selector
            asm volatile ("movl %[gs_val], %%gs"
                :
                : [gs_val] "r" (gdt_entry_number << 3 | 3),
            );
        },
        .x86_64 => {
            const rc = @call(.always_inline, linux.syscall2, .{ .arch_prctl, linux.ARCH.SET_FS, addr });
            assert(rc == 0);
        },
        .aarch64, .aarch64_be => {
            asm volatile (
                \\ msr tpidr_el0, %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .alpha => {
            // NOTE(review): `a0` is written by this sequence but is not declared as a clobber, and
            // `lda` is given a bare register operand -- confirm against an Alpha assembler.
            asm volatile (
                \\ lda a0, %[addr]
                \\ wruniq
                :
                : [addr] "r" (addr),
            );
        },
        .arc, .arceb => {
            // We apparently need to both set r25 (TP) *and* inform the kernel...
            asm volatile (
                \\ mov r25, %[addr]
                :
                : [addr] "r" (addr),
            );
            const rc = @call(.always_inline, linux.syscall1, .{ .arc_settls, addr });
            assert(rc == 0);
        },
        .arm, .armeb, .thumb, .thumbeb => {
            const rc = @call(.always_inline, linux.syscall1, .{ .set_tls, addr });
            assert(rc == 0);
        },
        // m68k previously called `linux.syscall1` directly; it now goes through
        // `@call(.always_inline, ...)` like every other system call in this function.
        .csky, .m68k, .mips, .mipsel, .mips64, .mips64el => {
            const rc = @call(.always_inline, linux.syscall1, .{ .set_thread_area, addr });
            assert(rc == 0);
        },
        .hexagon => {
            asm volatile (
                \\ ugp = %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .hppa => {
            // `addr` is an input that must live in r26, so the constraint must not carry the `=`
            // (output) modifier.
            asm volatile (
                \\ ble 0xe0(%%sr2, %%r0)
                :
                : [addr] "{r26}" (addr),
                : .{ .r29 = true });
        },
        .loongarch32, .loongarch64 => {
            asm volatile (
                \\ move $tp, %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .riscv32, .riscv64 => {
            asm volatile (
                \\ mv tp, %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .microblaze, .microblazeel => {
            asm volatile (
                \\ ori r21, %[addr], 0
                :
                : [addr] "r" (addr),
            );
        },
        .or1k => {
            asm volatile (
                \\ l.ori r10, %[addr], 0
                :
                : [addr] "r" (addr),
            );
        },
        .powerpc, .powerpcle => {
            asm volatile (
                \\ mr 2, %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .powerpc64, .powerpc64le => {
            asm volatile (
                \\ mr 13, %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .s390x => {
            // The 64-bit value is split across the access registers a0 (high half) and a1 (low
            // half), using r0 as scratch.
            asm volatile (
                \\ lgr %%r0, %[addr]
                \\ sar %%a1, %%r0
                \\ srlg %%r0, %%r0, 32
                \\ sar %%a0, %%r0
                :
                : [addr] "r" (addr),
                : .{ .r0 = true });
        },
        .sh, .sheb => {
            // SH assembly puts the source operand first: `ldc Rm, gbr` loads GBR from Rm.
            asm volatile (
                \\ ldc %[addr], gbr
                :
                : [addr] "r" (addr),
            );
        },
        .sparc, .sparc64 => {
            asm volatile (
                \\ mov %[addr], %%g7
                :
                : [addr] "r" (addr),
            );
        },
        else => @compileError("Unsupported architecture"),
    }
}
370
/// Derives the layout of the TLS area from the program headers and stores it in `area_desc`.
///
/// The `PT_TLS` header (if any) provides the alignment, the initialization image and the size of
/// the TLS block; the `PT_PHDR` header is used to work out where the image has been loaded. Without
/// a `PT_TLS` header the TLS block is empty and the area is aligned like a `usize`.
fn computeAreaDesc(phdrs: []elf.Phdr) void {
    @setRuntimeSafety(false);
    @disableInstrumentation();

    var maybe_tls: ?*elf.Phdr = null;
    var load_bias: usize = 0;

    for (phdrs) |*entry| {
        if (entry.p_type == elf.PT_PHDR) {
            load_bias = @intFromPtr(phdrs.ptr) - entry.p_vaddr;
        } else if (entry.p_type == elf.PT_TLS) {
            maybe_tls = entry;
        }
    }

    const Segment = struct {
        alignment: usize,
        init: []const u8,
        size: usize,
    };

    // `p_memsz` is the size of the block in memory, while `p_filesz` is the length of the
    // initialization image stored in the segment; the latter may be the smaller of the two.
    const tls: Segment = if (maybe_tls) |seg| .{
        .alignment = seg.p_align,
        .init = @as([*]u8, @ptrFromInt(load_bias + seg.p_vaddr))[0..seg.p_filesz],
        .size = seg.p_memsz,
    } else .{
        .alignment = @alignOf(usize),
        .init = &[_]u8{},
        .size = 0,
    };

    // Offsets into the TLS area plus the area's total size. Everything here assumes that the
    // area itself starts at a well-aligned address.
    const Layout = struct {
        dtv: usize,
        abi_tcb: usize,
        block: usize,
        total: usize,
    };

    const layout: Layout = switch (current_variant) {
        .I_original => blk: {
            const dtv_at: usize = 0;
            var cursor: usize = dtv_at;
            cursor += @sizeOf(Dtv);
            // Pad so that the TP (i.e. the ABI TCB) lands on a multiple of the alignment while
            // the `ZigTcb` sits immediately below it, `@sizeOf(ZigTcb)` bytes before the TP.
            const misalignment = (cursor + @sizeOf(ZigTcb)) & (tls.alignment - 1);
            if (misalignment > 0)
                cursor += tls.alignment - misalignment;
            cursor += @sizeOf(ZigTcb);
            const abi_tcb_at = cursor;
            cursor += alignForward(@sizeOf(AbiTcb), tls.alignment);
            const block_at = cursor;
            cursor += tls.size;
            break :blk .{
                .dtv = dtv_at,
                .abi_tcb = abi_tcb_at,
                .block = block_at,
                .total = cursor,
            };
        },
        .I_modified => blk: {
            const dtv_at: usize = 0;
            var cursor: usize = dtv_at;
            cursor += @sizeOf(Dtv);
            // The TLS block follows the ABI TCB without any gap and the TP points at the start of
            // the block. Pad so that the end of the ABI TCB is aligned; the `ZigTcb` then sits
            // `@sizeOf(AbiTcb) + @sizeOf(ZigTcb)` bytes below that point.
            const misalignment = (cursor + @sizeOf(ZigTcb) + @sizeOf(AbiTcb)) & (tls.alignment - 1);
            if (misalignment > 0)
                cursor += tls.alignment - misalignment;
            cursor += @sizeOf(ZigTcb);
            const abi_tcb_at = cursor;
            cursor += @sizeOf(AbiTcb);
            const block_at = cursor;
            cursor += tls.size;
            break :blk .{
                .dtv = dtv_at,
                .abi_tcb = abi_tcb_at,
                .block = block_at,
                .total = cursor,
            };
        },
        .II => blk: {
            const block_at: usize = 0;
            var cursor: usize = block_at;
            cursor += alignForward(tls.size, tls.alignment);
            // The TP, which points at the ABI TCB, is a multiple of the alignment.
            const abi_tcb_at = cursor;
            cursor += @sizeOf(AbiTcb);
            // The `ZigTcb` directly follows the ABI TCB so that it is easy to locate.
            cursor += @sizeOf(ZigTcb);
            // The DTV can live anywhere; natural alignment is all it needs.
            cursor = alignForward(cursor, @alignOf(Dtv));
            const dtv_at = cursor;
            cursor += @sizeOf(Dtv);
            break :blk .{
                .dtv = dtv_at,
                .abi_tcb = abi_tcb_at,
                .block = block_at,
                .total = cursor,
            };
        },
    };

    area_desc = .{
        .size = layout.total,
        .alignment = tls.alignment,

        .dtv = .{
            .offset = layout.dtv,
        },

        .abi_tcb = .{
            .offset = layout.abi_tcb,
        },

        .block = .{
            .init = tls.init,
            .offset = layout.block,
            .size = tls.size,
        },

        .gdt_entry_number = @as(usize, @bitCast(@as(isize, -1))),
    };
}
486
/// Rounds `addr` up to the next multiple of `alignment`, which is expected to be a power of two.
/// Inline because TLS is not set up yet.
inline fn alignForward(addr: usize, alignment: usize) usize {
    const bumped = addr + (alignment - 1);
    return bumped & ~(alignment - 1);
}
491
/// Rounds `addr` down to the previous multiple of `alignment`, which is expected to be a power of
/// two. Inline because TLS is not set up yet.
inline fn alignBackward(addr: usize, alignment: usize) usize {
    const low_bits = alignment - 1;
    return addr & ~low_bits;
}
496
/// Reinterprets `ptr` as a pointer to a single `T`, asserting that it is suitably aligned for `T`.
/// Inline because TLS is not set up yet.
inline fn alignPtrCast(comptime T: type, ptr: [*]u8) *T {
    const aligned: [*]align(@alignOf(T)) u8 = @alignCast(ptr);
    return @ptrCast(aligned);
}
501
/// Fills in a TLS area according to `area_desc` and returns the value that the TP register has to
/// be set to for this area.
///
/// The area is zeroed first; then the ABI TCB and the DTV are written and the initialization image
/// of the TLS block is copied into place. `area` is expected to be at least `area_desc.size` bytes
/// long and aligned to `area_desc.alignment`.
pub fn prepareArea(area: []u8) usize {
    @setRuntimeSafety(false);
    @disableInstrumentation();

    const base = area.ptr;
    const dtv_bytes = base + area_desc.dtv.offset;
    const block_start = area_desc.block.offset;

    // Start from a clean slate, just to be safe.
    @memset(area, 0);

    // ABI TCB: either a pointer to the DTV or a pointer to itself, depending on the variant.
    const tcb = alignPtrCast(AbiTcb, base + area_desc.abi_tcb.offset);
    switch (current_variant) {
        .II => tcb.self = tcb,
        .I_original, .I_modified => tcb.dtv = @intFromPtr(dtv_bytes),
    }

    // DTV: a single entry referring to the (biased) start of the TLS block.
    const dtv = alignPtrCast(Dtv, dtv_bytes);
    dtv.len = 1;
    dtv.tls_block = base + current_dtv_offset + block_start;

    // TLS block: copy the initialization image.
    const image = area_desc.block.init;
    @memcpy(area[block_start..][0..image.len], image);

    // Distance from the start of the area to where the TP has to point. Wrapping arithmetic is
    // used on purpose; pointer arithmetic involving the TP wraps as well.
    const tp_delta: usize = switch (current_variant) {
        .I_modified => block_start +% current_tp_offset,
        .I_original, .II => area_desc.abi_tcb.offset,
    };
    return @intFromPtr(base) +% tp_delta;
}
533
/// Statically allocated backing storage for the main thread's TLS area. `initStatic` uses it
/// whenever the required alignment is at most `page_size_min` and the area fits; otherwise the area
/// is obtained with `mmap` instead.
///
/// The main motivation for the size chosen here is that this is how much ends up being requested for
/// the thread-local variables of the `std.crypto.random` implementation. I'm not sure why it ends up
/// being so much; the struct itself is only 64 bytes. I think it has to do with being page-aligned
/// and LLVM or LLD is not smart enough to lay out the TLS data in a space-conserving way. Anyway, I
/// think it's fine because it's less than 3 pages of memory, and putting it in the ELF like this is
/// equivalent to moving the `mmap` call below into the kernel, avoiding syscall overhead.
var main_thread_area_buffer: [0x2100]u8 align(page_size_min) = undefined;
541
/// Sets up static TLS for the main thread: works out the layout of the TLS area from the program
/// headers, obtains memory for it, fills it in, and finally points the TP register at it.
///
/// Traps if memory for the area has to be mapped and the mapping fails.
pub fn initStatic(phdrs: []elf.Phdr) void {
    @setRuntimeSafety(false);
    @disableInstrumentation();

    computeAreaDesc(phdrs);

    // In the common case the TLS data is really small, so the static buffer can be used and no
    // allocation is needed.
    const fits_in_static_buffer = area_desc.alignment <= page_size_min and
        area_desc.size <= main_thread_area_buffer.len;

    const area: []u8 = if (fits_in_static_buffer)
        main_thread_area_buffer[0..area_desc.size]
    else mapped: {
        // Over-allocate so that a correctly aligned area can be carved out of the mapping.
        const raw_addr = mmap_tls(area_desc.size + area_desc.alignment - 1);
        if (@call(.always_inline, linux.errno, .{raw_addr}) != .SUCCESS) @trap();

        const mapping: [*]align(page_size_min) u8 = @ptrFromInt(raw_addr);
        const skip = alignForward(raw_addr, area_desc.alignment) - raw_addr;
        break :mapped mapping[skip..][0..area_desc.size];
    };

    setThreadPointer(prepareArea(area));
}
571
/// Maps `length` bytes of private, anonymous, readable and writable memory and returns the raw
/// result of the system call, which the caller has to check for an error.
///
/// `mmap2` is used where the architecture provides it and `mmap` otherwise.
inline fn mmap_tls(length: usize) usize {
    const prot = linux.PROT.READ | linux.PROT.WRITE;
    const flags: linux.MAP = .{ .TYPE = .PRIVATE, .ANONYMOUS = true };

    const flag_bits = @as(u32, @bitCast(flags));
    const no_fd = @as(usize, @bitCast(@as(isize, -1)));

    if (@hasField(linux.SYS, "mmap2")) {
        return @call(.always_inline, linux.syscall6, .{ .mmap2, 0, length, prot, flag_bits, no_fd, 0 });
    } else if (native_arch == .s390x) {
        // The s390x mmap() syscall predates Linux support for syscalls with 5+ parameters, so it
        // takes a single pointer to an array holding the arguments instead.
        return @call(.always_inline, linux.syscall1, .{
            .mmap,
            @intFromPtr(&[_]usize{ 0, length, prot, flag_bits, no_fd, 0 }),
        });
    } else {
        return @call(.always_inline, linux.syscall6, .{ .mmap, 0, length, prot, flag_bits, no_fd, 0 });
    }
}