//! An allocator that is designed for ReleaseFast optimization mode, with
//! multi-threading enabled.
//!
//! This allocator is a singleton; it uses global state and only one should be
//! instantiated for the entire process.
//!
//! ## Basic Design
//!
//! Each thread gets a separate freelist; however, the data must be recoverable
//! when the thread exits. We do not directly learn when a thread exits, so
//! occasionally one thread must attempt to reclaim another thread's resources.
//!
//! Allocations above a certain size are memory-mapped directly, with no
//! allocation metadata stored. This works because the implementation refuses
//! resizes that would move an allocation from the small category to the large
//! category or vice versa.
//!
//! Each allocator operation checks a threadlocal variable for the thread
//! identifier to find out which metadata in the global state to access, and
//! attempts to acquire its lock. This will usually succeed without contention,
//! unless another thread has been assigned the same id. In the case of such
//! contention, the thread moves on to the next thread metadata slot and
//! repeats the process of attempting to obtain the lock.
//!
//! Limiting the thread-local metadata array to the CPU count ensures that, as
//! threads are created and destroyed, they cycle through the full set of
//! freelists.
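//!
//! ## Example
//!
//! A minimal usage sketch. It assumes this allocator is reached through a
//! `std.mem.Allocator` built from this file's `vtable`, such as the
//! `std.heap.smp_allocator` instance; the context pointer is ignored.
//!
//! ```
//! const gpa = std.heap.smp_allocator;
//! const buf = try gpa.alloc(u8, 100); // small: served from a size-class slab
//! defer gpa.free(buf);
//! const big = try gpa.alloc(u8, 1024 * 1024); // large: memory-mapped directly
//! defer gpa.free(big);
//! ```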

const builtin = @import("builtin");

const std = @import("../std.zig");
const assert = std.debug.assert;
const mem = std.mem;
const math = std.math;
const Allocator = std.mem.Allocator;
const SmpAllocator = @This();
const PageAllocator = std.heap.PageAllocator;

cpu_count: u32,
threads: [max_thread_count]Thread,

var global: SmpAllocator = .{
    .threads = @splat(.{}),
    .cpu_count = 0,
};
threadlocal var thread_index: u32 = 0;

const max_thread_count = 128;
const slab_len: usize = @max(std.heap.page_size_max, 64 * 1024);
/// Because freed slots are used to store free-list pointers, the minimum size
/// class is 3 (one `usize`).
const min_class = math.log2(@sizeOf(usize));
const size_class_count = math.log2(slab_len) - min_class;
/// Before mapping a fresh slab, `alloc` rotates to other threads' freelists
/// this many times.
const max_alloc_search = 1;
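// For example, on a 64-bit target where `slab_len` ends up as 64 KiB (an
// assumption; both depend on the target), min_class = 3, so the smallest slot
// is 8 bytes, and size_class_count = 16 - 3 = 13, covering slot sizes from
// 8 bytes up to 32 KiB. Anything larger is memory-mapped directly.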

const Thread = struct {
    /// Avoid false sharing.
    _: void align(std.atomic.cache_line) = {},

    /// Protects the state in this struct (per-thread state).
    ///
    /// Threads lock this even when accessing their own state, in order to
    /// support freelist reclamation by other threads.
    mutex: std.Thread.Mutex = .{},

    /// For each size class, tracks the next address to be returned from
    /// `alloc` when the freelist is empty.
    next_addrs: [size_class_count]usize = @splat(0),
    /// For each size class, the head of an intrusive free list: the address of
    /// the most recently freed slot, or 0 if the list is empty.
    frees: [size_class_count]usize = @splat(0),

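    /// Acquires a `Thread` metadata slot, preferring the one recorded in
    /// `thread_index`. On contention it rotates to the next slot until a lock
    /// is obtained, updating `thread_index` so later operations start there.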
    fn lock() *Thread {
        var index = thread_index;
        {
            const t = &global.threads[index];
            if (t.mutex.tryLock()) {
                @branchHint(.likely);
                return t;
            }
        }
        const cpu_count = getCpuCount();
        assert(cpu_count != 0);
        while (true) {
            index = (index + 1) % cpu_count;
            const t = &global.threads[index];
            if (t.mutex.tryLock()) {
                thread_index = index;
                return t;
            }
        }
    }

    fn unlock(t: *Thread) void {
        t.mutex.unlock();
    }
};

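/// Returns the CPU count, clamped to `max_thread_count`, querying the OS on
/// the first call and caching the result in `global.cpu_count`. If two threads
/// race to initialize it, both end up returning whichever value the winning
/// `cmpxchg` published, so all callers observe the same count.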
fn getCpuCount() u32 {
    const cpu_count = @atomicLoad(u32, &global.cpu_count, .unordered);
    if (cpu_count != 0) return cpu_count;
    const n: u32 = @min(std.Thread.getCpuCount() catch max_thread_count, max_thread_count);
    return if (@cmpxchgStrong(u32, &global.cpu_count, 0, n, .monotonic, .monotonic)) |other| other else n;
}

pub const vtable: Allocator.VTable = .{
    .alloc = alloc,
    .resize = resize,
    .remap = remap,
    .free = free,
};

comptime {
    assert(!builtin.single_threaded); // you're holding it wrong
}

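/// Small requests are served from the locked thread's freelist, then from its
/// slab bump pointer. When both are exhausted, the search rotates to other
/// threads' metadata up to `max_alloc_search` times before mapping a fresh
/// slab. Requests too large for any size class are memory-mapped directly.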
fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ?[*]u8 {
    _ = context;
    _ = ra;
    const class = sizeClassIndex(len, alignment);
    if (class >= size_class_count) {
        @branchHint(.unlikely);
        return PageAllocator.map(len, alignment);
    }

    const slot_size = slotSize(class);
    assert(slab_len % slot_size == 0);
    var search_count: u8 = 0;

    var t = Thread.lock();

    outer: while (true) {
        const top_free_ptr = t.frees[class];
        if (top_free_ptr != 0) {
            @branchHint(.likely);
            defer t.unlock();
            const node: *usize = @ptrFromInt(top_free_ptr);
            t.frees[class] = node.*;
            return @ptrFromInt(top_free_ptr);
        }

        const next_addr = t.next_addrs[class];
        if ((next_addr % slab_len) != 0) {
            @branchHint(.likely);
            defer t.unlock();
            t.next_addrs[class] = next_addr + slot_size;
            return @ptrFromInt(next_addr);
        }

        if (search_count >= max_alloc_search) {
            @branchHint(.likely);
            defer t.unlock();
            // The slab's alignment ensures that the `% slab_len` check above
            // detects when a slab's slots have been exhausted.
            const slab = PageAllocator.map(slab_len, .fromByteUnits(slab_len)) orelse return null;
            t.next_addrs[class] = @intFromPtr(slab) + slot_size;
            return slab;
        }

        t.unlock();
        const cpu_count = getCpuCount();
        assert(cpu_count != 0);
        var index = thread_index;
        while (true) {
            index = (index + 1) % cpu_count;
            t = &global.threads[index];
            if (t.mutex.tryLock()) {
                thread_index = index;
                search_count += 1;
                continue :outer;
            }
        }
    }
}

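/// In-place resizing never moves an allocation between the small (slab) and
/// large (memory-mapped) categories; such requests are refused. Within the
/// small category, a resize succeeds only when the new length maps to the
/// same size class.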
fn resize(context: *anyopaque, memory: []u8, alignment: mem.Alignment, new_len: usize, ra: usize) bool {
    _ = context;
    _ = ra;
    const class = sizeClassIndex(memory.len, alignment);
    const new_class = sizeClassIndex(new_len, alignment);
    if (class >= size_class_count) {
        if (new_class < size_class_count) return false;
        return PageAllocator.realloc(memory, new_len, false) != null;
    }
    return new_class == class;
}

fn remap(context: *anyopaque, memory: []u8, alignment: mem.Alignment, new_len: usize, ra: usize) ?[*]u8 {
    _ = context;
    _ = ra;
    const class = sizeClassIndex(memory.len, alignment);
    const new_class = sizeClassIndex(new_len, alignment);
    if (class >= size_class_count) {
        if (new_class < size_class_count) return null;
        return PageAllocator.realloc(memory, new_len, true);
    }
    return if (new_class == class) memory.ptr else null;
}

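/// Small slots are pushed onto the freelist of whichever `Thread` slot the
/// calling thread locks, which need not be the slot that originally served the
/// allocation. Large allocations are unmapped directly.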
fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, ra: usize) void {
    _ = context;
    _ = ra;
    const class = sizeClassIndex(memory.len, alignment);
    if (class >= size_class_count) {
        @branchHint(.unlikely);
        return PageAllocator.unmap(@alignCast(memory));
    }

    const node: *usize = @ptrCast(@alignCast(memory.ptr));

    const t = Thread.lock();
    defer t.unlock();

    node.* = t.frees[class];
    t.frees[class] = @intFromPtr(node);
}

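/// Maps a length/alignment pair to a size class index. For example (assuming
/// a 64-bit target): a 24-byte request with 8-byte alignment rounds up to
/// 2^5 = 32 bytes, giving class 5 - min_class = 2, whose slots are 32 bytes
/// wide (see `slotSize`).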
fn sizeClassIndex(len: usize, alignment: mem.Alignment) usize {
    return @max(@bitSizeOf(usize) - @clz(len - 1), @intFromEnum(alignment), min_class) - min_class;
}

fn slotSize(class: usize) usize {
    return @as(usize, 1) << @intCast(class + min_class);
}
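
// A minimal smoke-test sketch. It assumes this allocator is reached through an
// `Allocator` whose vtable is `SmpAllocator.vtable`, as `std.heap.smp_allocator`
// does; the context pointer is unused by this implementation.
test "small allocations and category-preserving resize" {
    const gpa: Allocator = .{ .ptr = undefined, .vtable = &vtable };

    var small = try gpa.alloc(u8, 64);
    defer gpa.free(small);

    // Growing past the largest size class would move the allocation from the
    // small (slab) category to the large (memory-mapped) category, so the
    // in-place resize is refused.
    try std.testing.expect(!gpa.resize(small, 2 * slab_len));

    // Resizing within the same size class succeeds in place.
    try std.testing.expect(gpa.resize(small, 33));
    small.len = 33;
}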