Commit 89d15a8d47

Pat Tullmann <pat.github@tullmann.org>
2025-04-03 01:21:23
linux: futex v2 API updates
* `futex2_waitv` always takes a 64-bit timespec. Perhaps the `kernel_timespec` should be renamed `timespec64`? It's used in io_uring, too.
* Add `packed struct` for futex v2 flags and parameters.
* Add very basic "tests" for the futex v2 syscalls (just to ensure the code compiles).
* Update the stale or broken comments. (I could also just delete these; they're not really documenting Zig-specific behavior.)

Given that the futex2 APIs are not used by Zig's library (they're a bit too new), and the fact that these are very specialized syscalls, and they currently provide no benefit over the existing v1 API, I wonder if instead of fixing these up, we should just replace them with a stub that says 'use a 3rd party library'.
1 parent cfe5def
Changed files (2)
lib
std
lib/std/os/linux/test.zig
@@ -297,6 +297,136 @@ test "futex v1" {
     }
 }
 
+comptime {
+    std.debug.assert(2 == @as(u32, @bitCast(linux.FUTEX2_FLAGS{ .size = .U32, .private = false })));
+    std.debug.assert(128 == @as(u32, @bitCast(linux.FUTEX2_FLAGS{ .size = @enumFromInt(0), .private = true })));
+}
+
+test "futex2_waitv" {
+    const locks = [_]std.atomic.Value(u32){
+        std.atomic.Value(u32).init(1),
+        std.atomic.Value(u32).init(1),
+        std.atomic.Value(u32).init(1),
+    };
+
+    const futexes = [_]linux.futex2_waitone{
+        .{
+            .val = 1,
+            .uaddr = @intFromPtr(&locks[0].raw),
+            .flags = .{ .size = .U32, .private = true },
+        },
+        .{
+            .val = 1,
+            .uaddr = @intFromPtr(&locks[1].raw),
+            .flags = .{ .size = .U32, .private = true },
+        },
+        .{
+            .val = 1,
+            .uaddr = @intFromPtr(&locks[2].raw),
+            .flags = .{ .size = .U32, .private = true },
+        },
+    };
+
+    const timeout = linux.kernel_timespec{ .sec = 0, .nsec = 2 }; // absolute timeout, so this is 1970...
+    const rc = linux.futex2_waitv(&futexes, futexes.len, .{}, &timeout, .MONOTONIC);
+    switch (linux.E.init(rc)) {
+        .NOSYS => return error.SkipZigTest, // futex2_waitv added in kernel v5.16
+        else => |err| try expectEqual(.TIMEDOUT, err),
+    }
+}
+
+// The futex v2 API is only supported on recent kernels (v6.7 and later), so skip tests if
+// the syscalls return ENOSYS.
+fn futex2_skip_if_unsupported() !void {
+    const lock: u32 = 0;
+    const rc = linux.futex2_wake(&lock, 0, 1, .{ .size = .U32, .private = true });
+    if (linux.E.init(rc) == .NOSYS) {
+        return error.SkipZigTest;
+    }
+}
+
+test "futex2_wait" {
+    var lock: std.atomic.Value(u32) = std.atomic.Value(u32).init(1);
+    var rc: usize = 0;
+    const mask = 0x1;
+
+    try futex2_skip_if_unsupported();
+
+    // The API for 8,16,64 bit futexes is defined, but as of kernel v6.14
+    // (at least) they're not implemented.
+    if (false) {
+        rc = linux.futex2_wait(&lock.raw, 1, mask, .{ .size = .U8, .private = true }, null, .MONOTONIC);
+        try expectEqual(.INVAL, linux.E.init(rc));
+
+        rc = linux.futex2_wait(&lock.raw, 1, mask, .{ .size = .U16, .private = true }, null, .MONOTONIC);
+        try expectEqual(.INVAL, linux.E.init(rc));
+
+        rc = linux.futex2_wait(&lock.raw, 1, mask, .{ .size = .U64, .private = true }, null, .MONOTONIC);
+        try expectEqual(.INVAL, linux.E.init(rc));
+    }
+
+    const flags = linux.FUTEX2_FLAGS{ .size = .U32, .private = true };
+    // no-wait, lock state mismatch
+    rc = linux.futex2_wait(&lock.raw, 2, mask, flags, null, .MONOTONIC);
+    try expectEqual(.AGAIN, linux.E.init(rc));
+
+    // hit timeout on wait
+    rc = linux.futex2_wait(&lock.raw, 1, mask, flags, &.{ .sec = 0, .nsec = 2 }, .MONOTONIC);
+    try expectEqual(.TIMEDOUT, linux.E.init(rc));
+
+    // timeout is absolute
+    {
+        var curr: linux.timespec = undefined;
+        rc = linux.clock_gettime(.MONOTONIC, &curr); // gettime() uses platform timespec
+        try expectEqual(0, rc);
+
+        // ... but futex2_wait always uses 64-bit timespec
+        var timeout: linux.kernel_timespec = .{
+            .sec = curr.sec,
+            .nsec = curr.nsec + 2,
+        };
+        rc = linux.futex2_wait(&lock.raw, 1, mask, flags, &timeout, .MONOTONIC);
+        try expectEqual(.TIMEDOUT, linux.E.init(rc));
+    }
+
+    rc = linux.futex2_wait(&lock.raw, 1, mask, flags, &.{ .sec = 0, .nsec = 2 }, .REALTIME);
+    try expectEqual(.TIMEDOUT, linux.E.init(rc));
+}
+
+test "futex2_wake" {
+    var lock: std.atomic.Value(u32) = std.atomic.Value(u32).init(1);
+
+    try futex2_skip_if_unsupported();
+
+    const rc = linux.futex2_wake(&lock.raw, 0xFF, 1, .{ .size = .U32, .private = true });
+    try expectEqual(0, rc);
+}
+
+test "futex2_requeue" {
+    try futex2_skip_if_unsupported();
+
+    const locks = [_]std.atomic.Value(u32){
+        std.atomic.Value(u32).init(1),
+        std.atomic.Value(u32).init(1),
+    };
+
+    const futexes = [_]linux.futex2_waitone{
+        .{
+            .val = 1,
+            .uaddr = @intFromPtr(&locks[0].raw),
+            .flags = .{ .size = .U32, .private = true },
+        },
+        .{
+            .val = 1,
+            .uaddr = @intFromPtr(&locks[1].raw),
+            .flags = .{ .size = .U32, .private = true },
+        },
+    };
+
+    const rc = linux.futex2_requeue(&futexes, .{}, 2, 2);
+    try expectEqual(0, rc);
+}
+
 test {
     _ = linux.IoUring;
 }
lib/std/os/linux.zig
@@ -703,15 +703,13 @@ pub fn futex_4arg(uaddr: *const anyopaque, futex_op: FUTEX_OP, val: u32, timeout
     return syscall4(.futex, @intFromPtr(uaddr), @as(u32, @bitCast(futex_op)), val, @intFromPtr(timeout));
 }
 
-/// Given an array of `futex_waitv`, wait on each uaddr.
+/// Given an array of `futex2_waitone`, wait on each uaddr.
 /// The thread wakes if a futex_wake() is performed at any uaddr.
-/// The syscall returns immediately if any waiter has *uaddr != val.
-/// timeout is an optional timeout value for the operation.
-/// Each waiter has individual flags.
-/// The `flags` argument for the syscall should be used solely for specifying
-/// the timeout as realtime, if needed.
-/// Flags for private futexes, sizes, etc. should be used on the
-/// individual flags of each waiter.
+/// The syscall returns immediately if any futex has *uaddr != val.
+/// timeout is an optional, absolute timeout value for the operation.
+/// The `flags` argument is for future use and currently should be `.{}`.
+/// Flags for private futexes, sizes, etc. should be set on the
+/// individual flags of each `futex2_waitone`.
 ///
 /// Returns the array index of one of the woken futexes.
 /// No further information is provided: any number of other futexes may also
@@ -719,42 +717,43 @@ pub fn futex_4arg(uaddr: *const anyopaque, futex_op: FUTEX_OP, val: u32, timeout
 /// the returned index may refer to any one of them.
 /// (It is not necessarily the futex with the smallest index, nor the one
 /// most recently woken, nor...)
+///
+/// Requires at least kernel v5.16.
 pub fn futex2_waitv(
-    /// List of futexes to wait on.
-    waiters: [*]futex_waitv,
-    /// Length of `waiters`.
+    futexes: [*]const futex2_waitone,
+    /// Length of `futexes`.  Max of FUTEX2_WAITONE_MAX.
     nr_futexes: u32,
-    /// Flag for timeout (monotonic/realtime).
-    flags: u32,
-    /// Optional absolute timeout.
-    timeout: ?*const timespec,
+    flags: FUTEX2_FLAGS_WAITV,
+    /// Optional absolute timeout.  Always 64-bit, even on 32-bit platforms.
+    timeout: ?*const kernel_timespec,
     /// Clock to be used for the timeout, realtime or monotonic.
     clockid: clockid_t,
 ) usize {
     return syscall5(
         .futex_waitv,
-        @intFromPtr(waiters),
+        @intFromPtr(futexes),
         nr_futexes,
-        flags,
+        @as(u32, @bitCast(flags)),
         @intFromPtr(timeout),
-        @bitCast(@as(isize, @intFromEnum(clockid))),
+        @intFromEnum(clockid),
     );
 }
 
-/// Wait on a futex.
-/// Identical to the traditional `FUTEX.FUTEX_WAIT_BITSET` op, except it is part of the
-/// futex2 familiy of calls.
+/// Wait on a single futex.
+/// Identical to the futex v1 `FUTEX.FUTEX_WAIT_BITSET` op, except it is part of the
+/// futex2 family of calls.
+///
+/// Requires at least kernel v6.7.
 pub fn futex2_wait(
     /// Address of the futex to wait on.
     uaddr: *const anyopaque,
     /// Value of `uaddr`.
     val: usize,
-    /// Bitmask.
+    /// Bitmask to match against incoming wakeup masks.  Must not be zero.
     mask: usize,
-    /// `FUTEX2` flags.
-    flags: u32,
-    /// Optional absolute timeout.
-    timeout: ?*const timespec,
+    flags: FUTEX2_FLAGS,
+    /// Optional absolute timeout.  Always 64-bit, even on 32-bit platforms.
+    timeout: ?*const kernel_timespec,
     /// Clock to be used for the timeout, realtime or monotonic.
     clockid: clockid_t,
 ) usize {
@@ -763,52 +762,55 @@ pub fn futex2_wait(
         @intFromPtr(uaddr),
         val,
         mask,
-        flags,
+        @as(u32, @bitCast(flags)),
         @intFromPtr(timeout),
-        @bitCast(@as(isize, @intFromEnum(clockid))),
+        @intFromEnum(clockid),
     );
 }
 
-/// Wake a number of futexes.
-/// Identical to the traditional `FUTEX.FUTEX_WAIT_BITSET` op, except it is part of the
+/// Wake (a subset of) the waiters on the given futex.
+/// Identical to the futex v1 `FUTEX.FUTEX_WAKE_BITSET` op, except it is part of the
 /// futex2 family of calls.
+///
+/// Requires at least kernel v6.7.
 pub fn futex2_wake(
-    /// Address of the futex(es) to wake.
+    /// Address of the futex to wake.
     uaddr: *const anyopaque,
-    /// Bitmask
+    /// Bitmask to match against waiters.
     mask: usize,
-    /// Number of the futexes to wake.
-    nr: i32,
-    /// `FUTEX2` flags.
-    flags: u32,
+    /// Maximum number of waiters on the futex to wake.
+    nr_wake: i32,
+    flags: FUTEX2_FLAGS,
 ) usize {
     return syscall4(
         .futex_wake,
         @intFromPtr(uaddr),
         mask,
-        @bitCast(@as(isize, nr)),
-        flags,
+        @as(u32, @bitCast(nr_wake)),
+        @as(u32, @bitCast(flags)),
     );
 }
 
-/// Requeue a waiter from one futex to another.
+/// Wake and/or requeue waiter(s) from one futex to another.
 /// Identical to `FUTEX.CMP_REQUEUE`, except it is part of the futex2 family of calls.
+///
+/// Requires at least kernel v6.7.
 pub fn futex2_requeue(
-    /// Array describing the source and destination futex.
-    waiters: [*]futex_waitv,
-    /// Unused.
-    flags: u32,
-    /// Number of futexes to wake.
+    /// The source and destination futexes.  Must be a 2-element array.
+    waiters: [*]const futex2_waitone,
+    /// Currently unused.
+    flags: FUTEX2_FLAGS_REQUEUE,
+    /// Maximum number of waiters to wake on the source futex.
     nr_wake: i32,
-    /// Number of futexes to requeue.
+    /// Maximum number of waiters to transfer to the destination futex.
     nr_requeue: i32,
 ) usize {
     return syscall4(
         .futex_requeue,
         @intFromPtr(waiters),
-        flags,
-        @bitCast(@as(isize, nr_wake)),
-        @bitCast(@as(isize, nr_requeue)),
+        @as(u32, @bitCast(flags)),
+        @as(u32, @bitCast(nr_wake)),
+        @as(u32, @bitCast(nr_requeue)),
     );
 }
 
@@ -3407,16 +3409,6 @@ pub const FALLOC = struct {
     pub const FL_UNSHARE_RANGE = 0x40;
 };
 
-pub const FUTEX2 = struct {
-    pub const SIZE_U8 = 0x00;
-    pub const SIZE_U16 = 0x01;
-    pub const SIZE_U32 = 0x02;
-    pub const SIZE_U64 = 0x03;
-    pub const NUMA = 0x04;
-
-    pub const PRIVATE = FUTEX.PRIVATE_FLAG;
-};
-
 // Futex v1 API commands.  See futex man page for each command's
 // interpretation of the futex arguments.
 pub const FUTEX_COMMAND = enum(u7) {
@@ -3477,8 +3469,38 @@ pub const FUTEX_WAKE_OP_CMP = enum(u4) {
     GE = 5,
 };
 
-/// Max numbers of elements in a `futex_waitv` array.
-pub const FUTEX2_WAITV_MAX = 128;
+/// Max number of elements in a `futex2_waitone` array.
+pub const FUTEX2_WAITONE_MAX = 128;
+
+/// For the futex v2 API, the size of the futex at the uaddr.  v1 futexes are
+/// always implicitly U32.  As of kernel v6.14, only U32 is implemented
+/// for v2 futexes.
+pub const FUTEX2_SIZE = enum(u2) {
+    U8 = 0,
+    U16 = 1,
+    U32 = 2,
+    U64 = 3,
+};
+
+/// As of kernel 6.14 there are no defined flags to futex2_waitv.
+pub const FUTEX2_FLAGS_WAITV = packed struct(u32) {
+    _reserved: u32 = 0,
+};
+
+/// As of kernel 6.14 there are no defined flags to futex2_requeue.
+pub const FUTEX2_FLAGS_REQUEUE = packed struct(u32) {
+    _reserved: u32 = 0,
+};
+
+/// Flags for the futex v2 APIs: passed to futex2_wait and futex2_wake, and set per-futex in
+/// the futex2_waitone struct (not the `flags` argument of futex2_waitv or futex2_requeue).
+pub const FUTEX2_FLAGS = packed struct(u32) {
+    size: FUTEX2_SIZE,
+    numa: bool = false,
+    _reserved: u4 = 0,
+    private: bool,
+    _undefined: u24 = 0,
+};
 
 pub const PROT = struct {
     /// page can not be accessed
@@ -9343,17 +9365,17 @@ pub const PTRACE = struct {
     pub const GET_SYSCALL_INFO = 0x420e;
 };
 
-/// A waiter for vectorized wait.
-pub const futex_waitv = extern struct {
-    // Expected value at uaddr
+/// For futex2_waitv and futex2_requeue. Arrays of `futex2_waitone` allow
+/// waiting on multiple futexes in one call.
+pub const futex2_waitone = extern struct {
+    /// Expected value at uaddr, should match size of futex.
     val: u64,
-    /// User address to wait on.
+    /// User address to wait on.  Top-bits must be 0 on 32-bit.
     uaddr: u64,
     /// Flags for this waiter.
-    flags: u32,
+    flags: FUTEX2_FLAGS,
     /// Reserved member to preserve alignment.
-    /// Should be 0.
-    __reserved: u32,
+    __reserved: u32 = 0,
 };
 
 pub const cache_stat_range = extern struct {