Commit b7a887f0fb

dweiller <4678790+dweiller@users.noreply.github.com>
2025-01-20 07:43:42
compiler-rt: optimize memcpy
The new memcpy function aims to be more generic than the previous implementation which was adapted from an implementation optimized for x86_64 avx2 machines. Even on x86_64 avx2 machines this implementation should be generally be faster due to fewer branches in the small length cases and generating less machine code. Note that the new memcpy function no longer acts as a memmove.
1 parent d6e1166
Changed files (1)
lib
compiler_rt
lib/compiler_rt/memcpy.zig
@@ -1,4 +1,5 @@
 const std = @import("std");
+const assert = std.debug.assert;
 const common = @import("./common.zig");
 const builtin = @import("builtin");
 
@@ -17,6 +18,19 @@ comptime {
     }
 }
 
+const Element = if (std.simd.suggestVectorLength(u8)) |vec_size|
+    @Type(.{ .vector = .{
+        .child = u8,
+        .len = vec_size,
+    } })
+else
+    usize;
+
+comptime {
+    assert(@sizeOf(Element) >= @alignOf(Element));
+    assert(std.math.isPowerOfTwo(@sizeOf(Element)));
+}
+
 fn memcpySmall(noalias dest: ?[*]u8, noalias src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 {
     @setRuntimeSafety(builtin.is_test);
 
@@ -27,6 +41,173 @@ fn memcpySmall(noalias dest: ?[*]u8, noalias src: ?[*]const u8, len: usize) call
     return dest;
 }
 
-fn memcpyFast(opt_dest: ?[*]u8, opt_src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 {
-    return @call(.always_inline, @import("memmove.zig").memmove, .{ opt_dest, opt_src, len });
+fn memcpyFast(noalias dest: ?[*]u8, noalias src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 {
+    @setRuntimeSafety(builtin.is_test);
+
+    const small_limit = 2 * @sizeOf(Element);
+
+    if (copySmallLength(small_limit, dest.?, src.?, len)) return dest;
+
+    copyForwards(dest.?, src.?, len);
+
+    return dest;
+}
+
+inline fn copySmallLength(
+    comptime small_limit: comptime_int,
+    dest: [*]u8,
+    src: [*]const u8,
+    len: usize,
+) bool {
+    if (len < 16) {
+        copyLessThan16(dest, src, len);
+        return true;
+    }
+
+    if (comptime 2 < (std.math.log2(small_limit) + 1) / 2) {
+        if (copy16ToSmallLimit(small_limit, dest, src, len)) return true;
+    }
+
+    return false;
+}
+
+inline fn copyLessThan16(
+    dest: [*]u8,
+    src: [*]const u8,
+    len: usize,
+) void {
+    @setRuntimeSafety(builtin.is_test);
+    if (len < 4) {
+        if (len == 0) return;
+        dest[0] = src[0];
+        dest[len / 2] = src[len / 2];
+        dest[len - 1] = src[len - 1];
+        return;
+    }
+    copyRange4(4, dest, src, len);
+}
+
+inline fn copy16ToSmallLimit(
+    comptime small_limit: comptime_int,
+    dest: [*]u8,
+    src: [*]const u8,
+    len: usize,
+) bool {
+    @setRuntimeSafety(builtin.is_test);
+    inline for (2..(std.math.log2(small_limit) + 1) / 2 + 1) |p| {
+        const limit = 1 << (2 * p);
+        if (len < limit) {
+            copyRange4(limit / 4, dest, src, len);
+            return true;
+        }
+    }
+    return false;
+}
+
+inline fn copyForwards(
+    noalias dest: [*]u8,
+    noalias src: [*]const u8,
+    len: usize,
+) void {
+    @setRuntimeSafety(builtin.is_test);
+    assert(len >= 2 * @sizeOf(Element));
+
+    dest[0..@sizeOf(Element)].* = src[0..@sizeOf(Element)].*;
+    const alignment_offset = @alignOf(Element) - @intFromPtr(src) % @alignOf(Element);
+    const n = len - alignment_offset;
+    const d = dest + alignment_offset;
+    const s = src + alignment_offset;
+
+    copyBlocksAlignedSource(@ptrCast(d), @alignCast(@ptrCast(s)), n);
+
+    // copy last `@sizeOf(Element)` bytes unconditionally, since block copy
+    // methods only copy a multiple of `@sizeOf(Element)` bytes.
+    const offset = len - @sizeOf(Element);
+    dest[offset..][0..@sizeOf(Element)].* = src[offset..][0..@sizeOf(Element)].*;
+}
+
+inline fn copyBlocksAlignedSource(
+    noalias dest: [*]align(1) Element,
+    noalias src: [*]const Element,
+    max_bytes: usize,
+) void {
+    copyBlocks(dest, src, max_bytes);
+}
+
+/// Copies the largest multiple of `@sizeOf(T)` bytes from `src` to `dest`,
+/// that is less than `max_bytes` where `T` is the child type of `src` and
+/// `dest`.
+inline fn copyBlocks(
+    noalias dest: anytype,
+    noalias src: anytype,
+    max_bytes: usize,
+) void {
+    @setRuntimeSafety(builtin.is_test);
+
+    const T = @typeInfo(@TypeOf(dest)).pointer.child;
+    comptime assert(T == @typeInfo(@TypeOf(src)).pointer.child);
+
+    const loop_count = max_bytes / @sizeOf(T);
+
+    for (dest[0..loop_count], src[0..loop_count]) |*d, s| {
+        d.* = s;
+    }
+}
+
+/// copy `len` bytes from `src` to `dest`; `len` must be in the range
+/// `[copy_len, 4 * copy_len)`.
+inline fn copyRange4(
+    comptime copy_len: comptime_int,
+    noalias dest: [*]u8,
+    noalias src: [*]const u8,
+    len: usize,
+) void {
+    @setRuntimeSafety(builtin.is_test);
+    comptime assert(std.math.isPowerOfTwo(copy_len));
+    assert(len >= copy_len);
+    assert(len < 4 * copy_len);
+
+    const a = len & (copy_len * 2);
+    const b = a / 2;
+
+    const last = len - copy_len;
+    const pen = last - b;
+
+    dest[0..copy_len].* = src[0..copy_len].*;
+    dest[b..][0..copy_len].* = src[b..][0..copy_len].*;
+    dest[pen..][0..copy_len].* = src[pen..][0..copy_len].*;
+    dest[last..][0..copy_len].* = src[last..][0..copy_len].*;
+}
+
+test {
+    const S = struct {
+        fn testFunc(comptime copy_func: anytype) !void {
+            const max_len = 1024;
+            var buffer: [max_len + @alignOf(Element) - 1]u8 align(@alignOf(Element)) = undefined;
+            for (&buffer, 0..) |*b, i| {
+                b.* = @intCast(i % 97);
+            }
+            var dest: [max_len + @alignOf(Element) - 1]u8 align(@alignOf(Element)) = undefined;
+
+            for (0..max_len) |copy_len| {
+                for (0..@alignOf(Element)) |s_offset| {
+                    for (0..@alignOf(Element)) |d_offset| {
+                        @memset(&dest, 0xff);
+                        const s = buffer[s_offset..][0..copy_len];
+                        const d = dest[d_offset..][0..copy_len];
+                        _ = copy_func(@ptrCast(d.ptr), @ptrCast(s.ptr), s.len);
+                        std.testing.expectEqualSlices(u8, s, d) catch |e| {
+                            std.debug.print("error encountered for length={d}, s_offset={d}, d_offset={d}\n", .{
+                                copy_len, s_offset, d_offset,
+                            });
+                            return e;
+                        };
+                    }
+                }
+            }
+        }
+    };
+
+    try S.testFunc(memcpySmall);
+    try S.testFunc(memcpyFast);
 }