Commit 9295355985

Andrew Kelley <andrew@ziglang.org>
2023-04-26 22:41:02
LLVM backend: optimize memset with comptime-known element
When the element is comptime-known, we can check if it has a repeated byte representation. In this case, `@memset` can be lowered with the LLVM intrinsic rather than with a loop.
1 parent 51adbf4
Changed files (4)
src
test
behavior
src/codegen/llvm.zig
@@ -8424,28 +8424,45 @@ pub const FuncGen = struct {
         const dest_slice = try self.resolveInst(bin_op.lhs);
         const ptr_ty = self.air.typeOf(bin_op.lhs);
         const elem_ty = self.air.typeOf(bin_op.rhs);
-        const target = self.dg.module.getTarget();
-        const val_is_undef = if (self.air.value(bin_op.rhs)) |val| val.isUndefDeep() else false;
+        const module = self.dg.module;
+        const target = module.getTarget();
         const dest_ptr_align = ptr_ty.ptrAlignment(target);
         const u8_llvm_ty = self.context.intType(8);
         const dest_ptr = self.sliceOrArrayPtr(dest_slice, ptr_ty);
         const is_volatile = ptr_ty.isVolatilePtr();
 
-        if (val_is_undef) {
-            // Even if safety is disabled, we still emit a memset to undefined since it conveys
-            // extra information to LLVM. However, safety makes the difference between using
-            // 0xaa or actual undefined for the fill byte.
-            const fill_byte = if (safety)
-                u8_llvm_ty.constInt(0xaa, .False)
-            else
-                u8_llvm_ty.getUndef();
-            const len = self.sliceOrArrayLenInBytes(dest_slice, ptr_ty);
-            _ = self.builder.buildMemSet(dest_ptr, fill_byte, len, dest_ptr_align, is_volatile);
+        if (self.air.value(bin_op.rhs)) |elem_val| {
+            if (elem_val.isUndefDeep()) {
+                // Even if safety is disabled, we still emit a memset to undefined since it conveys
+                // extra information to LLVM. However, safety makes the difference between using
+                // 0xaa or actual undefined for the fill byte.
+                const fill_byte = if (safety)
+                    u8_llvm_ty.constInt(0xaa, .False)
+                else
+                    u8_llvm_ty.getUndef();
+                const len = self.sliceOrArrayLenInBytes(dest_slice, ptr_ty);
+                _ = self.builder.buildMemSet(dest_ptr, fill_byte, len, dest_ptr_align, is_volatile);
 
-            if (safety and self.dg.module.comp.bin_file.options.valgrind) {
-                self.valgrindMarkUndef(dest_ptr, len);
+                if (safety and module.comp.bin_file.options.valgrind) {
+                    self.valgrindMarkUndef(dest_ptr, len);
+                }
+                return null;
+            }
+
+            // Test if the element value is compile-time known to be a
+            // repeating byte pattern, for example, `@as(u64, 0)` has a
+            // repeating byte pattern of 0 bytes. In such case, the memset
+            // intrinsic can be used.
+            var value_buffer: Value.Payload.U64 = undefined;
+            if (try elem_val.hasRepeatedByteRepr(elem_ty, module, &value_buffer)) |byte_val| {
+                const fill_byte = try self.resolveValue(.{
+                    .ty = Type.u8,
+                    .val = byte_val,
+                });
+                const len = self.sliceOrArrayLenInBytes(dest_slice, ptr_ty);
+                _ = self.builder.buildMemSet(dest_ptr, fill_byte, len, dest_ptr_align, is_volatile);
+                return null;
             }
-            return null;
         }
 
         const value = try self.resolveInst(bin_op.rhs);
src/Sema.zig
@@ -26953,9 +26953,11 @@ fn storePtrVal(
             defer sema.gpa.free(buffer);
             reinterpret.val_ptr.*.writeToMemory(mut_kit.ty, sema.mod, buffer) catch |err| switch (err) {
                 error.ReinterpretDeclRef => unreachable,
+                error.IllDefinedMemoryLayout => unreachable, // Sema was supposed to emit a compile error already
             };
             operand_val.writeToMemory(operand_ty, sema.mod, buffer[reinterpret.byte_offset..]) catch |err| switch (err) {
                 error.ReinterpretDeclRef => unreachable,
+                error.IllDefinedMemoryLayout => unreachable, // Sema was supposed to emit a compile error already
             };
 
             const arena = mut_kit.beginArena(sema.mod);
@@ -27905,6 +27907,7 @@ fn bitCastVal(
     defer sema.gpa.free(buffer);
     val.writeToMemory(old_ty, sema.mod, buffer) catch |err| switch (err) {
         error.ReinterpretDeclRef => return null,
+        error.IllDefinedMemoryLayout => unreachable, // Sema was supposed to emit a compile error already
     };
     return try Value.readFromMemory(new_ty, sema.mod, buffer[buffer_offset..], sema.arena);
 }
src/value.zig
@@ -1278,7 +1278,10 @@ pub const Value = extern union {
     ///
     /// Asserts that buffer.len >= ty.abiSize(). The buffer is allowed to extend past
     /// the end of the value in memory.
-    pub fn writeToMemory(val: Value, ty: Type, mod: *Module, buffer: []u8) error{ReinterpretDeclRef}!void {
+    pub fn writeToMemory(val: Value, ty: Type, mod: *Module, buffer: []u8) error{
+        ReinterpretDeclRef,
+        IllDefinedMemoryLayout,
+    }!void {
         const target = mod.getTarget();
         const endian = target.cpu.arch.endian();
         if (val.isUndef()) {
@@ -1345,7 +1348,7 @@ pub const Value = extern union {
                 return writeToPackedMemory(val, ty, mod, buffer[0..byte_count], 0);
             },
             .Struct => switch (ty.containerLayout()) {
-                .Auto => unreachable, // Sema is supposed to have emitted a compile error already
+                .Auto => return error.IllDefinedMemoryLayout,
                 .Extern => {
                     const fields = ty.structFields().values();
                     const field_vals = val.castTag(.aggregate).?.data;
@@ -1366,7 +1369,7 @@ pub const Value = extern union {
                 std.mem.writeInt(Int, buffer[0..@sizeOf(Int)], @intCast(Int, int), endian);
             },
             .Union => switch (ty.containerLayout()) {
-                .Auto => unreachable,
+                .Auto => return error.IllDefinedMemoryLayout,
                 .Extern => @panic("TODO implement writeToMemory for extern unions"),
                 .Packed => {
                     const byte_count = (@intCast(usize, ty.bitSize(target)) + 7) / 8;
@@ -5381,6 +5384,35 @@ pub const Value = extern union {
         }
     }
 
+    /// If the value is represented in-memory as a series of bytes that all
+    /// have the same value, return that byte value, otherwise null. The result
+    /// points into `value_buffer`, so the buffer must outlive it. Also returns
+    /// null when the ABI size of `ty` does not fit in the host's `usize`.
+    pub fn hasRepeatedByteRepr(val: Value, ty: Type, mod: *Module, value_buffer: *Payload.U64) !?Value {
+        const target = mod.getTarget();
+        // ABI sizes are target-sized; allocation lengths are host `usize`.
+        const abi_size = std.math.cast(usize, ty.abiSize(target)) orelse return null;
+        assert(abi_size >= 1);
+        const byte_buffer = try mod.gpa.alloc(u8, abi_size);
+        defer mod.gpa.free(byte_buffer);
+
+        writeToMemory(val, ty, mod, byte_buffer) catch |err| switch (err) {
+            error.ReinterpretDeclRef => return null,
+            // TODO: The writeToMemory function was originally created for the purpose
+            // of comptime pointer casting. However, it is now additionally being used
+            // for checking the actual memory layout that will be generated by machine
+            // code late in compilation. So, this error handling is too aggressive and
+            // causes some false negatives, causing less-than-ideal code generation.
+            error.IllDefinedMemoryLayout => return null,
+        };
+        const first_byte = byte_buffer[0];
+        for (byte_buffer[1..]) |byte| {
+            if (byte != first_byte) return null;
+        }
+        value_buffer.* = .{ .base = .{ .tag = .int_u64 }, .data = first_byte };
+        return initPayload(&value_buffer.base);
+    }
+
     /// This type is not copyable since it may contain pointers to its inner data.
     pub const Payload = struct {
         tag: Tag,
test/behavior/memset.zig
@@ -94,7 +94,7 @@ test "memset with 1-byte array element" {
     try expect(buf[4][0]);
 }
 
-test "memset with large array element" {
+test "memset with large array element, runtime known" {
     const A = [128]u64;
     var buf: [5]A = undefined;
     var runtime_known_element = [_]u64{0} ** 128;
@@ -106,6 +106,18 @@ test "memset with large array element" {
     for (buf[4]) |elem| try expect(elem == 0);
 }
 
+test "memset with large array element, comptime known" {
+    const A = [128]u64; // 128 * 8 = 1024 bytes per element, far larger than one byte
+    var buf: [5]A = undefined;
+    const comptime_known_element = [_]u64{0} ** 128; // `const` with a constant initializer: the all-zero fill value is comptime-known
+    @memset(&buf, comptime_known_element); // fill all 5 elements of `buf` with the zeroed array
+    for (buf[0]) |elem| try expect(elem == 0); // each loop below checks all 128 u64s of one element
+    for (buf[1]) |elem| try expect(elem == 0);
+    for (buf[2]) |elem| try expect(elem == 0);
+    for (buf[3]) |elem| try expect(elem == 0);
+    for (buf[4]) |elem| try expect(elem == 0); // last element: confirms the fill covered the whole destination
+}
+
 test "memcpy and memset intrinsics" {
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;