Commit ff74127526

Jacob Young <jacobly0@users.noreply.github.com>
2025-02-17 11:35:57
x86_64: implement prefetch
1 parent 82eedf5
Changed files (4)
src/arch/x86_64/CodeGen.zig
@@ -2484,7 +2484,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
             .reduce           => try cg.airReduce(inst),
             .reduce_optimized => try cg.airReduce(inst),
             .aggregate_init   => try cg.airAggregateInit(inst),
-            .prefetch         => try cg.airPrefetch(inst),
             // zig fmt: on
 
             .arg => if (cg.debug_output != .none) {
@@ -76418,6 +76417,33 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
                 }, cg);
                 try res.finish(inst, &.{extra.init}, &ops, cg);
             },
+            .prefetch => { // lower an AIR prefetch hint; at most one prefetch instruction is emitted and the result is void
+                const prefetch = air_datas[@intFromEnum(inst)].prefetch;
+                var ops = try cg.tempsFromOperands(inst, .{prefetch.ptr}); // ops[0] is the temp for the pointer operand
+                switch (prefetch.cache) {
+                    .instruction => {}, // prefetchi requires rip-relative addressing, which is currently non-trivial to emit from an arbitrary ptr value
+                    .data => if (prefetch.rw == .write and prefetch.locality <= 2 and cg.hasFeature(.prefetchwt1)) { // write intent with locality 0..2: use the `_wt1` form when the target has prefetchwt1
+                        try ops[0].toSlicePtr(cg);
+                        while (try ops[0].toLea(cg)) {} // keep calling toLea until it returns false
+                        try cg.asmMemory(.{ ._wt1, .prefetch }, try ops[0].tracking(cg).short.deref().mem(cg, .{ .size = .byte })); // byte-sized memory operand obtained by dereferencing the pointer
+                    } else if (prefetch.rw == .write and cg.hasFeature(.prfchw)) { // remaining write-intent cases: use the `_w` form when the target has prfchw
+                        try ops[0].toSlicePtr(cg);
+                        while (try ops[0].toLea(cg)) {}
+                        try cg.asmMemory(.{ ._w, .prefetch }, try ops[0].tracking(cg).short.deref().mem(cg, .{ .size = .byte }));
+                    } else if (cg.hasFeature(.sse) or cg.hasFeature(.prfchw) or cg.hasFeature(.prefetchi) or cg.hasFeature(.prefetchwt1)) { // everything else uses the locality hints; with none of these features nothing is emitted
+                        try ops[0].toSlicePtr(cg);
+                        while (try ops[0].toLea(cg)) {}
+                        switch (prefetch.locality) { // locality 0, 1, 2, 3 selects the NTA, T2, T1, T0 hint respectively
+                            0 => try cg.asmMemory(.{ ._nta, .prefetch }, try ops[0].tracking(cg).short.deref().mem(cg, .{ .size = .byte })),
+                            1 => try cg.asmMemory(.{ ._t2, .prefetch }, try ops[0].tracking(cg).short.deref().mem(cg, .{ .size = .byte })),
+                            2 => try cg.asmMemory(.{ ._t1, .prefetch }, try ops[0].tracking(cg).short.deref().mem(cg, .{ .size = .byte })),
+                            3 => try cg.asmMemory(.{ ._t0, .prefetch }, try ops[0].tracking(cg).short.deref().mem(cg, .{ .size = .byte })),
+                        }
+                    },
+                }
+                const res = try cg.tempInit(.void, .none); // the instruction is finished with a void result temp
+                try res.finish(inst, &.{prefetch.ptr}, &ops, cg);
+            },
             .mul_add => |air_tag| if (use_old) try cg.airMulAdd(inst) else {
                 const pl_op = air_datas[@intFromEnum(inst)].pl_op;
                 const bin_op = cg.air.extraData(Air.Bin, pl_op.payload).data;
@@ -94743,11 +94769,6 @@ fn airUnionInit(self: *CodeGen, inst: Air.Inst.Index) !void {
     return self.finishAir(inst, result, .{ extra.init, .none, .none });
 }
 
-fn airPrefetch(self: *CodeGen, inst: Air.Inst.Index) !void {
-    const prefetch = self.air.instructions.items(.data)[@intFromEnum(inst)].prefetch;
-    return self.finishAir(inst, .unreach, .{ prefetch.ptr, .none, .none });
-}
-
 fn airMulAdd(self: *CodeGen, inst: Air.Inst.Index) !void {
     const pt = self.pt;
     const zcu = pt.zcu;
src/arch/x86_64/Encoding.zig
@@ -78,7 +78,7 @@ pub fn findByMnemonic(
                 ),
                 .x86_64 => false,
             },
-            inline .@"invpcid 64bit", .@"rdpid 64bit" => |tag| switch (target.cpu.arch) {
+            inline .@"invpcid 64bit", .@"rdpid 64bit", .@"prefetchi 64bit" => |tag| switch (target.cpu.arch) {
                 else => unreachable,
                 .x86 => false,
                 .x86_64 => std.Target.x86.featureSetHas(
@@ -86,6 +86,7 @@ pub fn findByMnemonic(
                     @field(std.Target.x86.Feature, @tagName(tag)[0 .. @tagName(tag).len - " 64bit".len]),
                 ),
             },
+            .prefetch => std.Target.x86.featureSetHasAny(target.cpu.features, .{ .sse, .prfchw, .prefetchi, .prefetchwt1 }),
             inline else => |tag| has_features: {
                 comptime var feature_it = std.mem.splitScalar(u8, @tagName(tag), ' ');
                 comptime var features: []const std.Target.x86.Feature = &.{};
@@ -375,6 +376,7 @@ pub const Mnemonic = enum {
     orps,
     pextrw, pinsrw,
     pmaxsw, pmaxub, pminsw, pminub, pmovmskb,
+    prefetchit0, prefetchit1, prefetchnta, prefetcht0, prefetcht1, prefetcht2, prefetchw, prefetchwt1,
     shufps,
     sqrtps, sqrtss,
     stmxcsr,
@@ -562,8 +564,7 @@ pub const Op = enum {
     r32_m8, r32_m16, r64_m16,
     m8, m16, m32, m64, m80, m128, m256,
     rel8, rel16, rel32,
-    m,
-    moffs,
+    m, moffs, mrip8,
     sreg,
     st0, st, mm, mm_m64,
     xmm0, xmm, xmm_m8, xmm_m16, xmm_m32, xmm_m64, xmm_m128,
@@ -617,7 +618,7 @@ pub const Op = enum {
 
             .mem => |mem| switch (mem) {
                 .moffs => .moffs,
-                .sib, .rip => switch (mem.bitSize(target)) {
+                .sib => switch (mem.bitSize(target)) {
                     0 => .m,
                     8 => .m8,
                     16 => .m16,
@@ -628,6 +629,16 @@ pub const Op = enum {
                     256 => .m256,
                     else => unreachable,
                 },
+                .rip => switch (mem.bitSize(target)) {
+                    0, 8 => .mrip8,
+                    16 => .m16,
+                    32 => .m32,
+                    64 => .m64,
+                    80 => .m80,
+                    128 => .m128,
+                    256 => .m256,
+                    else => unreachable,
+                },
             },
 
             .imm => |imm| switch (imm) {
@@ -680,7 +691,7 @@ pub const Op = enum {
 
     pub fn immBitSize(op: Op) u64 {
         return switch (op) {
-            .none, .moffs, .m, .sreg => unreachable,
+            .none, .m, .moffs, .mrip8, .sreg => unreachable,
             .al, .cl, .dx, .rip, .eip, .ip, .r8, .rm8, .r32_m8 => unreachable,
             .ax, .r16, .rm16 => unreachable,
             .eax, .r32, .rm32, .r32_m16 => unreachable,
@@ -700,7 +711,7 @@ pub const Op = enum {
 
     pub fn regBitSize(op: Op) u64 {
         return switch (op) {
-            .none, .moffs, .m, .sreg => unreachable,
+            .none, .m, .moffs, .mrip8, .sreg => unreachable,
             .unity, .imm8, .imm8s, .imm16, .imm16s, .imm32, .imm32s, .imm64 => unreachable,
             .rel8, .rel16, .rel32 => unreachable,
             .m8, .m16, .m32, .m64, .m80, .m128, .m256 => unreachable,
@@ -716,13 +727,13 @@ pub const Op = enum {
 
     pub fn memBitSize(op: Op) u64 {
         return switch (op) {
-            .none, .moffs, .m, .sreg => unreachable,
+            .none, .m, .moffs, .sreg => unreachable,
             .unity, .imm8, .imm8s, .imm16, .imm16s, .imm32, .imm32s, .imm64 => unreachable,
             .rel8, .rel16, .rel32 => unreachable,
             .al, .cl, .r8, .ax, .dx, .ip, .r16, .eax, .eip, .r32, .rax, .rip, .r64 => unreachable,
             .st0, .st, .mm, .xmm0, .xmm, .ymm => unreachable,
             .cr, .dr => unreachable,
-            .m8, .rm8, .r32_m8, .xmm_m8 => 8,
+            .mrip8, .m8, .rm8, .r32_m8, .xmm_m8 => 8,
             .m16, .rm16, .r32_m16, .r64_m16, .xmm_m16 => 16,
             .m32, .rm32, .xmm_m32 => 32,
             .m64, .rm64, .mm_m64, .xmm_m64 => 64,
@@ -783,7 +794,7 @@ pub const Op = enum {
             .rm8, .rm16, .rm32, .rm64,
             .r32_m8, .r32_m16, .r64_m16,
             .m8, .m16, .m32, .m64, .m80, .m128, .m256,
-            .m,
+            .m, .moffs, .mrip8,
             .mm_m64,
             .xmm_m8, .xmm_m16, .xmm_m32, .xmm_m64, .xmm_m128,
             .ymm_m256,
@@ -821,11 +832,7 @@ pub const Op = enum {
     /// Given an operand `op` checks if `target` is a subset for the purposes of the encoding.
     pub fn isSubset(op: Op, target: Op) bool {
         switch (op) {
-            .moffs, .sreg => return op == target,
-            .none => switch (target) {
-                .none => return true,
-                else => return false,
-            },
+            .none, .m, .moffs, .sreg => return op == target,
             else => {
                 if (op.isRegister() and target.isRegister()) {
                     return switch (target.toReg()) {
@@ -836,6 +843,7 @@ pub const Op = enum {
                 if (op.isMemory() and target.isMemory()) {
                     switch (target) {
                         .m => return true,
+                        .moffs, .mrip8 => return op == target,
                         else => return op.memBitSize() == target.memBitSize(),
                     }
                 }
@@ -962,6 +970,10 @@ pub const Feature = enum {
     @"pclmul avx",
     pku,
     popcnt,
+    prefetch,
+    @"prefetchi 64bit",
+    prefetchwt1,
+    prfchw,
     rdrnd,
     rdseed,
     @"rdpid 32bit",
@@ -1002,7 +1014,7 @@ fn estimateInstructionLength(prefix: Prefix, encoding: Encoding, ops: []const Op
 }
 
 const mnemonic_to_encodings_map = init: {
-    @setEvalBranchQuota(5_700);
+    @setEvalBranchQuota(5_800);
     const mnemonic_count = @typeInfo(Mnemonic).@"enum".fields.len;
     var mnemonic_map: [mnemonic_count][]Data = @splat(&.{});
     const encodings = @import("encodings.zig");
src/arch/x86_64/encodings.zig
@@ -1370,6 +1370,18 @@ pub const table = [_]Entry{
     .{ .pmovmskb, .rm, &.{ .r32, .xmm }, &.{ 0x66, 0x0f, 0xd7 }, 0, .none, .sse },
     .{ .pmovmskb, .rm, &.{ .r64, .xmm }, &.{ 0x66, 0x0f, 0xd7 }, 0, .none, .sse },
 
+    .{ .prefetchit0, .m, &.{ .mrip8 }, &.{ 0x0f, 0x18 }, 7, .none, .@"prefetchi 64bit" },
+    .{ .prefetchit1, .m, &.{ .mrip8 }, &.{ 0x0f, 0x18 }, 6, .none, .@"prefetchi 64bit" },
+
+    .{ .prefetchnta, .m, &.{ .m8 }, &.{ 0x0f, 0x18 }, 0, .none, .prefetch },
+    .{ .prefetcht0,  .m, &.{ .m8 }, &.{ 0x0f, 0x18 }, 1, .none, .prefetch },
+    .{ .prefetcht1,  .m, &.{ .m8 }, &.{ 0x0f, 0x18 }, 2, .none, .prefetch },
+    .{ .prefetcht2,  .m, &.{ .m8 }, &.{ 0x0f, 0x18 }, 3, .none, .prefetch },
+
+    .{ .prefetchw, .m, &.{ .m8 }, &.{ 0x0f, 0x0d }, 1, .none, .prfchw },
+
+    .{ .prefetchwt1, .m, &.{ .m8 }, &.{ 0x0f, 0x0d }, 2, .none, .prefetchwt1 },
+
     .{ .shufps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x0f, 0xc6 }, 0, .none, .sse },
 
     .{ .sqrtps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x51 }, 0, .none, .sse },
src/arch/x86_64/Mir.zig
@@ -34,8 +34,18 @@ pub const Inst = struct {
         /// ___ 4
         _4,
 
+        /// ___ With NTA Hint
+        _nta,
         /// System Call ___
         sys_,
+        /// ___ With T0 Hint
+        _t0,
+        /// ___ With T1 Hint
+        _t1,
+        /// ___ With T2 Hint
+        _t2,
+        /// ___ With Intent to Write and T1 Hint
+        _wt1,
 
         /// ___ crement Shadow Stack Pointer Doubleword
         _csspd,
@@ -198,6 +208,7 @@ pub const Inst = struct {
         //_b,
         /// ___ Word
         /// ___ For Writing
+        /// ___ With Intent to Write
         _w,
         /// ___ Doubleword
         //_d,
@@ -975,6 +986,9 @@ pub const Inst = struct {
         /// Move unaligned packed single-precision floating-point values
         /// Move unaligned packed double-precision floating-point values
         movu,
+        /// Prefetch data into caches
+        /// Prefetch data into caches with intent to write
+        prefetch,
         /// Packed interleave shuffle of quadruplets of single-precision floating-point values
         /// Packed interleave shuffle of pairs of double-precision floating-point values
         /// Shuffle packed doublewords