Commit 513c4c145e

Jacob Young <jacobly0@users.noreply.github.com>
2024-02-16 07:59:16
x86_64: fix avx2 `@truncate`
1 parent 2fcb2f5
Changed files (4)
src/arch/x86_64/CodeGen.zig
@@ -3274,8 +3274,8 @@ fn airTrunc(self: *Self, inst: Air.Inst.Index) !void {
             try self.genCopy(dst_ty, dst_mcv, src_mcv, .{});
             break :dst dst_mcv;
         } else dst: {
-            const dst_mcv = try self.allocRegOrMem(inst, true);
-            try self.genCopy(dst_ty, dst_mcv, src_mcv, .{});
+            const dst_mcv = try self.allocRegOrMemAdvanced(src_ty, inst, true);
+            try self.genCopy(src_ty, dst_mcv, src_mcv, .{});
             break :dst dst_mcv;
         };
 
@@ -3333,22 +3333,40 @@ fn airTrunc(self: *Self, inst: Air.Inst.Index) !void {
                 else => .{ .register = try self.copyToTmpRegister(Type.usize, splat_mcv.address()) },
             };
 
-            const dst_reg = registerAlias(dst_mcv.getReg().?, src_abi_size);
+            const dst_reg = dst_mcv.getReg().?;
+            const dst_alias = registerAlias(dst_reg, src_abi_size);
             if (self.hasFeature(.avx)) {
                 try self.asmRegisterRegisterMemory(
                     .{ .vp_, .@"and" },
-                    dst_reg,
-                    dst_reg,
+                    dst_alias,
+                    dst_alias,
                     try splat_addr_mcv.deref().mem(self, Memory.Size.fromSize(splat_abi_size)),
                 );
-                try self.asmRegisterRegisterRegister(mir_tag, dst_reg, dst_reg, dst_reg);
+                if (src_abi_size > 16) {
+                    const temp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.sse);
+                    const temp_lock = self.register_manager.lockRegAssumeUnused(temp_reg);
+                    defer self.register_manager.unlockReg(temp_lock);
+
+                    try self.asmRegisterRegisterImmediate(
+                        .{ if (self.hasFeature(.avx2)) .v_i128 else .v_f128, .extract },
+                        registerAlias(temp_reg, dst_abi_size),
+                        dst_alias,
+                        Immediate.u(1),
+                    );
+                    try self.asmRegisterRegisterRegister(
+                        mir_tag,
+                        registerAlias(dst_reg, dst_abi_size),
+                        registerAlias(dst_reg, dst_abi_size),
+                        registerAlias(temp_reg, dst_abi_size),
+                    );
+                } else try self.asmRegisterRegisterRegister(mir_tag, dst_alias, dst_alias, dst_alias);
             } else {
                 try self.asmRegisterMemory(
                     .{ .p_, .@"and" },
-                    dst_reg,
+                    dst_alias,
                     try splat_addr_mcv.deref().mem(self, Memory.Size.fromSize(splat_abi_size)),
                 );
-                try self.asmRegisterRegister(mir_tag, dst_reg, dst_reg);
+                try self.asmRegisterRegister(mir_tag, dst_alias, dst_alias);
             }
             break :result dst_mcv;
         }
@@ -16404,7 +16422,7 @@ fn airSplat(self: *Self, inst: Air.Inst.Index) !void {
                     },
                     65...128 => switch (vector_len) {
                         else => null,
-                        1...2 => .{ .vp_i128, .broadcast },
+                        1...2 => .{ .v_i128, .broadcast },
                     },
                 }) orelse break :avx2;
 
@@ -16418,7 +16436,7 @@ fn airSplat(self: *Self, inst: Air.Inst.Index) !void {
                     registerAlias(dst_reg, @intCast(vector_ty.abiSize(mod))),
                     try src_mcv.mem(self, self.memSize(scalar_ty)),
                 ) else {
-                    if (mir_tag[0] == .vp_i128) break :avx2;
+                    if (mir_tag[0] == .v_i128) break :avx2;
                     try self.genSetReg(dst_reg, scalar_ty, src_mcv, .{});
                     try self.asmRegisterRegister(
                         mir_tag,
src/arch/x86_64/Encoding.zig
@@ -415,7 +415,8 @@ pub const Mnemonic = enum {
     vfmadd132sd, vfmadd213sd, vfmadd231sd,
     vfmadd132ss, vfmadd213ss, vfmadd231ss,
     // AVX2
-    vpbroadcastb, vpbroadcastd, vpbroadcasti128, vpbroadcastq, vpbroadcastw,
+    vbroadcasti128, vpbroadcastb, vpbroadcastd, vpbroadcastq, vpbroadcastw,
+    vextracti128, vinserti128,
     // zig fmt: on
 };
 
src/arch/x86_64/encodings.zig
@@ -1769,6 +1769,10 @@ pub const table = [_]Entry{
     .{ .vbroadcastss, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx2 },
     .{ .vbroadcastsd, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx2 },
 
+    .{ .vextracti128, .mri, &.{ .xmm_m128, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x39 }, 0, .vex_256_w0, .avx2 },
+
+    .{ .vinserti128, .rvmi, &.{ .ymm, .ymm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x38 }, 0, .vex_256_w0, .avx2 },
+
     .{ .vpabsb, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x1c }, 0, .vex_256_wig, .avx2 },
     .{ .vpabsd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x1e }, 0, .vex_256_wig, .avx2 },
     .{ .vpabsw, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x1d }, 0, .vex_256_wig, .avx2 },
@@ -1809,7 +1813,7 @@ pub const table = [_]Entry{
     .{ .vpbroadcastd,    .rm, &.{ .ymm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x58 }, 0, .vex_256_w0, .avx2 },
     .{ .vpbroadcastq,    .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x59 }, 0, .vex_128_w0, .avx2 },
     .{ .vpbroadcastq,    .rm, &.{ .ymm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x59 }, 0, .vex_256_w0, .avx2 },
-    .{ .vpbroadcasti128, .rm, &.{ .ymm, .m128    }, &.{ 0x66, 0x0f, 0x38, 0x5a }, 0, .vex_256_w0, .avx2 },
+    .{ .vbroadcasti128,  .rm, &.{ .ymm, .m128    }, &.{ 0x66, 0x0f, 0x38, 0x5a }, 0, .vex_256_w0, .avx2 },
 
     .{ .vpcmpeqb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x74 }, 0, .vex_256_wig, .avx2 },
     .{ .vpcmpeqw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x75 }, 0, .vex_256_wig, .avx2 },
src/arch/x86_64/Mir.zig
@@ -230,6 +230,8 @@ pub const Inst = struct {
         v_d,
         /// VEX-Encoded ___ QuadWord
         v_q,
+        /// VEX-Encoded ___ Integer Data
+        v_i128,
         /// VEX-Encoded Packed ___
         vp_,
         /// VEX-Encoded Packed ___ Byte
@@ -242,8 +244,6 @@ pub const Inst = struct {
         vp_q,
         /// VEX-Encoded Packed ___ Double Quadword
         vp_dq,
-        /// VEX-Encoded Packed ___ Integer Data
-        vp_i128,
         /// VEX-Encoded ___ Scalar Single-Precision Values
         v_ss,
         /// VEX-Encoded ___ Packed Single-Precision Values
@@ -654,6 +654,7 @@ pub const Inst = struct {
         /// Variable blend scalar double-precision floating-point values
         blendv,
         /// Extract packed floating-point values
+        /// Extract packed integer values
         extract,
         /// Insert scalar single-precision floating-point value
         /// Insert packed floating-point values
@@ -696,6 +697,7 @@ pub const Inst = struct {
         sha256rnds2,
 
         /// Load with broadcast floating-point data
+        /// Load integer and broadcast
         broadcast,
 
         /// Convert 16-bit floating-point values to single-precision floating-point values