Commit 6778da4516

Jacob Young <jacobly0@users.noreply.github.com>
2023-05-08 02:42:46
x86_64: implement binary operations for `f16` and `f16` vectors
1 parent f8708e2
src/arch/x86_64/CodeGen.zig
@@ -4497,14 +4497,15 @@ fn airFloatSign(self: *Self, inst: Air.Inst.Index) !void {
     const tag = self.air.instructions.items(.tag)[inst];
     try self.genBinOpMir(switch (ty_bits) {
         // No point using an extra prefix byte for *pd which performs the same operation.
-        32, 64 => switch (tag) {
+        16, 32, 64, 128 => switch (tag) {
             .neg => .xorps,
             .fabs => .andnps,
             else => unreachable,
         },
-        else => return self.fail("TODO implement airFloatSign for {}", .{
+        80 => return self.fail("TODO implement airFloatSign for {}", .{
             ty.fmt(self.bin_file.options.module.?),
         }),
+        else => unreachable,
     }, vec_ty, dst_mcv, sign_mcv);
     return self.finishAir(inst, dst_mcv, .{ un_op, .none, .none });
 }
@@ -6112,9 +6113,53 @@ fn genBinOp(
         return dst_mcv;
     }
 
+    const dst_reg = registerAlias(dst_mcv.getReg().?, abi_size);
     const mir_tag = if (@as(?Mir.Inst.Tag, switch (lhs_ty.zigTypeTag()) {
         else => unreachable,
         .Float => switch (lhs_ty.floatBits(self.target.*)) {
+            16 => if (self.hasFeature(.f16c)) {
+                const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128();
+                const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+                defer self.register_manager.unlockReg(tmp_lock);
+
+                if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
+                    .vpinsrw,
+                    dst_reg,
+                    dst_reg,
+                    src_mcv.mem(.word),
+                    Immediate.u(1),
+                ) else try self.asmRegisterRegisterRegister(
+                    .vpunpcklwd,
+                    dst_reg,
+                    dst_reg,
+                    (if (src_mcv.isRegister())
+                        src_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
+                );
+                try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg);
+                try self.asmRegisterRegister(.vmovshdup, tmp_reg, dst_reg);
+                try self.asmRegisterRegisterRegister(
+                    switch (air_tag) {
+                        .add => .vaddss,
+                        .sub => .vsubss,
+                        .div_float, .div_trunc, .div_floor, .div_exact => .vdivss,
+                        .max => .vmaxss,
+                        .min => .vminss,
+                        else => unreachable,
+                    },
+                    dst_reg,
+                    dst_reg,
+                    tmp_reg,
+                );
+                try self.asmRegisterRegisterImmediate(
+                    .vcvtps2ph,
+                    dst_reg,
+                    dst_reg,
+                    Immediate.u(0b1_00),
+                );
+                return dst_mcv;
+            } else null,
             32 => switch (air_tag) {
                 .add => if (self.hasFeature(.avx)) .vaddss else .addss,
                 .sub => if (self.hasFeature(.avx)) .vsubss else .subss,
@@ -6141,12 +6186,178 @@ fn genBinOp(
                 .min => if (self.hasFeature(.avx)) .vminsd else .minsd,
                 else => unreachable,
             },
-            16, 80, 128 => null,
+            80, 128 => null,
             else => unreachable,
         },
         .Vector => switch (lhs_ty.childType().zigTypeTag()) {
             else => null,
             .Float => switch (lhs_ty.childType().floatBits(self.target.*)) {
+                16 => if (self.hasFeature(.f16c)) switch (lhs_ty.vectorLen()) {
+                    1 => {
+                        const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128();
+                        const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+                        defer self.register_manager.unlockReg(tmp_lock);
+
+                        if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
+                            .vpinsrw,
+                            dst_reg,
+                            dst_reg,
+                            src_mcv.mem(.word),
+                            Immediate.u(1),
+                        ) else try self.asmRegisterRegisterRegister(
+                            .vpunpcklwd,
+                            dst_reg,
+                            dst_reg,
+                            (if (src_mcv.isRegister())
+                                src_mcv.getReg().?
+                            else
+                                try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
+                        );
+                        try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg);
+                        try self.asmRegisterRegister(.vmovshdup, tmp_reg, dst_reg);
+                        try self.asmRegisterRegisterRegister(
+                            switch (air_tag) {
+                                .add => .vaddss,
+                                .sub => .vsubss,
+                                .div_float, .div_trunc, .div_floor, .div_exact => .vdivss,
+                                .max => .vmaxss,
+                                .min => .vminss,
+                                else => unreachable,
+                            },
+                            dst_reg,
+                            dst_reg,
+                            tmp_reg,
+                        );
+                        try self.asmRegisterRegisterImmediate(
+                            .vcvtps2ph,
+                            dst_reg,
+                            dst_reg,
+                            Immediate.u(0b1_00),
+                        );
+                        return dst_mcv;
+                    },
+                    2 => {
+                        const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128();
+                        const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+                        defer self.register_manager.unlockReg(tmp_lock);
+
+                        if (src_mcv.isMemory()) try self.asmRegisterMemoryImmediate(
+                            .vpinsrd,
+                            dst_reg,
+                            src_mcv.mem(.dword),
+                            Immediate.u(1),
+                        ) else try self.asmRegisterRegisterRegister(
+                            .vunpcklps,
+                            dst_reg,
+                            dst_reg,
+                            (if (src_mcv.isRegister())
+                                src_mcv.getReg().?
+                            else
+                                try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
+                        );
+                        try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg);
+                        try self.asmRegisterRegisterRegister(.vmovhlps, tmp_reg, dst_reg, dst_reg);
+                        try self.asmRegisterRegisterRegister(
+                            switch (air_tag) {
+                                .add => .vaddps,
+                                .sub => .vsubps,
+                                .div_float, .div_trunc, .div_floor, .div_exact => .vdivps,
+                                .max => .vmaxps,
+                                .min => .vminps,
+                                else => unreachable,
+                            },
+                            dst_reg,
+                            dst_reg,
+                            tmp_reg,
+                        );
+                        try self.asmRegisterRegisterImmediate(
+                            .vcvtps2ph,
+                            dst_reg,
+                            dst_reg,
+                            Immediate.u(0b1_00),
+                        );
+                        return dst_mcv;
+                    },
+                    3...4 => {
+                        const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128();
+                        const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+                        defer self.register_manager.unlockReg(tmp_lock);
+
+                        try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg);
+                        if (src_mcv.isMemory()) try self.asmRegisterMemory(
+                            .vcvtph2ps,
+                            tmp_reg,
+                            src_mcv.mem(.qword),
+                        ) else try self.asmRegisterRegister(
+                            .vcvtph2ps,
+                            tmp_reg,
+                            (if (src_mcv.isRegister())
+                                src_mcv.getReg().?
+                            else
+                                try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
+                        );
+                        try self.asmRegisterRegisterRegister(
+                            switch (air_tag) {
+                                .add => .vaddps,
+                                .sub => .vsubps,
+                                .div_float, .div_trunc, .div_floor, .div_exact => .vdivps,
+                                .max => .vmaxps,
+                                .min => .vminps,
+                                else => unreachable,
+                            },
+                            dst_reg,
+                            dst_reg,
+                            tmp_reg,
+                        );
+                        try self.asmRegisterRegisterImmediate(
+                            .vcvtps2ph,
+                            dst_reg,
+                            dst_reg,
+                            Immediate.u(0b1_00),
+                        );
+                        return dst_mcv;
+                    },
+                    5...8 => {
+                        const tmp_reg = (try self.register_manager.allocReg(null, sse)).to256();
+                        const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+                        defer self.register_manager.unlockReg(tmp_lock);
+
+                        try self.asmRegisterRegister(.vcvtph2ps, dst_reg.to256(), dst_reg);
+                        if (src_mcv.isMemory()) try self.asmRegisterMemory(
+                            .vcvtph2ps,
+                            tmp_reg,
+                            src_mcv.mem(.xword),
+                        ) else try self.asmRegisterRegister(
+                            .vcvtph2ps,
+                            tmp_reg,
+                            (if (src_mcv.isRegister())
+                                src_mcv.getReg().?
+                            else
+                                try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
+                        );
+                        try self.asmRegisterRegisterRegister(
+                            switch (air_tag) {
+                                .add => .vaddps,
+                                .sub => .vsubps,
+                                .div_float, .div_trunc, .div_floor, .div_exact => .vdivps,
+                                .max => .vmaxps,
+                                .min => .vminps,
+                                else => unreachable,
+                            },
+                            dst_reg.to256(),
+                            dst_reg.to256(),
+                            tmp_reg,
+                        );
+                        try self.asmRegisterRegisterImmediate(
+                            .vcvtps2ph,
+                            dst_reg,
+                            dst_reg.to256(),
+                            Immediate.u(0b1_00),
+                        );
+                        return dst_mcv;
+                    },
+                    else => null,
+                } else null,
                 32 => switch (lhs_ty.vectorLen()) {
                     1 => switch (air_tag) {
                         .add => if (self.hasFeature(.avx)) .vaddss else .addss,
@@ -6223,14 +6434,13 @@ fn genBinOp(
                     } else null,
                     else => null,
                 },
-                16, 80, 128 => null,
+                80, 128 => null,
                 else => unreachable,
             },
         },
     })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{
         @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
     });
-    const dst_reg = registerAlias(dst_mcv.getReg().?, abi_size);
     if (self.hasFeature(.avx)) {
         const src1_alias =
             if (copied_to_dst) dst_reg else registerAlias(lhs_mcv.getReg().?, abi_size);
@@ -7139,21 +7349,21 @@ fn airCmp(self: *Self, inst: Air.Inst.Index, op: math.CompareOperator) !void {
                         const tmp2_lock = self.register_manager.lockRegAssumeUnused(tmp2_reg);
                         defer self.register_manager.unlockReg(tmp2_lock);
 
-                        if (src_mcv.isRegister())
-                            try self.asmRegisterRegisterRegister(
-                                .vpunpcklwd,
-                                tmp1_reg,
-                                dst_reg.to128(),
-                                src_mcv.getReg().?.to128(),
-                            )
-                        else
-                            try self.asmRegisterRegisterMemoryImmediate(
-                                .vpinsrw,
-                                tmp1_reg,
-                                dst_reg.to128(),
-                                src_mcv.mem(.word),
-                                Immediate.u(1),
-                            );
+                        if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
+                            .vpinsrw,
+                            tmp1_reg,
+                            dst_reg.to128(),
+                            src_mcv.mem(.word),
+                            Immediate.u(1),
+                        ) else try self.asmRegisterRegisterRegister(
+                            .vpunpcklwd,
+                            tmp1_reg,
+                            dst_reg.to128(),
+                            (if (src_mcv.isRegister())
+                                src_mcv.getReg().?
+                            else
+                                try self.copyToTmpRegister(ty, src_mcv)).to128(),
+                        );
                         try self.asmRegisterRegister(.vcvtph2ps, tmp1_reg, tmp1_reg);
                         try self.asmRegisterRegister(.vmovshdup, tmp2_reg, tmp1_reg);
                         try self.genBinOpMir(.ucomiss, ty, tmp1_mcv, tmp2_mcv);
@@ -8139,7 +8349,16 @@ fn movMirTag(self: *Self, ty: Type, aligned: bool) !Mir.Inst.Tag {
         },
         .Vector => switch (ty.childType().zigTypeTag()) {
             .Float => switch (ty.childType().floatBits(self.target.*)) {
-                16 => unreachable, // needs special handling
+                16 => switch (ty.vectorLen()) {
+                    1 => unreachable, // needs special handling
+                    2 => return if (self.hasFeature(.avx)) .vmovss else .movss,
+                    3...4 => return if (self.hasFeature(.avx)) .vmovsd else .movsd,
+                    5...8 => return if (self.hasFeature(.avx))
+                        if (aligned) .vmovaps else .vmovups
+                    else if (aligned) .movaps else .movups,
+                    9...16 => if (self.hasFeature(.avx)) return if (aligned) .vmovaps else .vmovups,
+                    else => {},
+                },
                 32 => switch (ty.vectorLen()) {
                     1 => return if (self.hasFeature(.avx)) .vmovss else .movss,
                     2...4 => return if (self.hasFeature(.avx))
src/arch/x86_64/Encoding.zig
@@ -270,7 +270,7 @@ pub const Mnemonic = enum {
     divps, divss,
     maxps, maxss,
     minps, minss,
-    movaps, movss, movups,
+    movaps, movhlps, movss, movups,
     mulps, mulss,
     orps,
     pextrw, pinsrw,
@@ -303,6 +303,8 @@ pub const Mnemonic = enum {
     // SSE3
     movddup, movshdup, movsldup,
     // SSE4.1
+    pextrb, pextrd, pextrq,
+    pinsrb, pinsrd, pinsrq,
     roundpd, roundps, roundsd, roundss,
     // AVX
     vaddpd, vaddps, vaddsd, vaddss,
@@ -311,13 +313,14 @@ pub const Mnemonic = enum {
     vmaxpd, vmaxps, vmaxsd, vmaxss,
     vminpd, vminps, vminsd, vminss,
     vmovapd, vmovaps,
-    vmovddup,
+    vmovddup, vmovhlps,
     vmovsd,
     vmovshdup, vmovsldup,
     vmovss,
     vmovupd, vmovups,
     vmulpd, vmulps, vmulsd, vmulss,
-    vpextrw, vpinsrw,
+    vpextrb, vpextrd, vpextrq, vpextrw,
+    vpinsrb, vpinsrd, vpinsrq, vpinsrw,
     vpshufhw, vpshuflw,
     vpsrld, vpsrlq, vpsrlw,
     vpunpckhbw, vpunpckhdq, vpunpckhqdq, vpunpckhwd,
@@ -359,7 +362,7 @@ pub const Op = enum {
     cl,
     r8, r16, r32, r64,
     rm8, rm16, rm32, rm64,
-    r32_m16, r64_m16,
+    r32_m8, r32_m16, r64_m16,
     m8, m16, m32, m64, m80, m128, m256,
     rel8, rel16, rel32,
     m,
@@ -444,7 +447,7 @@ pub const Op = enum {
     pub fn immBitSize(op: Op) u64 {
         return switch (op) {
             .none, .o16, .o32, .o64, .moffs, .m, .sreg => unreachable,
-            .al, .cl, .r8, .rm8 => unreachable,
+            .al, .cl, .r8, .rm8, .r32_m8 => unreachable,
             .ax, .r16, .rm16 => unreachable,
             .eax, .r32, .rm32, .r32_m16 => unreachable,
             .rax, .r64, .rm64, .r64_m16 => unreachable,
@@ -467,7 +470,7 @@ pub const Op = enum {
             .m8, .m16, .m32, .m64, .m80, .m128, .m256 => unreachable,
             .al, .cl, .r8, .rm8 => 8,
             .ax, .r16, .rm16 => 16,
-            .eax, .r32, .rm32, .r32_m16 => 32,
+            .eax, .r32, .rm32, .r32_m8, .r32_m16 => 32,
             .rax, .r64, .rm64, .r64_m16 => 64,
             .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => 128,
             .ymm, .ymm_m256 => 256,
@@ -480,7 +483,7 @@ pub const Op = enum {
             .unity, .imm8, .imm8s, .imm16, .imm16s, .imm32, .imm32s, .imm64 => unreachable,
             .rel8, .rel16, .rel32 => unreachable,
             .al, .cl, .r8, .ax, .r16, .eax, .r32, .rax, .r64, .xmm, .ymm => unreachable,
-            .m8, .rm8 => 8,
+            .m8, .rm8, .r32_m8 => 8,
             .m16, .rm16, .r32_m16, .r64_m16 => 16,
             .m32, .rm32, .xmm_m32 => 32,
             .m64, .rm64, .xmm_m64 => 64,
@@ -509,7 +512,7 @@ pub const Op = enum {
             .al, .ax, .eax, .rax,
             .r8, .r16, .r32, .r64,
             .rm8, .rm16, .rm32, .rm64,
-            .r32_m16, .r64_m16,
+            .r32_m8, .r32_m16, .r64_m16,
             .xmm, .xmm_m32, .xmm_m64, .xmm_m128,
             .ymm, .ymm_m256,
             => true,
@@ -535,7 +538,7 @@ pub const Op = enum {
         // zig fmt: off
         return switch (op) {
             .rm8, .rm16, .rm32, .rm64,
-            .r32_m16, .r64_m16,
+            .r32_m8, .r32_m16, .r64_m16,
             .m8, .m16, .m32, .m64, .m80, .m128, .m256,
             .m,
             .xmm_m32, .xmm_m64, .xmm_m128,
@@ -559,7 +562,7 @@ pub const Op = enum {
             .al, .ax, .eax, .rax, .cl => .general_purpose,
             .r8, .r16, .r32, .r64 => .general_purpose,
             .rm8, .rm16, .rm32, .rm64 => .general_purpose,
-            .r32_m16, .r64_m16 => .general_purpose,
+            .r32_m8, .r32_m16, .r64_m16 => .general_purpose,
             .sreg => .segment,
             .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => .floating_point,
             .ymm, .ymm_m256 => .floating_point,
src/arch/x86_64/encodings.zig
@@ -865,6 +865,8 @@ pub const table = [_]Entry{
     .{ .movaps, .rm, &.{ .xmm,      .xmm_m128 }, &.{ 0x0f, 0x28 }, 0, .none, .sse },
     .{ .movaps, .mr, &.{ .xmm_m128, .xmm      }, &.{ 0x0f, 0x29 }, 0, .none, .sse },
 
+    .{ .movhlps, .rm, &.{ .xmm, .xmm }, &.{ 0x0f, 0x12 }, 0, .none, .sse },
+
     .{ .movss, .rm, &.{ .xmm,     .xmm_m32 }, &.{ 0xf3, 0x0f, 0x10 }, 0, .none, .sse },
     .{ .movss, .mr, &.{ .xmm_m32, .xmm     }, &.{ 0xf3, 0x0f, 0x11 }, 0, .none, .sse },
 
@@ -988,8 +990,16 @@ pub const table = [_]Entry{
     .{ .movsldup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x12 }, 0, .none, .sse3 },
 
     // SSE4.1
+    .{ .pextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .none, .sse4_1 },
+    .{ .pextrd, .mri, &.{ .rm32,   .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .none, .sse4_1 },
+    .{ .pextrq, .mri, &.{ .rm64,   .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .long, .sse4_1 },
+
     .{ .pextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .none, .sse4_1 },
 
+    .{ .pinsrb, .rmi, &.{ .xmm, .r32_m8, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x20 }, 0, .none, .sse4_1 },
+    .{ .pinsrd, .rmi, &.{ .xmm, .rm32,   .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .none, .sse4_1 },
+    .{ .pinsrq, .rmi, &.{ .xmm, .rm64,   .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .long, .sse4_1 },
+
     .{ .roundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .none, .sse4_1 },
 
     .{ .roundps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x08 }, 0, .none, .sse4_1 },
@@ -1062,6 +1072,8 @@ pub const table = [_]Entry{
     .{ .vmovddup, .rm, &.{ .xmm, .xmm_m64  }, &.{ 0xf2, 0x0f, 0x12 }, 0, .vex_128_wig, .avx },
     .{ .vmovddup, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0xf2, 0x0f, 0x12 }, 0, .vex_256_wig, .avx },
 
+    .{ .vmovhlps, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0x0f, 0x12 }, 0, .vex_128_wig, .avx },
+
     .{ .vmovsd, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx },
     .{ .vmovsd, .rm,  &.{       .xmm, .m64 }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx },
     .{ .vmovsd, .mvr, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x11 }, 0, .vex_lig_wig, .avx },
@@ -1098,9 +1110,17 @@ pub const table = [_]Entry{
 
     .{ .vmulss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x59 }, 0, .vex_lig_wig, .avx },
 
+    .{ .vpextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .vex_128_w0, .avx },
+    .{ .vpextrd, .mri, &.{ .rm32,   .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .vex_128_w0, .avx },
+    .{ .vpextrq, .mri, &.{ .rm64,   .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .vex_128_w1, .avx },
+
     .{ .vpextrw, .rmi, &.{ .r32,     .xmm, .imm8 }, &.{ 0x66, 0x0f,       0xc5 }, 0, .vex_128_wig, .avx },
     .{ .vpextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .vex_128_wig, .avx },
 
+    .{ .vpinsrb, .rmi, &.{ .xmm, .r32_m8, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x20 }, 0, .vex_128_w0, .avx },
+    .{ .vpinsrd, .rmi, &.{ .xmm, .rm32,   .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .vex_128_w0, .avx },
+    .{ .vpinsrq, .rmi, &.{ .xmm, .rm64,   .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .vex_128_w1, .avx },
+
     .{ .vpinsrw, .rvmi, &.{ .xmm, .xmm, .r32_m16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .vex_128_wig, .avx },
 
     .{ .vpsrlw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_128_wig, .avx },
src/arch/x86_64/Lower.zig
@@ -137,6 +137,7 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
         .minps,
         .minss,
         .movaps,
+        .movhlps,
         .movss,
         .movups,
         .mulps,
@@ -149,6 +150,8 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
         .subps,
         .subss,
         .ucomiss,
+        .unpckhps,
+        .unpcklps,
         .xorps,
 
         .addpd,
@@ -187,12 +190,20 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
         .subpd,
         .subsd,
         .ucomisd,
+        .unpckhpd,
+        .unpcklpd,
         .xorpd,
 
         .movddup,
         .movshdup,
         .movsldup,
 
+        .pextrb,
+        .pextrd,
+        .pextrq,
+        .pinsrb,
+        .pinsrd,
+        .pinsrq,
         .roundpd,
         .roundps,
         .roundsd,
@@ -221,6 +232,7 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
         .vmovapd,
         .vmovaps,
         .vmovddup,
+        .vmovhlps,
         .vmovsd,
         .vmovshdup,
         .vmovsldup,
@@ -231,7 +243,13 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
         .vmulps,
         .vmulsd,
         .vmulss,
+        .vpextrb,
+        .vpextrd,
+        .vpextrq,
         .vpextrw,
+        .vpinsrb,
+        .vpinsrd,
+        .vpinsrq,
         .vpinsrw,
         .vpshufhw,
         .vpshuflw,
@@ -258,6 +276,10 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
         .vsubps,
         .vsubsd,
         .vsubss,
+        .vunpckhpd,
+        .vunpckhps,
+        .vunpcklpd,
+        .vunpcklps,
 
         .vcvtph2ps,
         .vcvtps2ph,
src/arch/x86_64/Mir.zig
@@ -192,6 +192,8 @@ pub const Inst = struct {
         minss,
         /// Move aligned packed single-precision floating-point values
         movaps,
+        /// Move packed single-precision floating-point values high to low
+        movhlps,
         /// Move scalar single-precision floating-point value
         movss,
         /// Move unaligned packed single-precision floating-point values
@@ -216,6 +218,10 @@ pub const Inst = struct {
         subss,
         /// Unordered compare scalar single-precision floating-point values
         ucomiss,
+        /// Unpack and interleave high packed single-precision floating-point values
+        unpckhps,
+        /// Unpack and interleave low packed single-precision floating-point values
+        unpcklps,
         /// Bitwise logical xor of packed single precision floating-point values
         xorps,
 
@@ -291,6 +297,10 @@ pub const Inst = struct {
         subsd,
         /// Unordered compare scalar double-precision floating-point values
         ucomisd,
+        /// Unpack and interleave high packed double-precision floating-point values
+        unpckhpd,
+        /// Unpack and interleave low packed double-precision floating-point values
+        unpcklpd,
         /// Bitwise logical xor of packed double precision floating-point values
         xorpd,
 
@@ -301,6 +311,18 @@ pub const Inst = struct {
         /// Replicate single floating-point values
         movsldup,
 
+        /// Extract Byte
+        pextrb,
+        /// Extract Doubleword
+        pextrd,
+        /// Extract Quadword
+        pextrq,
+        /// Insert Byte
+        pinsrb,
+        /// Insert Doubleword
+        pinsrd,
+        /// Insert Quadword
+        pinsrq,
         /// Round packed double-precision floating-point values
         roundpd,
         /// Round packed single-precision floating-point values
@@ -354,6 +376,8 @@ pub const Inst = struct {
         vmovapd,
         /// Move aligned packed single-precision floating-point values
         vmovaps,
+        /// Move packed single-precision floating-point values high to low
+        vmovhlps,
         /// Replicate double floating-point values
         vmovddup,
         /// Move or merge scalar double-precision floating-point value
@@ -376,8 +400,20 @@ pub const Inst = struct {
         vmulsd,
         /// Multiply scalar single-precision floating-point values
         vmulss,
+        /// Extract Byte
+        vpextrb,
+        /// Extract Doubleword
+        vpextrd,
+        /// Extract Quadword
+        vpextrq,
         /// Extract word
         vpextrw,
+        /// Insert Byte
+        vpinsrb,
+        /// Insert Doubleword
+        vpinsrd,
+        /// Insert Quadword
+        vpinsrq,
         /// Insert word
         vpinsrw,
         /// Shuffle packed high words
@@ -430,6 +466,14 @@ pub const Inst = struct {
         vsubsd,
         /// Subtract scalar single-precision floating-point values
         vsubss,
+        /// Unpack and interleave high packed double-precision floating-point values
+        vunpckhpd,
+        /// Unpack and interleave high packed single-precision floating-point values
+        vunpckhps,
+        /// Unpack and interleave low packed double-precision floating-point values
+        vunpcklpd,
+        /// Unpack and interleave low packed single-precision floating-point values
+        vunpcklps,
 
         /// Convert 16-bit floating-point values to single-precision floating-point values
         vcvtph2ps,
test/behavior/floatop.zig
@@ -8,6 +8,8 @@ const has_f80_rt = switch (builtin.cpu.arch) {
     .x86_64, .x86 => true,
     else => false,
 };
+const no_x86_64_hardware_f16_support = builtin.zig_backend == .stage2_x86_64 and
+    !std.Target.x86.featureSetHas(builtin.cpu.features, .f16c);
 
 const epsilon_16 = 0.001;
 const epsilon = 0.000001;
@@ -52,8 +54,7 @@ fn testFloatComparisons() !void {
 }
 
 test "different sized float comparisons" {
-    if (builtin.zig_backend == .stage2_x86_64 and
-        !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .f16c)) return error.SkipZigTest; // TODO
+    if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -152,7 +153,7 @@ fn testSqrtWithVectors() !void {
 }
 
 test "more @sqrt f16 tests" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
+    if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -202,7 +203,7 @@ fn testSqrtLegacy(comptime T: type, x: T) !void {
 }
 
 test "@sin" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
+    if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -241,7 +242,7 @@ fn testSinWithVectors() !void {
 }
 
 test "@cos" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
+    if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -280,7 +281,7 @@ fn testCosWithVectors() !void {
 }
 
 test "@exp" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
+    if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -318,7 +319,7 @@ fn testExpWithVectors() !void {
 }
 
 test "@exp2" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
+    if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -403,7 +404,7 @@ test "@log with @vectors" {
 }
 
 test "@log2" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
+    if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -445,7 +446,7 @@ fn testLog2WithVectors() !void {
 }
 
 test "@log10" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
+    if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -881,7 +882,7 @@ fn testTruncLegacy(comptime T: type, x: T) !void {
 }
 
 test "negation f16" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
+    if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -1040,7 +1041,6 @@ test "comptime_float zero divided by zero produces zero" {
 }
 
 test "nan negation f16" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
test/behavior/muladd.zig
@@ -2,11 +2,11 @@ const std = @import("std");
 const builtin = @import("builtin");
 const expect = std.testing.expect;
 
-const stage2_x86_64_without_hardware_fma_support = builtin.zig_backend == .stage2_x86_64 and
+const no_x86_64_hardware_fma_support = builtin.zig_backend == .stage2_x86_64 and
     !std.Target.x86.featureSetHas(builtin.cpu.features, .fma);
 
 test "@mulAdd" {
-    if (stage2_x86_64_without_hardware_fma_support) return error.SkipZigTest; // TODO
+    if (no_x86_64_hardware_fma_support) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -120,7 +120,7 @@ fn vector32() !void {
 
 test "vector f32" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (stage2_x86_64_without_hardware_fma_support) return error.SkipZigTest; // TODO
+    if (no_x86_64_hardware_fma_support) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -143,7 +143,7 @@ fn vector64() !void {
 
 test "vector f64" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (stage2_x86_64_without_hardware_fma_support) return error.SkipZigTest; // TODO
+    if (no_x86_64_hardware_fma_support) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO