Commit c23e80e671

Jacob Young <jacobly0@users.noreply.github.com>
2023-05-09 09:15:27
x86_64: implement `@splat`
1 parent 1336619
src/arch/x86_64/CodeGen.zig
@@ -8561,7 +8561,8 @@ fn movMirTag(self: *Self, ty: Type, aligned: bool) !Mir.Inst.FixedTag {
                 },
                 32 => switch (ty.vectorLen()) {
                     1 => return if (self.hasFeature(.avx)) .{ .v_ss, .mov } else .{ ._ss, .mov },
-                    2...4 => return if (self.hasFeature(.avx))
+                    2 => return if (self.hasFeature(.avx)) .{ .v_sd, .mov } else .{ ._sd, .mov },
+                    3...4 => return if (self.hasFeature(.avx))
                         if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu }
                     else if (aligned) .{ ._ps, .mova } else .{ ._ps, .movu },
                     5...8 => if (self.hasFeature(.avx))
@@ -8577,6 +8578,14 @@ fn movMirTag(self: *Self, ty: Type, aligned: bool) !Mir.Inst.FixedTag {
                         return if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu },
                     else => {},
                 },
+                128 => switch (ty.vectorLen()) {
+                    1 => return if (self.hasFeature(.avx))
+                        if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu }
+                    else if (aligned) .{ ._ps, .mova } else .{ ._ps, .movu },
+                    2 => if (self.hasFeature(.avx))
+                        return if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu },
+                    else => {},
+                },
                 else => {},
             },
             else => {},
@@ -9939,9 +9948,200 @@ fn airErrorName(self: *Self, inst: Air.Inst.Index) !void {
 
 fn airSplat(self: *Self, inst: Air.Inst.Index) !void {
     const ty_op = self.air.instructions.items(.data)[inst].ty_op;
-    _ = ty_op;
-    return self.fail("TODO implement airSplat for x86_64", .{});
-    //return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
+    const vector_ty = self.air.typeOfIndex(inst);
+    const dst_rc = regClassForType(vector_ty);
+    const scalar_ty = vector_ty.scalarType();
+
+    const src_mcv = try self.resolveInst(ty_op.operand);
+    const result: MCValue = result: {
+        switch (scalar_ty.zigTypeTag()) {
+            else => {},
+            .Float => switch (scalar_ty.floatBits(self.target.*)) {
+                32 => switch (vector_ty.vectorLen()) {
+                    1 => {
+                        if (self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) break :result src_mcv;
+                        const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
+                        try self.genSetReg(dst_reg, scalar_ty, src_mcv);
+                        break :result .{ .register = dst_reg };
+                    },
+                    2...4 => {
+                        if (self.hasFeature(.avx)) {
+                            const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
+                            if (src_mcv.isMemory()) try self.asmRegisterMemory(
+                                .{ .v_ss, .broadcast },
+                                dst_reg.to128(),
+                                src_mcv.mem(.dword),
+                            ) else {
+                                const src_reg = if (src_mcv.isRegister())
+                                    src_mcv.getReg().?
+                                else
+                                    try self.copyToTmpRegister(scalar_ty, src_mcv);
+                                try self.asmRegisterRegisterRegisterImmediate(
+                                    .{ .v_ps, .shuf },
+                                    dst_reg.to128(),
+                                    src_reg.to128(),
+                                    src_reg.to128(),
+                                    Immediate.u(0),
+                                );
+                            }
+                            break :result .{ .register = dst_reg };
+                        } else {
+                            const dst_mcv = if (src_mcv.isRegister() and
+                                self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
+                                src_mcv
+                            else
+                                try self.copyToRegisterWithInstTracking(inst, scalar_ty, src_mcv);
+                            const dst_reg = dst_mcv.getReg().?;
+                            try self.asmRegisterRegisterImmediate(
+                                .{ ._ps, .shuf },
+                                dst_reg.to128(),
+                                dst_reg.to128(),
+                                Immediate.u(0),
+                            );
+                            break :result dst_mcv;
+                        }
+                    },
+                    5...8 => if (self.hasFeature(.avx)) {
+                        const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
+                        if (src_mcv.isMemory()) try self.asmRegisterMemory(
+                            .{ .v_ss, .broadcast },
+                            dst_reg.to256(),
+                            src_mcv.mem(.dword),
+                        ) else {
+                            const src_reg = if (src_mcv.isRegister())
+                                src_mcv.getReg().?
+                            else
+                                try self.copyToTmpRegister(scalar_ty, src_mcv);
+                            if (self.hasFeature(.avx2)) try self.asmRegisterRegister(
+                                .{ .v_ss, .broadcast },
+                                dst_reg.to256(),
+                                src_reg.to128(),
+                            ) else {
+                                try self.asmRegisterRegisterRegisterImmediate(
+                                    .{ .v_ps, .shuf },
+                                    dst_reg.to128(),
+                                    src_reg.to128(),
+                                    src_reg.to128(),
+                                    Immediate.u(0),
+                                );
+                                try self.asmRegisterRegisterRegisterImmediate(
+                                    .{ .v_f128, .insert },
+                                    dst_reg.to256(),
+                                    dst_reg.to256(),
+                                    dst_reg.to128(),
+                                    Immediate.u(1),
+                                );
+                            }
+                        }
+                        break :result .{ .register = dst_reg };
+                    },
+                    else => {},
+                },
+                64 => switch (vector_ty.vectorLen()) {
+                    1 => {
+                        if (self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) break :result src_mcv;
+                        const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
+                        try self.genSetReg(dst_reg, scalar_ty, src_mcv);
+                        break :result .{ .register = dst_reg };
+                    },
+                    2 => {
+                        const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
+                        if (self.hasFeature(.sse3)) {
+                            if (src_mcv.isMemory()) try self.asmRegisterMemory(
+                                if (self.hasFeature(.avx)) .{ .v_, .movddup } else .{ ._, .movddup },
+                                dst_reg.to128(),
+                                src_mcv.mem(.qword),
+                            ) else try self.asmRegisterRegister(
+                                if (self.hasFeature(.avx)) .{ .v_, .movddup } else .{ ._, .movddup },
+                                dst_reg.to128(),
+                                (if (src_mcv.isRegister())
+                                    src_mcv.getReg().?
+                                else
+                                    try self.copyToTmpRegister(scalar_ty, src_mcv)).to128(),
+                            );
+                            break :result .{ .register = dst_reg };
+                        } else try self.asmRegisterRegister(
+                            .{ ._ps, .movlh },
+                            dst_reg.to128(),
+                            (if (src_mcv.isRegister())
+                                src_mcv.getReg().?
+                            else
+                                try self.copyToTmpRegister(scalar_ty, src_mcv)).to128(),
+                        );
+                    },
+                    3...4 => if (self.hasFeature(.avx)) {
+                        const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
+                        if (src_mcv.isMemory()) try self.asmRegisterMemory(
+                            .{ .v_sd, .broadcast },
+                            dst_reg.to256(),
+                            src_mcv.mem(.qword),
+                        ) else {
+                            const src_reg = if (src_mcv.isRegister())
+                                src_mcv.getReg().?
+                            else
+                                try self.copyToTmpRegister(scalar_ty, src_mcv);
+                            if (self.hasFeature(.avx2)) try self.asmRegisterRegister(
+                                .{ .v_sd, .broadcast },
+                                dst_reg.to256(),
+                                src_reg.to128(),
+                            ) else {
+                                try self.asmRegisterRegister(
+                                    .{ .v_, .movddup },
+                                    dst_reg.to128(),
+                                    src_reg.to128(),
+                                );
+                                try self.asmRegisterRegisterRegisterImmediate(
+                                    .{ .v_f128, .insert },
+                                    dst_reg.to256(),
+                                    dst_reg.to256(),
+                                    dst_reg.to128(),
+                                    Immediate.u(1),
+                                );
+                            }
+                        }
+                        break :result .{ .register = dst_reg };
+                    },
+                    else => {},
+                },
+                128 => switch (vector_ty.vectorLen()) {
+                    1 => {
+                        if (self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) break :result src_mcv;
+                        const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
+                        try self.genSetReg(dst_reg, scalar_ty, src_mcv);
+                        break :result .{ .register = dst_reg };
+                    },
+                    2 => if (self.hasFeature(.avx)) {
+                        const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
+                        if (src_mcv.isMemory()) try self.asmRegisterMemory(
+                            .{ .v_f128, .broadcast },
+                            dst_reg.to256(),
+                            src_mcv.mem(.xword),
+                        ) else {
+                            const src_reg = if (src_mcv.isRegister())
+                                src_mcv.getReg().?
+                            else
+                                try self.copyToTmpRegister(scalar_ty, src_mcv);
+                            try self.asmRegisterRegisterRegisterImmediate(
+                                .{ .v_f128, .insert },
+                                dst_reg.to256(),
+                                src_reg.to256(),
+                                src_reg.to128(),
+                                Immediate.u(1),
+                            );
+                        }
+                        break :result .{ .register = dst_reg };
+                    },
+                    else => {},
+                },
+                16, 80 => {},
+                else => unreachable,
+            },
+        }
+        return self.fail("TODO implement airSplat for {}", .{
+            vector_ty.fmt(self.bin_file.options.module.?),
+        });
+    };
+    return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
 }
 
 fn airSelect(self: *Self, inst: Air.Inst.Index) !void {
src/arch/x86_64/Encoding.zig
@@ -270,10 +270,12 @@ pub const Mnemonic = enum {
     divps, divss,
     maxps, maxss,
     minps, minss,
-    movaps, movhlps, movss, movups,
+    movaps, movhlps, movlhps,
+    movss, movups,
     mulps, mulss,
     orps,
     pextrw, pinsrw,
+    shufps,
     sqrtps, sqrtss,
     subps, subss,
     ucomiss,
@@ -296,6 +298,7 @@ pub const Mnemonic = enum {
     psrld, psrlq, psrlw,
     punpckhbw, punpckhdq, punpckhqdq, punpckhwd,
     punpcklbw, punpckldq, punpcklqdq, punpcklwd,
+    shufpd,
     sqrtpd, sqrtsd,
     subpd, subsd,
     ucomisd,
@@ -303,17 +306,22 @@ pub const Mnemonic = enum {
     // SSE3
     movddup, movshdup, movsldup,
     // SSE4.1
+    extractps,
+    insertps,
     pextrb, pextrd, pextrq,
     pinsrb, pinsrd, pinsrq,
     roundpd, roundps, roundsd, roundss,
     // AVX
     vaddpd, vaddps, vaddsd, vaddss,
+    vbroadcastf128, vbroadcastsd, vbroadcastss,
     vcvtsd2ss, vcvtsi2sd, vcvtsi2ss, vcvtss2sd,
     vdivpd, vdivps, vdivsd, vdivss,
+    vextractf128, vextractps,
+    vinsertf128, vinsertps,
     vmaxpd, vmaxps, vmaxsd, vmaxss,
     vminpd, vminps, vminsd, vminss,
     vmovapd, vmovaps,
-    vmovddup, vmovhlps,
+    vmovddup, vmovhlps, vmovlhps,
     vmovsd,
     vmovshdup, vmovsldup,
     vmovss,
@@ -326,6 +334,7 @@ pub const Mnemonic = enum {
     vpunpckhbw, vpunpckhdq, vpunpckhqdq, vpunpckhwd,
     vpunpcklbw, vpunpckldq, vpunpcklqdq, vpunpcklwd,
     vroundpd, vroundps, vroundsd, vroundss,
+    vshufpd, vshufps,
     vsqrtpd, vsqrtps, vsqrtsd, vsqrtss,
     vsubpd, vsubps, vsubsd, vsubss,
     // F16C
src/arch/x86_64/encodings.zig
@@ -867,6 +867,8 @@ pub const table = [_]Entry{
 
     .{ .movhlps, .rm, &.{ .xmm, .xmm }, &.{ 0x0f, 0x12 }, 0, .none, .sse },
 
+    .{ .movlhps, .rm, &.{ .xmm, .xmm }, &.{ 0x0f, 0x16 }, 0, .none, .sse },
+
     .{ .movss, .rm, &.{ .xmm,     .xmm_m32 }, &.{ 0xf3, 0x0f, 0x10 }, 0, .none, .sse },
     .{ .movss, .mr, &.{ .xmm_m32, .xmm     }, &.{ 0xf3, 0x0f, 0x11 }, 0, .none, .sse },
 
@@ -879,14 +881,16 @@ pub const table = [_]Entry{
 
     .{ .orps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x56 }, 0, .none, .sse },
 
-    .{ .subps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5c }, 0, .none, .sse },
-
-    .{ .subss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .none, .sse },
+    .{ .shufps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x0f, 0xc6 }, 0, .none, .sse },
 
     .{ .sqrtps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x51 }, 0, .none, .sse },
 
     .{ .sqrtss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x51 }, 0, .none, .sse },
 
+    .{ .subps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5c }, 0, .none, .sse },
+
+    .{ .subss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .none, .sse },
+
     .{ .ucomiss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x0f, 0x2e }, 0, .none, .sse },
 
     .{ .xorps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x57 }, 0, .none, .sse },
@@ -967,6 +971,8 @@ pub const table = [_]Entry{
     .{ .punpckldq,  .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x62 }, 0, .none, .sse2 },
     .{ .punpcklqdq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6c }, 0, .none, .sse2 },
 
+    .{ .shufpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0xc6 }, 0, .none, .sse2 },
+
     .{ .sqrtpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x51 }, 0, .none, .sse2 },
 
     .{ .sqrtsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x51 }, 0, .none, .sse2 },
@@ -990,6 +996,10 @@ pub const table = [_]Entry{
     .{ .movsldup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x12 }, 0, .none, .sse3 },
 
     // SSE4.1
+    .{ .extractps, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x17 }, 0, .none, .sse4_1 },
+
+    .{ .insertps, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x21 }, 0, .none, .sse4_1 },
+
     .{ .pextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .none, .sse4_1 },
     .{ .pextrd, .mri, &.{ .rm32,   .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .none, .sse4_1 },
     .{ .pextrq, .mri, &.{ .rm64,   .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .long, .sse4_1 },
@@ -1019,6 +1029,11 @@ pub const table = [_]Entry{
 
     .{ .vaddss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x58 }, 0, .vex_lig_wig, .avx },
 
+    .{ .vbroadcastss,   .rm, &.{ .xmm, .m32  }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_128_w0, .avx },
+    .{ .vbroadcastss,   .rm, &.{ .ymm, .m32  }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx },
+    .{ .vbroadcastsd,   .rm, &.{ .ymm, .m64  }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx },
+    .{ .vbroadcastf128, .rm, &.{ .ymm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x1a }, 0, .vex_256_w0, .avx },
+
     .{ .vcvtsd2ss, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5a }, 0, .vex_lig_wig, .avx },
 
     .{ .vcvtsi2sd, .rvm, &.{ .xmm, .xmm, .rm32 }, &.{ 0xf2, 0x0f, 0x2a }, 0, .vex_lig_w0, .avx },
@@ -1039,6 +1054,14 @@ pub const table = [_]Entry{
 
     .{ .vdivss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5e }, 0, .vex_lig_wig, .avx },
 
+    .{ .vextractf128, .mri, &.{ .xmm_m128, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x19 }, 0, .vex_256_w0, .avx },
+
+    .{ .vextractps, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x17 }, 0, .vex_128_wig, .avx },
+
+    .{ .vinsertf128, .rvmi, &.{ .ymm, .ymm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x18 }, 0, .vex_256_w0, .avx },
+
+    .{ .vinsertps, .rvmi, &.{ .xmm, .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x21 }, 0, .vex_128_wig, .avx },
+
     .{ .vmaxpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5f }, 0, .vex_128_wig, .avx },
     .{ .vmaxpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5f }, 0, .vex_256_wig, .avx },
 
@@ -1074,6 +1097,8 @@ pub const table = [_]Entry{
 
     .{ .vmovhlps, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0x0f, 0x12 }, 0, .vex_128_wig, .avx },
 
+    .{ .vmovlhps, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0x0f, 0x16 }, 0, .vex_128_wig, .avx },
+
     .{ .vmovsd, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx },
     .{ .vmovsd, .rm,  &.{       .xmm, .m64 }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx },
     .{ .vmovsd, .mvr, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x11 }, 0, .vex_lig_wig, .avx },
@@ -1150,6 +1175,12 @@ pub const table = [_]Entry{
 
     .{ .vroundss, .rvmi, &.{ .xmm, .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0a }, 0, .vex_lig_wig, .avx },
 
+    .{ .vshufpd, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0xc6 }, 0, .vex_128_wig, .avx },
+    .{ .vshufpd, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0xc6 }, 0, .vex_256_wig, .avx },
+
+    .{ .vshufps, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x0f, 0xc6 }, 0, .vex_128_wig, .avx },
+    .{ .vshufps, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x0f, 0xc6 }, 0, .vex_256_wig, .avx },
+
     .{ .vsqrtpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x51 }, 0, .vex_128_wig, .avx },
     .{ .vsqrtpd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x51 }, 0, .vex_256_wig, .avx },
 
@@ -1201,6 +1232,10 @@ pub const table = [_]Entry{
     .{ .vfmadd231ss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0xb9 }, 0, .vex_lig_w0, .fma },
 
     // AVX2
+    .{ .vbroadcastss,   .rm, &.{ .xmm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_128_w0, .avx2 },
+    .{ .vbroadcastss,   .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx2 },
+    .{ .vbroadcastsd,   .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx2 },
+
     .{ .vpsrlw, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_256_wig, .avx2 },
     .{ .vpsrlw, .vmi, &.{ .ymm, .ymm, .imm8     }, &.{ 0x66, 0x0f, 0x71 }, 2, .vex_256_wig, .avx2 },
     .{ .vpsrld, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd2 }, 0, .vex_256_wig, .avx2 },
src/arch/x86_64/Lower.zig
@@ -300,6 +300,8 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
         else
             .none,
     }, mnemonic: {
+        @setEvalBranchQuota(2_000);
+
         comptime var max_len = 0;
         inline for (@typeInfo(Mnemonic).Enum.fields) |field| max_len = @max(field.name.len, max_len);
         var buf: [max_len]u8 = undefined;
src/arch/x86_64/Mir.zig
@@ -256,6 +256,8 @@ pub const Inst = struct {
         v_sd,
         /// VEX-Encoded ___ Packed Double-Precision Values
         v_pd,
+        /// VEX-Encoded ___ 128-Bits Of Floating-Point Data
+        v_f128,
 
         /// Mask ___ Byte
         k_b,
@@ -454,6 +456,8 @@ pub const Inst = struct {
         mova,
         /// Move packed single-precision floating-point values high to low
         movhl,
+        /// Move packed single-precision floating-point values low to high
+        movlh,
         /// Move unaligned packed single-precision floating-point values
         /// Move unaligned packed double-precision floating-point values
         movu,
@@ -488,6 +492,9 @@ pub const Inst = struct {
         cvtsi2sd,
         /// Convert scalar single-precision floating-point value to scalar double-precision floating-point value
         cvtss2sd,
+        /// Packed interleave shuffle of quadruplets of single-precision floating-point values
+        /// Packed interleave shuffle of pairs of double-precision floating-point values
+        shuf,
         /// Shuffle packed high words
         shufh,
         /// Shuffle packed low words
@@ -520,12 +527,20 @@ pub const Inst = struct {
         /// Replicate single floating-point values
         movsldup,
 
+        /// Extract packed floating-point values
+        extract,
+        /// Insert scalar single-precision floating-point value
+        /// Insert packed floating-point values
+        insert,
         /// Round packed single-precision floating-point values
         /// Round scalar single-precision floating-point value
         /// Round packed double-precision floating-point values
         /// Round scalar double-precision floating-point value
         round,
 
+        /// Load with broadcast floating-point data
+        broadcast,
+
         /// Convert 16-bit floating-point values to single-precision floating-point values
         cvtph2ps,
         /// Convert single-precision floating-point values to 16-bit floating-point values