Commit 1f5aa7747f

Jacob Young <jacobly0@users.noreply.github.com>
2023-05-08 13:35:31
x86_64: finish optimizing mir tag usage
Final tag count is 95.
1 parent ecb5fea
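This commit folds the remaining per-mnemonic Mir tags (vaddss, sqrtpd, vpinsrw, ...) into a shared base operation plus a "fixes" component, so a Mir.Inst.FixedTag such as .{ ._, .vaddss } becomes .{ .v_ss, .add }: the underscore in the fixes marks where the base tag is spliced into the concrete mnemonic. Below is a minimal sketch of that splicing, using trimmed-down stand-in enums and a hypothetical mnemonic helper rather than the compiler's actual Fixes/Tag definitions:

    const std = @import("std");

    // Illustrative stand-ins for the compiler's tag enums; the field sets here
    // are trimmed-down assumptions, not the real Mir.Inst definitions.
    const Fixes = enum { v_, _ss, v_ss, _sd, v_sd, _ps, v_ps, _pd, v_pd, p_w, vp_w };
    const Tag = enum { add, sub, mul, div, sqrt, mova, insr, cvtps2ph };
    const FixedTag = struct { Fixes, Tag };

    /// Splice the base tag into the fixes at the '_' placeholder,
    /// e.g. .{ .v_ss, .add } renders as "vaddss".
    fn mnemonic(buf: []u8, ft: FixedTag) []const u8 {
        const fixes = @tagName(ft[0]);
        const split = std.mem.indexOfScalar(u8, fixes, '_').?;
        return std.fmt.bufPrint(buf, "{s}{s}{s}", .{
            fixes[0..split], @tagName(ft[1]), fixes[split + 1 ..],
        }) catch unreachable;
    }

    pub fn main() void {
        var buf: [32]u8 = undefined;
        std.debug.print("{s}\n", .{mnemonic(&buf, .{ .v_ss, .add })}); // vaddss
        std.debug.print("{s}\n", .{mnemonic(&buf, .{ ._pd, .sqrt })}); // sqrtpd
        std.debug.print("{s}\n", .{mnemonic(&buf, .{ .vp_w, .insr })}); // vpinsrw
    }

Running the sketch prints vaddss, sqrtpd, and vpinsrw, matching the pairs that the diff below substitutes for the old single-tag names.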
Changed files (2)
src/arch/x86_64/CodeGen.zig
@@ -2443,7 +2443,7 @@ fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void {
                 else
                     try self.copyToTmpRegister(src_ty, src_mcv);
                 try self.asmRegisterRegisterImmediate(
-                    .{ ._, .vcvtps2ph },
+                    .{ .v_, .cvtps2ph },
                     dst_reg,
                     mat_src_reg.to128(),
                     Immediate.u(0b1_00),
@@ -2455,12 +2455,12 @@ fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void {
         }
     } else if (src_bits == 64 and dst_bits == 32) {
         if (self.hasFeature(.avx)) if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory(
-            .{ ._, .vcvtsd2ss },
+            .{ .v_, .cvtsd2ss },
             dst_reg,
             dst_reg,
             src_mcv.mem(.qword),
         ) else try self.asmRegisterRegisterRegister(
-            .{ ._, .vcvtsd2ss },
+            .{ .v_, .cvtsd2ss },
             dst_reg,
             dst_reg,
             (if (src_mcv.isRegister())
@@ -2506,22 +2506,22 @@ fn airFpext(self: *Self, inst: Air.Inst.Index) !void {
             src_mcv.getReg().?
         else
             try self.copyToTmpRegister(src_ty, src_mcv);
-        try self.asmRegisterRegister(.{ ._, .vcvtph2ps }, dst_reg, mat_src_reg.to128());
+        try self.asmRegisterRegister(.{ .v_, .cvtph2ps }, dst_reg, mat_src_reg.to128());
         switch (dst_bits) {
             32 => {},
-            64 => try self.asmRegisterRegisterRegister(.{ ._, .vcvtss2sd }, dst_reg, dst_reg, dst_reg),
+            64 => try self.asmRegisterRegisterRegister(.{ .v_, .cvtss2sd }, dst_reg, dst_reg, dst_reg),
             else => return self.fail("TODO implement airFpext from {} to {}", .{
                 src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?),
             }),
         }
     } else if (src_bits == 32 and dst_bits == 64) {
         if (self.hasFeature(.avx)) if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory(
-            .{ ._, .vcvtss2sd },
+            .{ .v_, .cvtss2sd },
             dst_reg,
             dst_reg,
             src_mcv.mem(.dword),
         ) else try self.asmRegisterRegisterRegister(
-            .{ ._, .vcvtss2sd },
+            .{ .v_, .cvtss2sd },
             dst_reg,
             dst_reg,
             (if (src_mcv.isRegister())
@@ -4678,8 +4678,8 @@ fn airFloatSign(self: *Self, inst: Air.Inst.Index) !void {
     try self.genBinOpMir(switch (ty_bits) {
         // No point using an extra prefix byte for *pd which performs the same operation.
         16, 32, 64, 128 => switch (tag) {
-            .neg => .{ ._, .xorps },
-            .fabs => .{ ._, .andnps },
+            .neg => .{ ._ps, .xor },
+            .fabs => .{ ._ps, .andn },
             else => unreachable,
         },
         80 => return self.fail("TODO implement airFloatSign for {}", .{
@@ -4712,23 +4712,23 @@ fn genRound(self: *Self, ty: Type, dst_reg: Register, src_mcv: MCValue, mode: u4
 
     const mir_tag = if (@as(?Mir.Inst.FixedTag, switch (ty.zigTypeTag()) {
         .Float => switch (ty.floatBits(self.target.*)) {
-            32 => if (self.hasFeature(.avx)) .{ ._, .vroundss } else .{ ._, .roundss },
-            64 => if (self.hasFeature(.avx)) .{ ._, .vroundsd } else .{ ._, .roundsd },
+            32 => if (self.hasFeature(.avx)) .{ .v_ss, .round } else .{ ._ss, .round },
+            64 => if (self.hasFeature(.avx)) .{ .v_sd, .round } else .{ ._sd, .round },
             16, 80, 128 => null,
             else => unreachable,
         },
         .Vector => switch (ty.childType().zigTypeTag()) {
             .Float => switch (ty.childType().floatBits(self.target.*)) {
                 32 => switch (ty.vectorLen()) {
-                    1 => if (self.hasFeature(.avx)) .{ ._, .vroundss } else .{ ._, .roundss },
-                    2...4 => if (self.hasFeature(.avx)) .{ ._, .vroundps } else .{ ._, .roundps },
-                    5...8 => if (self.hasFeature(.avx)) .{ ._, .vroundps } else null,
+                    1 => if (self.hasFeature(.avx)) .{ .v_ss, .round } else .{ ._ss, .round },
+                    2...4 => if (self.hasFeature(.avx)) .{ .v_ps, .round } else .{ ._ps, .round },
+                    5...8 => if (self.hasFeature(.avx)) .{ .v_ps, .round } else null,
                     else => null,
                 },
                 64 => switch (ty.vectorLen()) {
-                    1 => if (self.hasFeature(.avx)) .{ ._, .vroundsd } else .{ ._, .roundsd },
-                    2 => if (self.hasFeature(.avx)) .{ ._, .vroundpd } else .{ ._, .roundpd },
-                    3...4 => if (self.hasFeature(.avx)) .{ ._, .vroundpd } else null,
+                    1 => if (self.hasFeature(.avx)) .{ .v_sd, .round } else .{ ._sd, .round },
+                    2 => if (self.hasFeature(.avx)) .{ .v_pd, .round } else .{ ._pd, .round },
+                    3...4 => if (self.hasFeature(.avx)) .{ .v_pd, .round } else null,
                     else => null,
                 },
                 16, 80, 128 => null,
@@ -4743,8 +4743,8 @@ fn genRound(self: *Self, ty: Type, dst_reg: Register, src_mcv: MCValue, mode: u4
 
     const abi_size = @intCast(u32, ty.abiSize(self.target.*));
     const dst_alias = registerAlias(dst_reg, abi_size);
-    switch (mir_tag[1]) {
-        .vroundss, .vroundsd => if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
+    switch (mir_tag[0]) {
+        .v_ss, .v_sd => if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
             mir_tag,
             dst_alias,
             dst_alias,
@@ -4799,18 +4799,18 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void {
                         src_mcv.getReg().?
                     else
                         try self.copyToTmpRegister(ty, src_mcv);
-                    try self.asmRegisterRegister(.{ ._, .vcvtph2ps }, dst_reg, mat_src_reg.to128());
-                    try self.asmRegisterRegisterRegister(.{ ._, .vsqrtss }, dst_reg, dst_reg, dst_reg);
+                    try self.asmRegisterRegister(.{ .v_, .cvtph2ps }, dst_reg, mat_src_reg.to128());
+                    try self.asmRegisterRegisterRegister(.{ .v_ss, .sqrt }, dst_reg, dst_reg, dst_reg);
                     try self.asmRegisterRegisterImmediate(
-                        .{ ._, .vcvtps2ph },
+                        .{ .v_, .cvtps2ph },
                         dst_reg,
                         dst_reg,
                         Immediate.u(0b1_00),
                     );
                     break :result dst_mcv;
                 } else null,
-                32 => if (self.hasFeature(.avx)) .{ ._, .vsqrtss } else .{ ._, .sqrtss },
-                64 => if (self.hasFeature(.avx)) .{ ._, .vsqrtsd } else .{ ._, .sqrtsd },
+                32 => if (self.hasFeature(.avx)) .{ .v_ss, .sqrt } else .{ ._ss, .sqrt },
+                64 => if (self.hasFeature(.avx)) .{ .v_sd, .sqrt } else .{ ._sd, .sqrt },
                 80, 128 => null,
                 else => unreachable,
             },
@@ -4819,7 +4819,7 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void {
                     16 => if (self.hasFeature(.f16c)) switch (ty.vectorLen()) {
                         1 => {
                             try self.asmRegisterRegister(
-                                .{ ._, .vcvtph2ps },
+                                .{ .v_, .cvtph2ps },
                                 dst_reg,
                                 (if (src_mcv.isRegister())
                                     src_mcv.getReg().?
@@ -4827,13 +4827,13 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void {
                                     try self.copyToTmpRegister(ty, src_mcv)).to128(),
                             );
                             try self.asmRegisterRegisterRegister(
-                                .{ ._, .vsqrtss },
+                                .{ .v_ss, .sqrt },
                                 dst_reg,
                                 dst_reg,
                                 dst_reg,
                             );
                             try self.asmRegisterRegisterImmediate(
-                                .{ ._, .vcvtps2ph },
+                                .{ .v_, .cvtps2ph },
                                 dst_reg,
                                 dst_reg,
                                 Immediate.u(0b1_00),
@@ -4843,22 +4843,22 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void {
                         2...8 => {
                             const wide_reg = registerAlias(dst_reg, abi_size * 2);
                             if (src_mcv.isMemory()) try self.asmRegisterMemory(
-                                .{ ._, .vcvtph2ps },
+                                .{ .v_, .cvtph2ps },
                                 wide_reg,
                                 src_mcv.mem(Memory.PtrSize.fromSize(
                                     @intCast(u32, @divExact(wide_reg.bitSize(), 16)),
                                 )),
                             ) else try self.asmRegisterRegister(
-                                .{ ._, .vcvtph2ps },
+                                .{ .v_, .cvtph2ps },
                                 wide_reg,
                                 (if (src_mcv.isRegister())
                                     src_mcv.getReg().?
                                 else
                                     try self.copyToTmpRegister(ty, src_mcv)).to128(),
                             );
-                            try self.asmRegisterRegister(.{ ._, .vsqrtps }, wide_reg, wide_reg);
+                            try self.asmRegisterRegister(.{ .v_ps, .sqrt }, wide_reg, wide_reg);
                             try self.asmRegisterRegisterImmediate(
-                                .{ ._, .vcvtps2ph },
+                                .{ .v_, .cvtps2ph },
                                 dst_reg,
                                 wide_reg,
                                 Immediate.u(0b1_00),
@@ -4868,15 +4868,15 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void {
                         else => null,
                     } else null,
                     32 => switch (ty.vectorLen()) {
-                        1 => if (self.hasFeature(.avx)) .{ ._, .vsqrtss } else .{ ._, .sqrtss },
-                        2...4 => if (self.hasFeature(.avx)) .{ ._, .vsqrtps } else .{ ._, .sqrtps },
-                        5...8 => if (self.hasFeature(.avx)) .{ ._, .vsqrtps } else null,
+                        1 => if (self.hasFeature(.avx)) .{ .v_ss, .sqrt } else .{ ._ss, .sqrt },
+                        2...4 => if (self.hasFeature(.avx)) .{ .v_ps, .sqrt } else .{ ._ps, .sqrt },
+                        5...8 => if (self.hasFeature(.avx)) .{ .v_ps, .sqrt } else null,
                         else => null,
                     },
                     64 => switch (ty.vectorLen()) {
-                        1 => if (self.hasFeature(.avx)) .{ ._, .vsqrtsd } else .{ ._, .sqrtsd },
-                        2 => if (self.hasFeature(.avx)) .{ ._, .vsqrtpd } else .{ ._, .sqrtpd },
-                        3...4 => if (self.hasFeature(.avx)) .{ ._, .vsqrtpd } else null,
+                        1 => if (self.hasFeature(.avx)) .{ .v_sd, .sqrt } else .{ ._sd, .sqrt },
+                        2 => if (self.hasFeature(.avx)) .{ .v_pd, .sqrt } else .{ ._pd, .sqrt },
+                        3...4 => if (self.hasFeature(.avx)) .{ .v_pd, .sqrt } else null,
                         else => null,
                     },
                     80, 128 => null,
@@ -4888,8 +4888,8 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void {
         })) |tag| tag else return self.fail("TODO implement airSqrt for {}", .{
             ty.fmt(self.bin_file.options.module.?),
         });
-        switch (mir_tag[1]) {
-            .vsqrtss, .vsqrtsd => if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory(
+        switch (mir_tag[0]) {
+            .v_ss, .v_sd => if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory(
                 mir_tag,
                 dst_reg,
                 dst_reg,
@@ -6325,13 +6325,13 @@ fn genBinOp(
                 defer self.register_manager.unlockReg(tmp_lock);
 
                 if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
-                    .{ ._, .vpinsrw },
+                    .{ .vp_w, .insr },
                     dst_reg,
                     dst_reg,
                     src_mcv.mem(.word),
                     Immediate.u(1),
                 ) else try self.asmRegisterRegisterRegister(
-                    .{ ._, .vpunpcklwd },
+                    .{ .vp_, .unpcklwd },
                     dst_reg,
                     dst_reg,
                     (if (src_mcv.isRegister())
@@ -6339,15 +6339,15 @@ fn genBinOp(
                     else
                         try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
                 );
-                try self.asmRegisterRegister(.{ ._, .vcvtph2ps }, dst_reg, dst_reg);
-                try self.asmRegisterRegister(.{ ._, .vmovshdup }, tmp_reg, dst_reg);
+                try self.asmRegisterRegister(.{ .v_, .cvtph2ps }, dst_reg, dst_reg);
+                try self.asmRegisterRegister(.{ .v_, .movshdup }, tmp_reg, dst_reg);
                 try self.asmRegisterRegisterRegister(
                     switch (air_tag) {
-                        .add => .{ ._, .vaddss },
-                        .sub => .{ ._, .vsubss },
-                        .div_float, .div_trunc, .div_floor, .div_exact => .{ ._, .vdivss },
-                        .max => .{ ._, .vmaxss },
-                        .min => .{ ._, .vmaxss },
+                        .add => .{ .v_ss, .add },
+                        .sub => .{ .v_ss, .sub },
+                        .div_float, .div_trunc, .div_floor, .div_exact => .{ .v_ss, .div },
+                        .max => .{ .v_ss, .max },
+                        .min => .{ .v_ss, .max },
                         else => unreachable,
                     },
                     dst_reg,
@@ -6355,7 +6355,7 @@ fn genBinOp(
                     tmp_reg,
                 );
                 try self.asmRegisterRegisterImmediate(
-                    .{ ._, .vcvtps2ph },
+                    .{ .v_, .cvtps2ph },
                     dst_reg,
                     dst_reg,
                     Immediate.u(0b1_00),
@@ -6363,29 +6363,29 @@ fn genBinOp(
                 return dst_mcv;
             } else null,
             32 => switch (air_tag) {
-                .add => if (self.hasFeature(.avx)) .{ ._, .vaddss } else .{ ._, .addss },
-                .sub => if (self.hasFeature(.avx)) .{ ._, .vsubss } else .{ ._, .subss },
-                .mul => if (self.hasFeature(.avx)) .{ ._, .vmulss } else .{ ._, .mulss },
+                .add => if (self.hasFeature(.avx)) .{ .v_ss, .add } else .{ ._ss, .add },
+                .sub => if (self.hasFeature(.avx)) .{ .v_ss, .sub } else .{ ._ss, .sub },
+                .mul => if (self.hasFeature(.avx)) .{ .v_ss, .mul } else .{ ._ss, .mul },
                 .div_float,
                 .div_trunc,
                 .div_floor,
                 .div_exact,
-                => if (self.hasFeature(.avx)) .{ ._, .vdivss } else .{ ._, .divss },
-                .max => if (self.hasFeature(.avx)) .{ ._, .vmaxss } else .{ ._, .maxss },
-                .min => if (self.hasFeature(.avx)) .{ ._, .vminss } else .{ ._, .minss },
+                => if (self.hasFeature(.avx)) .{ .v_ss, .div } else .{ ._ss, .div },
+                .max => if (self.hasFeature(.avx)) .{ .v_ss, .max } else .{ ._ss, .max },
+                .min => if (self.hasFeature(.avx)) .{ .v_ss, .min } else .{ ._ss, .min },
                 else => unreachable,
             },
             64 => switch (air_tag) {
-                .add => if (self.hasFeature(.avx)) .{ ._, .vaddsd } else .{ ._, .addsd },
-                .sub => if (self.hasFeature(.avx)) .{ ._, .vsubsd } else .{ ._, .subsd },
-                .mul => if (self.hasFeature(.avx)) .{ ._, .vmulsd } else .{ ._, .mulsd },
+                .add => if (self.hasFeature(.avx)) .{ .v_sd, .add } else .{ ._sd, .add },
+                .sub => if (self.hasFeature(.avx)) .{ .v_sd, .sub } else .{ ._sd, .sub },
+                .mul => if (self.hasFeature(.avx)) .{ .v_sd, .mul } else .{ ._sd, .mul },
                 .div_float,
                 .div_trunc,
                 .div_floor,
                 .div_exact,
-                => if (self.hasFeature(.avx)) .{ ._, .vdivsd } else .{ ._, .divsd },
-                .max => if (self.hasFeature(.avx)) .{ ._, .vmaxsd } else .{ ._, .maxsd },
-                .min => if (self.hasFeature(.avx)) .{ ._, .vminsd } else .{ ._, .minsd },
+                => if (self.hasFeature(.avx)) .{ .v_sd, .div } else .{ ._sd, .div },
+                .max => if (self.hasFeature(.avx)) .{ .v_sd, .max } else .{ ._sd, .max },
+                .min => if (self.hasFeature(.avx)) .{ .v_sd, .min } else .{ ._sd, .min },
                 else => unreachable,
             },
             80, 128 => null,
@@ -6401,13 +6401,13 @@ fn genBinOp(
                         defer self.register_manager.unlockReg(tmp_lock);
 
                         if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
-                            .{ ._, .vpinsrw },
+                            .{ .vp_w, .insr },
                             dst_reg,
                             dst_reg,
                             src_mcv.mem(.word),
                             Immediate.u(1),
                         ) else try self.asmRegisterRegisterRegister(
-                            .{ ._, .vpunpcklwd },
+                            .{ .vp_, .unpcklwd },
                             dst_reg,
                             dst_reg,
                             (if (src_mcv.isRegister())
@@ -6415,15 +6415,15 @@ fn genBinOp(
                             else
                                 try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
                         );
-                        try self.asmRegisterRegister(.{ ._, .vcvtph2ps }, dst_reg, dst_reg);
-                        try self.asmRegisterRegister(.{ ._, .vmovshdup }, tmp_reg, dst_reg);
+                        try self.asmRegisterRegister(.{ .v_, .cvtph2ps }, dst_reg, dst_reg);
+                        try self.asmRegisterRegister(.{ .v_, .movshdup }, tmp_reg, dst_reg);
                         try self.asmRegisterRegisterRegister(
                             switch (air_tag) {
-                                .add => .{ ._, .vaddss },
-                                .sub => .{ ._, .vsubss },
-                                .div_float, .div_trunc, .div_floor, .div_exact => .{ ._, .vdivss },
-                                .max => .{ ._, .vmaxss },
-                                .min => .{ ._, .vmaxss },
+                                .add => .{ .v_ss, .add },
+                                .sub => .{ .v_ss, .sub },
+                                .div_float, .div_trunc, .div_floor, .div_exact => .{ .v_ss, .div },
+                                .max => .{ .v_ss, .max },
+                                .min => .{ .v_ss, .max },
                                 else => unreachable,
                             },
                             dst_reg,
@@ -6431,7 +6431,7 @@ fn genBinOp(
                             tmp_reg,
                         );
                         try self.asmRegisterRegisterImmediate(
-                            .{ ._, .vcvtps2ph },
+                            .{ .v_, .cvtps2ph },
                             dst_reg,
                             dst_reg,
                             Immediate.u(0b1_00),
@@ -6444,12 +6444,12 @@ fn genBinOp(
                         defer self.register_manager.unlockReg(tmp_lock);
 
                         if (src_mcv.isMemory()) try self.asmRegisterMemoryImmediate(
-                            .{ ._, .vpinsrd },
+                            .{ .vp_d, .insr },
                             dst_reg,
                             src_mcv.mem(.dword),
                             Immediate.u(1),
                         ) else try self.asmRegisterRegisterRegister(
-                            .{ ._, .vunpcklps },
+                            .{ .v_ps, .unpckl },
                             dst_reg,
                             dst_reg,
                             (if (src_mcv.isRegister())
@@ -6457,20 +6457,20 @@ fn genBinOp(
                             else
                                 try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
                         );
-                        try self.asmRegisterRegister(.{ ._, .vcvtph2ps }, dst_reg, dst_reg);
+                        try self.asmRegisterRegister(.{ .v_, .cvtph2ps }, dst_reg, dst_reg);
                         try self.asmRegisterRegisterRegister(
-                            .{ ._, .vmovhlps },
+                            .{ .v_ps, .movhl },
                             tmp_reg,
                             dst_reg,
                             dst_reg,
                         );
                         try self.asmRegisterRegisterRegister(
                             switch (air_tag) {
-                                .add => .{ ._, .vaddps },
-                                .sub => .{ ._, .vsubps },
-                                .div_float, .div_trunc, .div_floor, .div_exact => .{ ._, .vdivps },
-                                .max => .{ ._, .vmaxps },
-                                .min => .{ ._, .vmaxps },
+                                .add => .{ .v_ps, .add },
+                                .sub => .{ .v_ps, .sub },
+                                .div_float, .div_trunc, .div_floor, .div_exact => .{ .v_ps, .div },
+                                .max => .{ .v_ps, .max },
+                                .min => .{ .v_ps, .max },
                                 else => unreachable,
                             },
                             dst_reg,
@@ -6478,7 +6478,7 @@ fn genBinOp(
                             tmp_reg,
                         );
                         try self.asmRegisterRegisterImmediate(
-                            .{ ._, .vcvtps2ph },
+                            .{ .v_, .cvtps2ph },
                             dst_reg,
                             dst_reg,
                             Immediate.u(0b1_00),
@@ -6490,13 +6490,13 @@ fn genBinOp(
                         const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
                         defer self.register_manager.unlockReg(tmp_lock);
 
-                        try self.asmRegisterRegister(.{ ._, .vcvtph2ps }, dst_reg, dst_reg);
+                        try self.asmRegisterRegister(.{ .v_, .cvtph2ps }, dst_reg, dst_reg);
                         if (src_mcv.isMemory()) try self.asmRegisterMemory(
-                            .{ ._, .vcvtph2ps },
+                            .{ .v_, .cvtph2ps },
                             tmp_reg,
                             src_mcv.mem(.qword),
                         ) else try self.asmRegisterRegister(
-                            .{ ._, .vcvtph2ps },
+                            .{ .v_, .cvtph2ps },
                             tmp_reg,
                             (if (src_mcv.isRegister())
                                 src_mcv.getReg().?
@@ -6505,11 +6505,11 @@ fn genBinOp(
                         );
                         try self.asmRegisterRegisterRegister(
                             switch (air_tag) {
-                                .add => .{ ._, .vaddps },
-                                .sub => .{ ._, .vsubps },
-                                .div_float, .div_trunc, .div_floor, .div_exact => .{ ._, .vdivps },
-                                .max => .{ ._, .vmaxps },
-                                .min => .{ ._, .vmaxps },
+                                .add => .{ .v_ps, .add },
+                                .sub => .{ .v_ps, .sub },
+                                .div_float, .div_trunc, .div_floor, .div_exact => .{ .v_ps, .div },
+                                .max => .{ .v_ps, .max },
+                                .min => .{ .v_ps, .max },
                                 else => unreachable,
                             },
                             dst_reg,
@@ -6517,7 +6517,7 @@ fn genBinOp(
                             tmp_reg,
                         );
                         try self.asmRegisterRegisterImmediate(
-                            .{ ._, .vcvtps2ph },
+                            .{ .v_, .cvtps2ph },
                             dst_reg,
                             dst_reg,
                             Immediate.u(0b1_00),
@@ -6529,13 +6529,13 @@ fn genBinOp(
                         const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
                         defer self.register_manager.unlockReg(tmp_lock);
 
-                        try self.asmRegisterRegister(.{ ._, .vcvtph2ps }, dst_reg.to256(), dst_reg);
+                        try self.asmRegisterRegister(.{ .v_, .cvtph2ps }, dst_reg.to256(), dst_reg);
                         if (src_mcv.isMemory()) try self.asmRegisterMemory(
-                            .{ ._, .vcvtph2ps },
+                            .{ .v_, .cvtph2ps },
                             tmp_reg,
                             src_mcv.mem(.xword),
                         ) else try self.asmRegisterRegister(
-                            .{ ._, .vcvtph2ps },
+                            .{ .v_, .cvtph2ps },
                             tmp_reg,
                             (if (src_mcv.isRegister())
                                 src_mcv.getReg().?
@@ -6544,11 +6544,11 @@ fn genBinOp(
                         );
                         try self.asmRegisterRegisterRegister(
                             switch (air_tag) {
-                                .add => .{ ._, .vaddps },
-                                .sub => .{ ._, .vsubps },
-                                .div_float, .div_trunc, .div_floor, .div_exact => .{ ._, .vdivps },
-                                .max => .{ ._, .vmaxps },
-                                .min => .{ ._, .vmaxps },
+                                .add => .{ .v_ps, .add },
+                                .sub => .{ .v_ps, .sub },
+                                .div_float, .div_trunc, .div_floor, .div_exact => .{ .v_ps, .div },
+                                .max => .{ .v_ps, .max },
+                                .min => .{ .v_ps, .max },
                                 else => unreachable,
                             },
                             dst_reg.to256(),
@@ -6556,7 +6556,7 @@ fn genBinOp(
                             tmp_reg,
                         );
                         try self.asmRegisterRegisterImmediate(
-                            .{ ._, .vcvtps2ph },
+                            .{ .v_, .cvtps2ph },
                             dst_reg,
                             dst_reg.to256(),
                             Immediate.u(0b1_00),
@@ -6567,76 +6567,76 @@ fn genBinOp(
                 } else null,
                 32 => switch (lhs_ty.vectorLen()) {
                     1 => switch (air_tag) {
-                        .add => if (self.hasFeature(.avx)) .{ ._, .vaddss } else .{ ._, .addss },
-                        .sub => if (self.hasFeature(.avx)) .{ ._, .vsubss } else .{ ._, .subss },
-                        .mul => if (self.hasFeature(.avx)) .{ ._, .vmulss } else .{ ._, .mulss },
+                        .add => if (self.hasFeature(.avx)) .{ .v_ss, .add } else .{ ._ss, .add },
+                        .sub => if (self.hasFeature(.avx)) .{ .v_ss, .sub } else .{ ._ss, .sub },
+                        .mul => if (self.hasFeature(.avx)) .{ .v_ss, .mul } else .{ ._ss, .mul },
                         .div_float,
                         .div_trunc,
                         .div_floor,
                         .div_exact,
-                        => if (self.hasFeature(.avx)) .{ ._, .vdivss } else .{ ._, .divss },
-                        .max => if (self.hasFeature(.avx)) .{ ._, .vmaxss } else .{ ._, .maxss },
-                        .min => if (self.hasFeature(.avx)) .{ ._, .vminss } else .{ ._, .minss },
+                        => if (self.hasFeature(.avx)) .{ .v_ss, .div } else .{ ._ss, .div },
+                        .max => if (self.hasFeature(.avx)) .{ .v_ss, .max } else .{ ._ss, .max },
+                        .min => if (self.hasFeature(.avx)) .{ .v_ss, .min } else .{ ._ss, .min },
                         else => unreachable,
                     },
                     2...4 => switch (air_tag) {
-                        .add => if (self.hasFeature(.avx)) .{ ._, .vaddps } else .{ ._, .addps },
-                        .sub => if (self.hasFeature(.avx)) .{ ._, .vsubps } else .{ ._, .subps },
-                        .mul => if (self.hasFeature(.avx)) .{ ._, .vmulps } else .{ ._, .mulps },
+                        .add => if (self.hasFeature(.avx)) .{ .v_ps, .add } else .{ ._ps, .add },
+                        .sub => if (self.hasFeature(.avx)) .{ .v_ps, .sub } else .{ ._ps, .sub },
+                        .mul => if (self.hasFeature(.avx)) .{ .v_ps, .mul } else .{ ._ps, .mul },
                         .div_float,
                         .div_trunc,
                         .div_floor,
                         .div_exact,
-                        => if (self.hasFeature(.avx)) .{ ._, .vdivps } else .{ ._, .divps },
-                        .max => if (self.hasFeature(.avx)) .{ ._, .vmaxps } else .{ ._, .maxps },
-                        .min => if (self.hasFeature(.avx)) .{ ._, .vminps } else .{ ._, .minps },
+                        => if (self.hasFeature(.avx)) .{ .v_ps, .div } else .{ ._ps, .div },
+                        .max => if (self.hasFeature(.avx)) .{ .v_ps, .max } else .{ ._ps, .max },
+                        .min => if (self.hasFeature(.avx)) .{ .v_ps, .min } else .{ ._ps, .min },
                         else => unreachable,
                     },
                     5...8 => if (self.hasFeature(.avx)) switch (air_tag) {
-                        .add => .{ ._, .vaddps },
-                        .sub => .{ ._, .vsubps },
-                        .mul => .{ ._, .vmulps },
-                        .div_float, .div_trunc, .div_floor, .div_exact => .{ ._, .vdivps },
-                        .max => .{ ._, .vmaxps },
-                        .min => .{ ._, .vminps },
+                        .add => .{ .v_ps, .add },
+                        .sub => .{ .v_ps, .sub },
+                        .mul => .{ .v_ps, .mul },
+                        .div_float, .div_trunc, .div_floor, .div_exact => .{ .v_ps, .div },
+                        .max => .{ .v_ps, .max },
+                        .min => .{ .v_ps, .min },
                         else => unreachable,
                     } else null,
                     else => null,
                 },
                 64 => switch (lhs_ty.vectorLen()) {
                     1 => switch (air_tag) {
-                        .add => if (self.hasFeature(.avx)) .{ ._, .vaddsd } else .{ ._, .addsd },
-                        .sub => if (self.hasFeature(.avx)) .{ ._, .vsubsd } else .{ ._, .subsd },
-                        .mul => if (self.hasFeature(.avx)) .{ ._, .vmulsd } else .{ ._, .mulsd },
+                        .add => if (self.hasFeature(.avx)) .{ .v_sd, .add } else .{ ._sd, .add },
+                        .sub => if (self.hasFeature(.avx)) .{ .v_sd, .sub } else .{ ._sd, .sub },
+                        .mul => if (self.hasFeature(.avx)) .{ .v_sd, .mul } else .{ ._sd, .mul },
                         .div_float,
                         .div_trunc,
                         .div_floor,
                         .div_exact,
-                        => if (self.hasFeature(.avx)) .{ ._, .vdivsd } else .{ ._, .divsd },
-                        .max => if (self.hasFeature(.avx)) .{ ._, .vmaxsd } else .{ ._, .maxsd },
-                        .min => if (self.hasFeature(.avx)) .{ ._, .vminsd } else .{ ._, .minsd },
+                        => if (self.hasFeature(.avx)) .{ .v_sd, .div } else .{ ._sd, .div },
+                        .max => if (self.hasFeature(.avx)) .{ .v_sd, .max } else .{ ._sd, .max },
+                        .min => if (self.hasFeature(.avx)) .{ .v_sd, .min } else .{ ._sd, .min },
                         else => unreachable,
                     },
                     2 => switch (air_tag) {
-                        .add => if (self.hasFeature(.avx)) .{ ._, .vaddpd } else .{ ._, .addpd },
-                        .sub => if (self.hasFeature(.avx)) .{ ._, .vsubpd } else .{ ._, .subpd },
-                        .mul => if (self.hasFeature(.avx)) .{ ._, .vmulpd } else .{ ._, .mulpd },
+                        .add => if (self.hasFeature(.avx)) .{ .v_pd, .add } else .{ ._pd, .add },
+                        .sub => if (self.hasFeature(.avx)) .{ .v_pd, .sub } else .{ ._pd, .sub },
+                        .mul => if (self.hasFeature(.avx)) .{ .v_pd, .mul } else .{ ._pd, .mul },
                         .div_float,
                         .div_trunc,
                         .div_floor,
                         .div_exact,
-                        => if (self.hasFeature(.avx)) .{ ._, .vdivpd } else .{ ._, .divpd },
-                        .max => if (self.hasFeature(.avx)) .{ ._, .vmaxpd } else .{ ._, .maxpd },
-                        .min => if (self.hasFeature(.avx)) .{ ._, .vminpd } else .{ ._, .minpd },
+                        => if (self.hasFeature(.avx)) .{ .v_pd, .div } else .{ ._pd, .div },
+                        .max => if (self.hasFeature(.avx)) .{ .v_pd, .max } else .{ ._pd, .max },
+                        .min => if (self.hasFeature(.avx)) .{ .v_pd, .min } else .{ ._pd, .min },
                         else => unreachable,
                     },
                     3...4 => if (self.hasFeature(.avx)) switch (air_tag) {
-                        .add => .{ ._, .vaddpd },
-                        .sub => .{ ._, .vsubpd },
-                        .mul => .{ ._, .vmulpd },
-                        .div_float, .div_trunc, .div_floor, .div_exact => .{ ._, .vdivpd },
-                        .max => .{ ._, .vmaxpd },
-                        .min => .{ ._, .vminpd },
+                        .add => .{ .v_pd, .add },
+                        .sub => .{ .v_pd, .sub },
+                        .mul => .{ .v_pd, .mul },
+                        .div_float, .div_trunc, .div_floor, .div_exact => .{ .v_pd, .div },
+                        .max => .{ .v_pd, .max },
+                        .min => .{ .v_pd, .min },
                         else => unreachable,
                     } else null,
                     else => null,
@@ -7563,13 +7563,13 @@ fn airCmp(self: *Self, inst: Air.Inst.Index, op: math.CompareOperator) !void {
                         defer self.register_manager.unlockReg(tmp2_lock);
 
                         if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
-                            .{ ._, .vpinsrw },
+                            .{ .vp_w, .insr },
                             tmp1_reg,
                             dst_reg.to128(),
                             src_mcv.mem(.word),
                             Immediate.u(1),
                         ) else try self.asmRegisterRegisterRegister(
-                            .{ ._, .vpunpcklwd },
+                            .{ .vp_, .unpcklwd },
                             tmp1_reg,
                             dst_reg.to128(),
                             (if (src_mcv.isRegister())
@@ -7577,20 +7577,20 @@ fn airCmp(self: *Self, inst: Air.Inst.Index, op: math.CompareOperator) !void {
                             else
                                 try self.copyToTmpRegister(ty, src_mcv)).to128(),
                         );
-                        try self.asmRegisterRegister(.{ ._, .vcvtph2ps }, tmp1_reg, tmp1_reg);
-                        try self.asmRegisterRegister(.{ ._, .vmovshdup }, tmp2_reg, tmp1_reg);
-                        try self.genBinOpMir(.{ ._, .ucomiss }, ty, tmp1_mcv, tmp2_mcv);
+                        try self.asmRegisterRegister(.{ .v_, .cvtph2ps }, tmp1_reg, tmp1_reg);
+                        try self.asmRegisterRegister(.{ .v_, .movshdup }, tmp2_reg, tmp1_reg);
+                        try self.genBinOpMir(.{ ._ss, .ucomi }, ty, tmp1_mcv, tmp2_mcv);
                     } else return self.fail("TODO implement airCmp for {}", .{
                         ty.fmt(self.bin_file.options.module.?),
                     }),
                     32 => try self.genBinOpMir(
-                        .{ ._, .ucomiss },
+                        .{ ._ss, .ucomi },
                         ty,
                         .{ .register = dst_reg },
                         src_mcv,
                     ),
                     64 => try self.genBinOpMir(
-                        .{ ._, .ucomisd },
+                        .{ ._sd, .ucomi },
                         ty,
                         .{ .register = dst_reg },
                         src_mcv,
@@ -8573,42 +8573,42 @@ fn movMirTag(self: *Self, ty: Type, aligned: bool) !Mir.Inst.FixedTag {
         else => return .{ ._, .mov },
         .Float => switch (ty.floatBits(self.target.*)) {
             16 => unreachable, // needs special handling
-            32 => return if (self.hasFeature(.avx)) .{ ._, .vmovss } else .{ ._, .movss },
-            64 => return if (self.hasFeature(.avx)) .{ ._, .vmovsd } else .{ ._, .movsd },
+            32 => return if (self.hasFeature(.avx)) .{ .v_ss, .mov } else .{ ._ss, .mov },
+            64 => return if (self.hasFeature(.avx)) .{ .v_sd, .mov } else .{ ._sd, .mov },
             128 => return if (self.hasFeature(.avx))
-                if (aligned) .{ ._, .vmovaps } else .{ ._, .vmovups }
-            else if (aligned) .{ ._, .movaps } else .{ ._, .movups },
+                if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu }
+            else if (aligned) .{ ._ps, .mova } else .{ ._ps, .movu },
             else => {},
         },
         .Vector => switch (ty.childType().zigTypeTag()) {
             .Float => switch (ty.childType().floatBits(self.target.*)) {
                 16 => switch (ty.vectorLen()) {
                     1 => unreachable, // needs special handling
-                    2 => return if (self.hasFeature(.avx)) .{ ._, .vmovss } else .{ ._, .movss },
-                    3...4 => return if (self.hasFeature(.avx)) .{ ._, .vmovsd } else .{ ._, .movsd },
+                    2 => return if (self.hasFeature(.avx)) .{ .v_ss, .mov } else .{ ._ss, .mov },
+                    3...4 => return if (self.hasFeature(.avx)) .{ .v_sd, .mov } else .{ ._sd, .mov },
                     5...8 => return if (self.hasFeature(.avx))
-                        if (aligned) .{ ._, .vmovaps } else .{ ._, .vmovups }
-                    else if (aligned) .{ ._, .movaps } else .{ ._, .movups },
+                        if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu }
+                    else if (aligned) .{ ._ps, .mova } else .{ ._ps, .movu },
                     9...16 => if (self.hasFeature(.avx))
-                        return if (aligned) .{ ._, .vmovaps } else .{ ._, .vmovups },
+                        return if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu },
                     else => {},
                 },
                 32 => switch (ty.vectorLen()) {
-                    1 => return if (self.hasFeature(.avx)) .{ ._, .vmovss } else .{ ._, .movss },
+                    1 => return if (self.hasFeature(.avx)) .{ .v_ss, .mov } else .{ ._ss, .mov },
                     2...4 => return if (self.hasFeature(.avx))
-                        if (aligned) .{ ._, .vmovaps } else .{ ._, .vmovups }
-                    else if (aligned) .{ ._, .movaps } else .{ ._, .movups },
+                        if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu }
+                    else if (aligned) .{ ._ps, .mova } else .{ ._ps, .movu },
                     5...8 => if (self.hasFeature(.avx))
-                        return if (aligned) .{ ._, .vmovaps } else .{ ._, .vmovups },
+                        return if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu },
                     else => {},
                 },
                 64 => switch (ty.vectorLen()) {
-                    1 => return if (self.hasFeature(.avx)) .{ ._, .vmovsd } else .{ ._, .movsd },
+                    1 => return if (self.hasFeature(.avx)) .{ .v_sd, .mov } else .{ ._sd, .mov },
                     2 => return if (self.hasFeature(.avx))
-                        if (aligned) .{ ._, .vmovaps } else .{ ._, .vmovups }
-                    else if (aligned) .{ ._, .movaps } else .{ ._, .movups },
+                        if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu }
+                    else if (aligned) .{ ._ps, .mova } else .{ ._ps, .movu },
                     3...4 => if (self.hasFeature(.avx))
-                        return if (aligned) .{ ._, .vmovaps } else .{ ._, .vmovups },
+                        return if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu },
                     else => {},
                 },
                 else => {},
@@ -8724,11 +8724,11 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr
             if ((dst_reg.class() == .floating_point) == (src_reg.class() == .floating_point))
                 switch (ty.zigTypeTag()) {
                     else => .{ ._, .mov },
-                    .Float, .Vector => .{ ._, .movaps },
+                    .Float, .Vector => .{ ._ps, .mova },
                 }
             else switch (abi_size) {
                 2 => return try self.asmRegisterRegisterImmediate(
-                    if (dst_reg.class() == .floating_point) .{ ._, .pinsrw } else .{ ._, .pextrw },
+                    if (dst_reg.class() == .floating_point) .{ .p_w, .insr } else .{ .p_w, .extr },
                     registerAlias(dst_reg, 4),
                     registerAlias(src_reg, 4),
                     Immediate.u(0),
@@ -8761,7 +8761,7 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr
             });
             if (ty.isRuntimeFloat() and ty.floatBits(self.target.*) == 16)
                 try self.asmRegisterMemoryImmediate(
-                    .{ ._, .pinsrw },
+                    .{ .p_w, .insr },
                     registerAlias(dst_reg, abi_size),
                     src_mem,
                     Immediate.u(0),
@@ -8794,7 +8794,7 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr
                     });
                     return if (ty.isRuntimeFloat() and ty.floatBits(self.target.*) == 16)
                         self.asmRegisterMemoryImmediate(
-                            .{ ._, .pinsrw },
+                            .{ .p_w, .insr },
                             registerAlias(dst_reg, abi_size),
                             src_mem,
                             Immediate.u(0),
@@ -8838,7 +8838,7 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr
             });
             if (ty.isRuntimeFloat() and ty.floatBits(self.target.*) == 16)
                 try self.asmRegisterMemoryImmediate(
-                    .{ ._, .pinsrw },
+                    .{ .p_w, .insr },
                     registerAlias(dst_reg, abi_size),
                     src_mem,
                     Immediate.u(0),
@@ -8952,7 +8952,7 @@ fn genSetMem(self: *Self, base: Memory.Base, disp: i32, ty: Type, src_mcv: MCVal
             );
             if (ty.isRuntimeFloat() and ty.floatBits(self.target.*) == 16)
                 try self.asmMemoryRegisterImmediate(
-                    .{ ._, .pextrw },
+                    .{ .p_w, .extr },
                     dst_mem,
                     src_reg.to128(),
                     Immediate.u(0),
@@ -9069,7 +9069,7 @@ fn genInlineMemcpyRegisterRegister(
         try self.asmMemoryRegister(
             switch (src_reg.class()) {
                 .general_purpose, .segment => .{ ._, .mov },
-                .floating_point => .{ ._, .movss },
+                .floating_point => .{ ._ss, .mov },
             },
             Memory.sib(Memory.PtrSize.fromSize(abi_size), .{ .base = dst_reg, .disp = -offset }),
             registerAlias(src_reg, abi_size),
@@ -10197,21 +10197,21 @@ fn airMulAdd(self: *Self, inst: Air.Inst.Index) !void {
         if (mem.eql(u2, &order, &.{ 1, 3, 2 }) or mem.eql(u2, &order, &.{ 3, 1, 2 }))
             switch (ty.zigTypeTag()) {
                 .Float => switch (ty.floatBits(self.target.*)) {
-                    32 => .{ ._, .vfmadd132ss },
-                    64 => .{ ._, .vfmadd132sd },
+                    32 => .{ .v_ss, .fmadd132 },
+                    64 => .{ .v_sd, .fmadd132 },
                     16, 80, 128 => null,
                     else => unreachable,
                 },
                 .Vector => switch (ty.childType().zigTypeTag()) {
                     .Float => switch (ty.childType().floatBits(self.target.*)) {
                         32 => switch (ty.vectorLen()) {
-                            1 => .{ ._, .vfmadd132ss },
-                            2...8 => .{ ._, .vfmadd132ps },
+                            1 => .{ .v_ss, .fmadd132 },
+                            2...8 => .{ .v_ps, .fmadd132 },
                             else => null,
                         },
                         64 => switch (ty.vectorLen()) {
-                            1 => .{ ._, .vfmadd132sd },
-                            2...4 => .{ ._, .vfmadd132pd },
+                            1 => .{ .v_sd, .fmadd132 },
+                            2...4 => .{ .v_pd, .fmadd132 },
                             else => null,
                         },
                         16, 80, 128 => null,
@@ -10224,21 +10224,21 @@ fn airMulAdd(self: *Self, inst: Air.Inst.Index) !void {
         else if (mem.eql(u2, &order, &.{ 2, 1, 3 }) or mem.eql(u2, &order, &.{ 1, 2, 3 }))
             switch (ty.zigTypeTag()) {
                 .Float => switch (ty.floatBits(self.target.*)) {
-                    32 => .{ ._, .vfmadd213ss },
-                    64 => .{ ._, .vfmadd213sd },
+                    32 => .{ .v_ss, .fmadd213 },
+                    64 => .{ .v_sd, .fmadd213 },
                     16, 80, 128 => null,
                     else => unreachable,
                 },
                 .Vector => switch (ty.childType().zigTypeTag()) {
                     .Float => switch (ty.childType().floatBits(self.target.*)) {
                         32 => switch (ty.vectorLen()) {
-                            1 => .{ ._, .vfmadd213ss },
-                            2...8 => .{ ._, .vfmadd213ps },
+                            1 => .{ .v_ss, .fmadd213 },
+                            2...8 => .{ .v_ps, .fmadd213 },
                             else => null,
                         },
                         64 => switch (ty.vectorLen()) {
-                            1 => .{ ._, .vfmadd213sd },
-                            2...4 => .{ ._, .vfmadd213pd },
+                            1 => .{ .v_sd, .fmadd213 },
+                            2...4 => .{ .v_pd, .fmadd213 },
                             else => null,
                         },
                         16, 80, 128 => null,
@@ -10251,21 +10251,21 @@ fn airMulAdd(self: *Self, inst: Air.Inst.Index) !void {
         else if (mem.eql(u2, &order, &.{ 2, 3, 1 }) or mem.eql(u2, &order, &.{ 3, 2, 1 }))
             switch (ty.zigTypeTag()) {
                 .Float => switch (ty.floatBits(self.target.*)) {
-                    32 => .{ ._, .vfmadd231ss },
-                    64 => .{ ._, .vfmadd231sd },
+                    32 => .{ .v_ss, .fmadd231 },
+                    64 => .{ .v_sd, .fmadd231 },
                     16, 80, 128 => null,
                     else => unreachable,
                 },
                 .Vector => switch (ty.childType().zigTypeTag()) {
                     .Float => switch (ty.childType().floatBits(self.target.*)) {
                         32 => switch (ty.vectorLen()) {
-                            1 => .{ ._, .vfmadd231ss },
-                            2...8 => .{ ._, .vfmadd231ps },
+                            1 => .{ .v_ss, .fmadd231 },
+                            2...8 => .{ .v_ps, .fmadd231 },
                             else => null,
                         },
                         64 => switch (ty.vectorLen()) {
-                            1 => .{ ._, .vfmadd231sd },
-                            2...4 => .{ ._, .vfmadd231pd },
+                            1 => .{ .v_sd, .fmadd231 },
+                            2...4 => .{ .v_pd, .fmadd231 },
                             else => null,
                         },
                         16, 80, 128 => null,
src/arch/x86_64/Mir.zig
@@ -278,8 +278,14 @@ pub const Inst = struct {
         /// Add with carry
         adc,
         /// Add
+        /// Add packed single-precision floating-point values
+        /// Add scalar single-precision floating-point values
+        /// Add packed double-precision floating-point values
+        /// Add scalar double-precision floating-point values
         add,
         /// Logical and
+        /// Bitwise logical and of packed single-precision floating-point values
+        /// Bitwise logical and of packed double-precision floating-point values
         @"and",
         /// Bit scan forward
         bsf,
@@ -304,6 +310,8 @@ pub const Inst = struct {
         cmov,
         /// Logical compare
         /// Compare string
+        /// Compare scalar single-precision floating-point values
+        /// Compare scalar double-precision floating-point values
         cmp,
         /// Compare and exchange
         /// Compare and exchange bytes
@@ -316,6 +324,10 @@ pub const Inst = struct {
         cwde,
         /// Unsigned division
         /// Signed division
+        /// Divide packed single-precision floating-point values
+        /// Divide scalar single-precision floating-point values
+        /// Divide packed double-precision floating-point values
+        /// Divide scalar double-precision floating-point values
         div,
         ///
         int3,
@@ -339,6 +351,8 @@ pub const Inst = struct {
         mfence,
         /// Move
         /// Move data from string to string
+        /// Move scalar single-precision floating-point value
+        /// Move scalar double-precision floating-point value
         /// Move doubleword
         /// Move quadword
         mov,
@@ -350,6 +364,10 @@ pub const Inst = struct {
         movzx,
         /// Multiply
         /// Signed multiplication
+        /// Multiply packed single-precision floating-point values
+        /// Multiply scalar single-precision floating-point values
+        /// Multiply packed double-precision floating-point values
+        /// Multiply scalar double-precision floating-point values
         mul,
         /// Two's complement negation
         neg,
@@ -358,6 +376,8 @@ pub const Inst = struct {
         /// One's complement negation
         not,
         /// Logical or
+        /// Bitwise logical or of packed single-precision floating-point values
+        /// Bitwise logical or of packed double-precision floating-point values
         @"or",
         /// Pop
         pop,
@@ -390,6 +410,10 @@ pub const Inst = struct {
         /// Double precision shift right
         sh,
         /// Subtract
+        /// Subtract packed single-precision floating-point values
+        /// Subtract scalar single-precision floating-point values
+        /// Subtract packed double-precision floating-point values
+        /// Subtract scalar double-precision floating-point values
         sub,
         /// Store string
         sto,
@@ -406,145 +430,88 @@ pub const Inst = struct {
         /// Exchange register/memory with register
         xchg,
         /// Logical exclusive-or
+        /// Bitwise logical xor of packed single-precision floating-point values
+        /// Bitwise logical xor of packed double-precision floating-point values
         xor,
 
-        /// Add packed single-precision floating-point values
-        addps,
-        /// Add scalar single-precision floating-point values
-        addss,
-        /// Bitwise logical and of packed single precision floating-point values
-        andps,
-        /// Bitwise logical and not of packed single precision floating-point values
-        andnps,
-        /// Compare scalar single-precision floating-point values
-        cmpss,
+        /// Bitwise logical and not of packed single-precision floating-point values
+        /// Bitwise logical and not of packed double-precision floating-point values
+        andn,
         /// Convert doubleword integer to scalar single-precision floating-point value
         cvtsi2ss,
-        /// Divide packed single-precision floating-point values
-        divps,
-        /// Divide scalar single-precision floating-point values
-        divss,
         /// Maximum of packed single-precision floating-point values
-        maxps,
         /// Maximum of scalar single-precision floating-point values
-        maxss,
+        /// Maximum of packed double-precision floating-point values
+        /// Maximum of scalar double-precision floating-point values
+        max,
         /// Minimum of packed single-precision floating-point values
-        minps,
         /// Minimum of scalar single-precision floating-point values
-        minss,
+        /// Minimum of packed double-precision floating-point values
+        /// Minimum of scalar double-precision floating-point values
+        min,
         /// Move aligned packed single-precision floating-point values
-        movaps,
+        /// Move aligned packed double-precision floating-point values
+        mova,
         /// Move packed single-precision floating-point values high to low
-        movhlps,
-        /// Move scalar single-precision floating-point value
-        movss,
+        movhl,
         /// Move unaligned packed single-precision floating-point values
-        movups,
-        /// Multiply packed single-precision floating-point values
-        mulps,
-        /// Multiply scalar single-precision floating-point values
-        mulss,
-        /// Bitwise logical or of packed single precision floating-point values
-        orps,
+        /// Move unaligned packed double-precision floating-point values
+        movu,
+        /// Extract byte
         /// Extract word
-        pextrw,
+        /// Extract doubleword
+        /// Extract quadword
+        extr,
+        /// Insert byte
         /// Insert word
-        pinsrw,
+        /// Insert doubleword
+        /// Insert quadword
+        insr,
         /// Square root of packed single-precision floating-point values
-        sqrtps,
         /// Square root of scalar single-precision floating-point value
-        sqrtss,
-        /// Subtract packed single-precision floating-point values
-        subps,
-        /// Subtract scalar single-precision floating-point values
-        subss,
+        /// Square root of packed double-precision floating-point values
+        /// Square root of scalar double-precision floating-point value
+        sqrt,
         /// Unordered compare scalar single-precision floating-point values
-        ucomiss,
+        /// Unordered compare scalar double-precision floating-point values
+        ucomi,
         /// Unpack and interleave high packed single-precision floating-point values
-        unpckhps,
+        /// Unpack and interleave high packed double-precision floating-point values
+        unpckh,
         /// Unpack and interleave low packed single-precision floating-point values
-        unpcklps,
-        /// Bitwise logical xor of packed single precision floating-point values
-        xorps,
+        /// Unpack and interleave low packed double-precision floating-point values
+        unpckl,
 
-        /// Add packed double-precision floating-point values
-        addpd,
-        /// Add scalar double-precision floating-point values
-        addsd,
-        /// Bitwise logical and not of packed double precision floating-point values
-        andnpd,
-        /// Bitwise logical and of packed double precision floating-point values
-        andpd,
-        /// Compare scalar double-precision floating-point values
-        cmpsd,
         /// Convert scalar double-precision floating-point value to scalar single-precision floating-point value
         cvtsd2ss,
         /// Convert doubleword integer to scalar double-precision floating-point value
         cvtsi2sd,
         /// Convert scalar single-precision floating-point value to scalar double-precision floating-point value
         cvtss2sd,
-        /// Divide packed double-precision floating-point values
-        divpd,
-        /// Divide scalar double-precision floating-point values
-        divsd,
-        /// Maximum of packed double-precision floating-point values
-        maxpd,
-        /// Maximum of scalar double-precision floating-point values
-        maxsd,
-        /// Minimum of packed double-precision floating-point values
-        minpd,
-        /// Minimum of scalar double-precision floating-point values
-        minsd,
-        /// Move scalar double-precision floating-point value
-        movsd,
-        /// Multiply packed double-precision floating-point values
-        mulpd,
-        /// Multiply scalar double-precision floating-point values
-        mulsd,
-        /// Bitwise logical or of packed double precision floating-point values
-        orpd,
         /// Shuffle packed high words
-        pshufhw,
+        shufh,
         /// Shuffle packed low words
-        pshuflw,
+        shufl,
         /// Shift packed data right logical
-        psrld,
         /// Shift packed data right logical
-        psrlq,
         /// Shift packed data right logical
-        psrlw,
+        srl,
         /// Unpack high data
-        punpckhbw,
+        unpckhbw,
         /// Unpack high data
-        punpckhdq,
+        unpckhdq,
         /// Unpack high data
-        punpckhqdq,
+        unpckhqdq,
         /// Unpack high data
-        punpckhwd,
+        unpckhwd,
         /// Unpack low data
-        punpcklbw,
+        unpcklbw,
         /// Unpack low data
-        punpckldq,
+        unpckldq,
         /// Unpack low data
-        punpcklqdq,
+        unpcklqdq,
         /// Unpack low data
-        punpcklwd,
-        /// Square root of double precision floating-point values
-        sqrtpd,
-        /// Square root of scalar double precision floating-point value
-        sqrtsd,
-        /// Subtract packed double-precision floating-point values
-        subpd,
-        /// Subtract scalar double-precision floating-point values
-        subsd,
-        /// Unordered compare scalar double-precision floating-point values
-        ucomisd,
-        /// Unpack and interleave high packed double-precision floating-point values
-        unpckhpd,
-        /// Unpack and interleave low packed double-precision floating-point values
-        unpcklpd,
-        /// Bitwise logical xor of packed double precision floating-point values
-        xorpd,
+        unpcklwd,
 
         /// Replicate double floating-point values
         movddup,
@@ -553,199 +520,32 @@ pub const Inst = struct {
         /// Replicate single floating-point values
         movsldup,
 
-        /// Extract Byte
-        pextrb,
-        /// Extract Doubleword
-        pextrd,
-        /// Extract Quadword
-        pextrq,
-        /// Insert Byte
-        pinsrb,
-        /// Insert Doubleword
-        pinsrd,
-        /// Insert Quadword
-        pinsrq,
-        /// Round packed double-precision floating-point values
-        roundpd,
         /// Round packed single-precision floating-point values
-        roundps,
-        /// Round scalar double-precision floating-point value
-        roundsd,
         /// Round scalar single-precision floating-point value
-        roundss,
-
-        /// Add packed double-precision floating-point values
-        vaddpd,
-        /// Add packed single-precision floating-point values
-        vaddps,
-        /// Add scalar double-precision floating-point values
-        vaddsd,
-        /// Add scalar single-precision floating-point values
-        vaddss,
-        /// Convert scalar double-precision floating-point value to scalar single-precision floating-point value
-        vcvtsd2ss,
-        /// Convert doubleword integer to scalar double-precision floating-point value
-        vcvtsi2sd,
-        /// Convert doubleword integer to scalar single-precision floating-point value
-        vcvtsi2ss,
-        /// Convert scalar single-precision floating-point value to scalar double-precision floating-point value
-        vcvtss2sd,
-        /// Divide packed double-precision floating-point values
-        vdivpd,
-        /// Divide packed single-precision floating-point values
-        vdivps,
-        /// Divide scalar double-precision floating-point values
-        vdivsd,
-        /// Divide scalar single-precision floating-point values
-        vdivss,
-        /// Maximum of packed double-precision floating-point values
-        vmaxpd,
-        /// Maximum of packed single-precision floating-point values
-        vmaxps,
-        /// Maximum of scalar double-precision floating-point values
-        vmaxsd,
-        /// Maximum of scalar single-precision floating-point values
-        vmaxss,
-        /// Minimum of packed double-precision floating-point values
-        vminpd,
-        /// Minimum of packed single-precision floating-point values
-        vminps,
-        /// Minimum of scalar double-precision floating-point values
-        vminsd,
-        /// Minimum of scalar single-precision floating-point values
-        vminss,
-        /// Move aligned packed double-precision floating-point values
-        vmovapd,
-        /// Move aligned packed single-precision floating-point values
-        vmovaps,
-        /// Move packed single-precision floating-point values high to low
-        vmovhlps,
-        /// Replicate double floating-point values
-        vmovddup,
-        /// Move or merge scalar double-precision floating-point value
-        vmovsd,
-        /// Replicate single floating-point values
-        vmovshdup,
-        /// Replicate single floating-point values
-        vmovsldup,
-        /// Move or merge scalar single-precision floating-point value
-        vmovss,
-        /// Move unaligned packed double-precision floating-point values
-        vmovupd,
-        /// Move unaligned packed single-precision floating-point values
-        vmovups,
-        /// Multiply packed double-precision floating-point values
-        vmulpd,
-        /// Multiply packed single-precision floating-point values
-        vmulps,
-        /// Multiply scalar double-precision floating-point values
-        vmulsd,
-        /// Multiply scalar single-precision floating-point values
-        vmulss,
-        /// Extract Byte
-        vpextrb,
-        /// Extract Doubleword
-        vpextrd,
-        /// Extract Quadword
-        vpextrq,
-        /// Extract word
-        vpextrw,
-        /// Insert Byte
-        vpinsrb,
-        /// Insert Doubleword
-        vpinsrd,
-        /// Insert Quadword
-        vpinsrq,
-        /// Insert word
-        vpinsrw,
-        /// Shuffle packed high words
-        vpshufhw,
-        /// Shuffle packed low words
-        vpshuflw,
-        /// Shift packed data right logical
-        vpsrld,
-        /// Shift packed data right logical
-        vpsrlq,
-        /// Shift packed data right logical
-        vpsrlw,
-        /// Unpack high data
-        vpunpckhbw,
-        /// Unpack high data
-        vpunpckhdq,
-        /// Unpack high data
-        vpunpckhqdq,
-        /// Unpack high data
-        vpunpckhwd,
-        /// Unpack low data
-        vpunpcklbw,
-        /// Unpack low data
-        vpunpckldq,
-        /// Unpack low data
-        vpunpcklqdq,
-        /// Unpack low data
-        vpunpcklwd,
         /// Round packed double-precision floating-point values
-        vroundpd,
-        /// Round packed single-precision floating-point values
-        vroundps,
         /// Round scalar double-precision floating-point value
-        vroundsd,
-        /// Round scalar single-precision floating-point value
-        vroundss,
-        /// Square root of packed double-precision floating-point value
-        vsqrtpd,
-        /// Square root of packed single-precision floating-point value
-        vsqrtps,
-        /// Square root of scalar double-precision floating-point value
-        vsqrtsd,
-        /// Square root of scalar single-precision floating-point value
-        vsqrtss,
-        /// Subtract packed double-precision floating-point values
-        vsubpd,
-        /// Subtract packed single-precision floating-point values
-        vsubps,
-        /// Subtract scalar double-precision floating-point values
-        vsubsd,
-        /// Subtract scalar single-precision floating-point values
-        vsubss,
-        /// Unpack and interleave high packed double-precision floating-point values
-        vunpckhpd,
-        /// Unpack and interleave high packed single-precision floating-point values
-        vunpckhps,
-        /// Unpack and interleave low packed double-precision floating-point values
-        vunpcklpd,
-        /// Unpack and interleave low packed single-precision floating-point values
-        vunpcklps,
+        round,
 
         /// Convert 16-bit floating-point values to single-precision floating-point values
-        vcvtph2ps,
+        cvtph2ps,
         /// Convert single-precision floating-point values to 16-bit floating-point values
-        vcvtps2ph,
+        cvtps2ph,
 
-        /// Fused multiply-add of packed double-precision floating-point values
-        vfmadd132pd,
-        /// Fused multiply-add of packed double-precision floating-point values
-        vfmadd213pd,
-        /// Fused multiply-add of packed double-precision floating-point values
-        vfmadd231pd,
-        /// Fused multiply-add of packed single-precision floating-point values
-        vfmadd132ps,
         /// Fused multiply-add of packed single-precision floating-point values
-        vfmadd213ps,
-        /// Fused multiply-add of packed single-precision floating-point values
-        vfmadd231ps,
-        /// Fused multiply-add of scalar double-precision floating-point values
-        vfmadd132sd,
-        /// Fused multiply-add of scalar double-precision floating-point values
-        vfmadd213sd,
-        /// Fused multiply-add of scalar double-precision floating-point values
-        vfmadd231sd,
         /// Fused multiply-add of scalar single-precision floating-point values
-        vfmadd132ss,
+        /// Fused multiply-add of packed double-precision floating-point values
+        /// Fused multiply-add of scalar double-precision floating-point values
+        fmadd132,
+        /// Fused multiply-add of packed single-precision floating-point values
         /// Fused multiply-add of scalar single-precision floating-point values
-        vfmadd213ss,
+        /// Fused multiply-add of packed double-precision floating-point values
+        /// Fused multiply-add of scalar double-precision floating-point values
+        fmadd213,
+        /// Fused multiply-add of packed single-precision floating-point values
         /// Fused multiply-add of scalar single-precision floating-point values
-        vfmadd231ss,
+        /// Fused multiply-add of packed double-precision floating-point values
+        /// Fused multiply-add of scalar double-precision floating-point values
+        fmadd231,
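
For readers new to the FMA naming that these merged tags preserve: the 132/213/231 digits only choose operand roles; the first two digits name the operands that are multiplied and the last digit names the addend (Intel operand order op1, op2, op3). A scalar model, for illustration only and not part of the backend:

const std = @import("std");

// Scalar stand-ins for the three numbered forms; the backend emits the
// corresponding vfmadd* instructions rather than calling anything like this.
fn fmadd132(op1: f64, op2: f64, op3: f64) f64 {
    return op1 * op3 + op2;
}
fn fmadd213(op1: f64, op2: f64, op3: f64) f64 {
    return op2 * op1 + op3;
}
fn fmadd231(op1: f64, op2: f64, op3: f64) f64 {
    return op2 * op3 + op1;
}

test "numbered forms differ only in operand roles" {
    try std.testing.expectEqual(@as(f64, 11.0), fmadd132(2.0, 3.0, 4.0)); // 2*4 + 3
    try std.testing.expectEqual(@as(f64, 10.0), fmadd213(2.0, 3.0, 4.0)); // 3*2 + 4
    try std.testing.expectEqual(@as(f64, 14.0), fmadd231(2.0, 3.0, 4.0)); // 3*4 + 2
}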
 
         /// A pseudo instruction that requires special lowering.
         /// This should be the only tag in this enum that doesn't