Commit 057139fda5

Jacob Young <jacobly0@users.noreply.github.com>
2023-05-07 15:06:12
x86_64: implement binary operations for float vectors
1 parent ea957c4
src/arch/x86_64/CodeGen.zig
@@ -1176,6 +1176,21 @@ fn asmRegisterRegisterRegister(
     });
 }
 
+fn asmRegisterRegisterRegisterImmediate( // Builds one `.rrri` instruction (three registers + one immediate) and hands it to `self.addInst`.
+    self: *Self,
+    tag: Mir.Inst.Tag, // stored as the instruction's `.tag`
+    reg1: Register, // stored as `.r1`
+    reg2: Register, // stored as `.r2`
+    reg3: Register, // stored as `.r3`
+    imm: Immediate, // only the `unsigned` payload is read below
+) !void { // any error from `addInst` is propagated via `try`
+    _ = try self.addInst(.{ // the value returned by `addInst` is intentionally discarded
+        .tag = tag,
+        .ops = .rrri,
+        .data = .{ .rrri = .{ .r1 = reg1, .r2 = reg2, .r3 = reg3, .i = @intCast(u8, imm.unsigned) } }, // `@intCast` narrows to u8, so the immediate must fit in 8 bits
+    });
+}
+
 fn asmRegisterRegisterImmediate(
     self: *Self,
     tag: Mir.Inst.Tag,
@@ -2310,20 +2325,31 @@ fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void {
             }),
         }
     } else if (src_bits == 64 and dst_bits == 32) {
-        if (self.hasFeature(.avx)) if (src_mcv.isRegister()) try self.asmRegisterRegisterRegister(
+        if (self.hasFeature(.avx)) if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory(
             .vcvtsd2ss,
             dst_reg,
             dst_reg,
-            src_mcv.getReg().?.to128(),
-        ) else try self.asmRegisterRegisterMemory(
+            src_mcv.mem(.qword),
+        ) else try self.asmRegisterRegisterRegister(
             .vcvtsd2ss,
             dst_reg,
             dst_reg,
+            (if (src_mcv.isRegister())
+                src_mcv.getReg().?
+            else
+                try self.copyToTmpRegister(src_ty, src_mcv)).to128(),
+        ) else if (src_mcv.isMemory()) try self.asmRegisterMemory(
+            .cvtsd2ss,
+            dst_reg,
             src_mcv.mem(.qword),
-        ) else if (src_mcv.isRegister())
-            try self.asmRegisterRegister(.cvtsd2ss, dst_reg, src_mcv.getReg().?.to128())
-        else
-            try self.asmRegisterMemory(.cvtsd2ss, dst_reg, src_mcv.mem(.qword));
+        ) else try self.asmRegisterRegister(
+            .cvtsd2ss,
+            dst_reg,
+            (if (src_mcv.isRegister())
+                src_mcv.getReg().?
+            else
+                try self.copyToTmpRegister(src_ty, src_mcv)).to128(),
+        );
     } else return self.fail("TODO implement airFptrunc from {} to {}", .{
         src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?),
     });
@@ -2360,20 +2386,31 @@ fn airFpext(self: *Self, inst: Air.Inst.Index) !void {
             }),
         }
     } else if (src_bits == 32 and dst_bits == 64) {
-        if (self.hasFeature(.avx)) if (src_mcv.isRegister()) try self.asmRegisterRegisterRegister(
+        if (self.hasFeature(.avx)) if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory(
             .vcvtss2sd,
             dst_reg,
             dst_reg,
-            src_mcv.getReg().?.to128(),
-        ) else try self.asmRegisterRegisterMemory(
+            src_mcv.mem(.dword),
+        ) else try self.asmRegisterRegisterRegister(
             .vcvtss2sd,
             dst_reg,
             dst_reg,
+            (if (src_mcv.isRegister())
+                src_mcv.getReg().?
+            else
+                try self.copyToTmpRegister(src_ty, src_mcv)).to128(),
+        ) else if (src_mcv.isMemory()) try self.asmRegisterMemory(
+            .cvtss2sd,
+            dst_reg,
             src_mcv.mem(.dword),
-        ) else if (src_mcv.isRegister())
-            try self.asmRegisterRegister(.cvtss2sd, dst_reg, src_mcv.getReg().?.to128())
-        else
-            try self.asmRegisterMemory(.cvtss2sd, dst_reg, src_mcv.mem(.dword));
+        ) else try self.asmRegisterRegister(
+            .cvtss2sd,
+            dst_reg,
+            (if (src_mcv.isRegister())
+                src_mcv.getReg().?
+            else
+                try self.copyToTmpRegister(src_ty, src_mcv)).to128(),
+        );
     } else return self.fail("TODO implement airFpext from {} to {}", .{
         src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?),
     });
@@ -4532,7 +4569,7 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void {
     defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
 
     const result: MCValue = result: {
-        const tag = if (@as(?Mir.Inst.Tag, switch (ty.zigTypeTag()) {
+        const mir_tag = if (@as(?Mir.Inst.Tag, switch (ty.zigTypeTag()) {
             .Float => switch (ty.floatBits(self.target.*)) {
                 16 => if (self.hasFeature(.f16c)) {
                     const mat_src_reg = if (src_mcv.isRegister())
@@ -4558,11 +4595,14 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void {
                 .Float => switch (ty.childType().floatBits(self.target.*)) {
                     16 => if (self.hasFeature(.f16c)) switch (ty.vectorLen()) {
                         1 => {
-                            const mat_src_reg = if (src_mcv.isRegister())
-                                src_mcv.getReg().?
-                            else
-                                try self.copyToTmpRegister(ty, src_mcv);
-                            try self.asmRegisterRegister(.vcvtph2ps, dst_reg, mat_src_reg.to128());
+                            try self.asmRegisterRegister(
+                                .vcvtph2ps,
+                                dst_reg,
+                                (if (src_mcv.isRegister())
+                                    src_mcv.getReg().?
+                                else
+                                    try self.copyToTmpRegister(ty, src_mcv)).to128(),
+                            );
                             try self.asmRegisterRegisterRegister(.vsqrtss, dst_reg, dst_reg, dst_reg);
                             try self.asmRegisterRegisterImmediate(
                                 .vcvtps2ph,
@@ -4574,16 +4614,19 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void {
                         },
                         2...8 => {
                             const wide_reg = registerAlias(dst_reg, abi_size * 2);
-                            if (src_mcv.isRegister()) try self.asmRegisterRegister(
-                                .vcvtph2ps,
-                                wide_reg,
-                                src_mcv.getReg().?.to128(),
-                            ) else try self.asmRegisterMemory(
+                            if (src_mcv.isMemory()) try self.asmRegisterMemory(
                                 .vcvtph2ps,
                                 wide_reg,
                                 src_mcv.mem(Memory.PtrSize.fromSize(
                                     @intCast(u32, @divExact(wide_reg.bitSize(), 16)),
                                 )),
+                            ) else try self.asmRegisterRegister(
+                                .vcvtph2ps,
+                                wide_reg,
+                                (if (src_mcv.isRegister())
+                                    src_mcv.getReg().?
+                                else
+                                    try self.copyToTmpRegister(ty, src_mcv)).to128(),
                             );
                             try self.asmRegisterRegister(.vsqrtps, wide_reg, wide_reg);
                             try self.asmRegisterRegisterImmediate(
@@ -4617,26 +4660,32 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void {
         })) |tag| tag else return self.fail("TODO implement airSqrt for {}", .{
             ty.fmt(self.bin_file.options.module.?),
         });
-        switch (tag) {
-            .vsqrtss, .vsqrtsd => if (src_mcv.isRegister()) try self.asmRegisterRegisterRegister(
-                tag,
+        switch (mir_tag) {
+            .vsqrtss, .vsqrtsd => if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory(
+                mir_tag,
                 dst_reg,
                 dst_reg,
-                registerAlias(src_mcv.getReg().?, abi_size),
-            ) else try self.asmRegisterRegisterMemory(
-                tag,
+                src_mcv.mem(Memory.PtrSize.fromSize(abi_size)),
+            ) else try self.asmRegisterRegisterRegister(
+                mir_tag,
                 dst_reg,
                 dst_reg,
-                src_mcv.mem(Memory.PtrSize.fromSize(abi_size)),
+                registerAlias(if (src_mcv.isRegister())
+                    src_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(ty, src_mcv), abi_size),
             ),
-            else => if (src_mcv.isRegister()) try self.asmRegisterRegister(
-                tag,
-                dst_reg,
-                registerAlias(src_mcv.getReg().?, abi_size),
-            ) else try self.asmRegisterMemory(
-                tag,
+            else => if (src_mcv.isMemory()) try self.asmRegisterMemory(
+                mir_tag,
                 dst_reg,
                 src_mcv.mem(Memory.PtrSize.fromSize(abi_size)),
+            ) else try self.asmRegisterRegister(
+                mir_tag,
+                dst_reg,
+                registerAlias(if (src_mcv.isRegister())
+                    src_mcv.getReg().?
+                else
+                    try self.copyToTmpRegister(ty, src_mcv), abi_size),
             ),
         }
         break :result dst_mcv;
@@ -5800,25 +5849,22 @@ fn genMulDivBinOp(
     }
 }
 
-/// Result is always a register.
 fn genBinOp(
     self: *Self,
     maybe_inst: ?Air.Inst.Index,
-    tag: Air.Inst.Tag,
+    air_tag: Air.Inst.Tag,
     lhs_air: Air.Inst.Ref,
     rhs_air: Air.Inst.Ref,
 ) !MCValue {
-    const lhs = try self.resolveInst(lhs_air);
-    const rhs = try self.resolveInst(rhs_air);
+    const lhs_mcv = try self.resolveInst(lhs_air);
+    const rhs_mcv = try self.resolveInst(rhs_air);
     const lhs_ty = self.air.typeOf(lhs_air);
     const rhs_ty = self.air.typeOf(rhs_air);
-    if (lhs_ty.zigTypeTag() == .Vector) {
-        return self.fail("TODO implement genBinOp for {}", .{lhs_ty.fmt(self.bin_file.options.module.?)});
-    }
+    const abi_size = @intCast(u32, lhs_ty.abiSize(self.target.*));
 
-    switch (lhs) {
+    switch (lhs_mcv) {
         .immediate => |imm| switch (imm) {
-            0 => switch (tag) {
+            0 => switch (air_tag) {
                 .sub, .subwrap => return self.genUnOp(maybe_inst, .neg, rhs_air),
                 else => {},
             },
@@ -5827,9 +5873,10 @@ fn genBinOp(
         else => {},
     }
 
-    const is_commutative = switch (tag) {
+    const is_commutative = switch (air_tag) {
         .add,
         .addwrap,
+        .mul,
         .bool_or,
         .bit_or,
         .bool_and,
@@ -5841,48 +5888,42 @@ fn genBinOp(
 
         else => false,
     };
-    const dst_mem_ok = switch (tag) {
-        .add,
-        .addwrap,
-        .sub,
-        .subwrap,
-        .mul,
-        .div_float,
-        .div_exact,
-        .div_trunc,
-        .div_floor,
-        => !lhs_ty.isRuntimeFloat(),
-
-        else => true,
+    const vec_op = switch (lhs_ty.zigTypeTag()) {
+        else => false,
+        .Float, .Vector => true,
     };
 
-    const lhs_lock: ?RegisterLock = switch (lhs) {
+    const lhs_lock: ?RegisterLock = switch (lhs_mcv) {
         .register => |reg| self.register_manager.lockRegAssumeUnused(reg),
         else => null,
     };
     defer if (lhs_lock) |lock| self.register_manager.unlockReg(lock);
 
-    const rhs_lock: ?RegisterLock = switch (rhs) {
+    const rhs_lock: ?RegisterLock = switch (rhs_mcv) {
         .register => |reg| self.register_manager.lockReg(reg),
         else => null,
     };
     defer if (rhs_lock) |lock| self.register_manager.unlockReg(lock);
 
-    var flipped: bool = false;
+    var flipped = false;
+    var copied_to_dst = true;
     const dst_mcv: MCValue = dst: {
         if (maybe_inst) |inst| {
-            if ((dst_mem_ok or lhs.isRegister()) and self.reuseOperand(inst, lhs_air, 0, lhs)) {
-                break :dst lhs;
+            if ((!vec_op or lhs_mcv.isRegister()) and self.reuseOperand(inst, lhs_air, 0, lhs_mcv)) {
+                break :dst lhs_mcv;
             }
-            if (is_commutative and (dst_mem_ok or rhs.isRegister()) and
-                self.reuseOperand(inst, rhs_air, 1, rhs))
+            if (is_commutative and (!vec_op or rhs_mcv.isRegister()) and
+                self.reuseOperand(inst, rhs_air, 1, rhs_mcv))
             {
                 flipped = true;
-                break :dst rhs;
+                break :dst rhs_mcv;
             }
         }
         const dst_mcv = try self.allocRegOrMemAdvanced(lhs_ty, maybe_inst, true);
-        try self.genCopy(lhs_ty, dst_mcv, lhs);
+        if (vec_op and lhs_mcv.isRegister() and self.hasFeature(.avx))
+            copied_to_dst = false
+        else
+            try self.genCopy(lhs_ty, dst_mcv, lhs_mcv);
         break :dst dst_mcv;
     };
     const dst_lock: ?RegisterLock = switch (dst_mcv) {
@@ -5891,160 +5932,47 @@ fn genBinOp(
     };
     defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
 
-    const src_mcv = if (flipped) lhs else rhs;
-    switch (tag) {
-        .add,
-        .addwrap,
-        => try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) {
-            else => .add,
-            .Float => switch (lhs_ty.floatBits(self.target.*)) {
-                32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse))
-                    .addss
-                else
-                    return self.fail("TODO implement genBinOp for {s} {} without sse", .{
-                        @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
-                    }),
-                64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2))
-                    .addsd
-                else
-                    return self.fail("TODO implement genBinOp for {s} {} without sse2", .{
-                        @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
-                    }),
-                else => return self.fail("TODO implement genBinOp for {s} {}", .{
-                    @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
-                }),
-            },
-        }, lhs_ty, dst_mcv, src_mcv),
-
-        .sub,
-        .subwrap,
-        => try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) {
-            else => .sub,
-            .Float => switch (lhs_ty.floatBits(self.target.*)) {
-                32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse))
-                    .subss
-                else
-                    return self.fail("TODO implement genBinOp for {s} {} without sse", .{
-                        @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
-                    }),
-                64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2))
-                    .subsd
-                else
-                    return self.fail("TODO implement genBinOp for {s} {} without sse2", .{
-                        @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
-                    }),
-                else => return self.fail("TODO implement genBinOp for {s} {}", .{
-                    @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
-                }),
-            },
-        }, lhs_ty, dst_mcv, src_mcv),
-
-        .mul => try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) {
-            else => return self.fail("TODO implement genBinOp for {s} {}", .{
-                @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
-            }),
-            .Float => switch (lhs_ty.floatBits(self.target.*)) {
-                32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse))
-                    .mulss
-                else
-                    return self.fail("TODO implement genBinOp for {s} {} without sse", .{
-                        @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
-                    }),
-                64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2))
-                    .mulsd
-                else
-                    return self.fail("TODO implement genBinOp for {s} {} without sse2", .{
-                        @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
-                    }),
-                else => return self.fail("TODO implement genBinOp for {s} {}", .{
-                    @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
-                }),
-            },
-        }, lhs_ty, dst_mcv, src_mcv),
+    const src_mcv = if (flipped) lhs_mcv else rhs_mcv;
+    if (!vec_op) {
+        switch (air_tag) {
+            .add,
+            .addwrap,
+            => try self.genBinOpMir(.add, lhs_ty, dst_mcv, src_mcv),
 
-        .div_float,
-        .div_exact,
-        .div_trunc,
-        .div_floor,
-        => {
-            try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) {
-                else => return self.fail("TODO implement genBinOp for {s} {}", .{
-                    @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
-                }),
-                .Float => switch (lhs_ty.floatBits(self.target.*)) {
-                    32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse))
-                        .divss
-                    else
-                        return self.fail("TODO implement genBinOp for {s} {} without sse", .{
-                            @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
-                        }),
-                    64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2))
-                        .divsd
-                    else
-                        return self.fail("TODO implement genBinOp for {s} {} without sse2", .{
-                            @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
-                        }),
-                    else => return self.fail("TODO implement genBinOp for {s} {}", .{
-                        @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
-                    }),
-                },
-            }, lhs_ty, dst_mcv, src_mcv);
-            switch (tag) {
-                .div_float,
-                .div_exact,
-                => {},
-                .div_trunc,
-                .div_floor,
-                => if (self.hasFeature(.sse4_1)) {
-                    const abi_size = @intCast(u32, lhs_ty.abiSize(self.target.*));
-                    const dst_alias = registerAlias(dst_mcv.register, abi_size);
-                    try self.asmRegisterRegisterImmediate(switch (lhs_ty.floatBits(self.target.*)) {
-                        32 => .roundss,
-                        64 => .roundsd,
-                        else => unreachable,
-                    }, dst_alias, dst_alias, Immediate.u(switch (tag) {
-                        .div_trunc => 0b1_0_11,
-                        .div_floor => 0b1_0_01,
-                        else => unreachable,
-                    }));
-                } else return self.fail("TODO implement genBinOp for {s} {} without sse4_1", .{
-                    @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
-                }),
-                else => unreachable,
-            }
-        },
+            .sub,
+            .subwrap,
+            => try self.genBinOpMir(.sub, lhs_ty, dst_mcv, src_mcv),
 
-        .ptr_add,
-        .ptr_sub,
-        => {
-            const tmp_reg = try self.copyToTmpRegister(rhs_ty, src_mcv);
-            const tmp_mcv = MCValue{ .register = tmp_reg };
-            const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
-            defer self.register_manager.unlockReg(tmp_lock);
+            .ptr_add,
+            .ptr_sub,
+            => {
+                const tmp_reg = try self.copyToTmpRegister(rhs_ty, src_mcv);
+                const tmp_mcv = MCValue{ .register = tmp_reg };
+                const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+                defer self.register_manager.unlockReg(tmp_lock);
 
-            const elem_size = lhs_ty.elemType2().abiSize(self.target.*);
-            try self.genIntMulComplexOpMir(rhs_ty, tmp_mcv, .{ .immediate = elem_size });
-            try self.genBinOpMir(switch (tag) {
-                .ptr_add => .add,
-                .ptr_sub => .sub,
-                else => unreachable,
-            }, lhs_ty, dst_mcv, tmp_mcv);
-        },
+                const elem_size = lhs_ty.elemType2().abiSize(self.target.*);
+                try self.genIntMulComplexOpMir(rhs_ty, tmp_mcv, .{ .immediate = elem_size });
+                try self.genBinOpMir(switch (air_tag) {
+                    .ptr_add => .add,
+                    .ptr_sub => .sub,
+                    else => unreachable,
+                }, lhs_ty, dst_mcv, tmp_mcv);
+            },
 
-        .bool_or,
-        .bit_or,
-        => try self.genBinOpMir(.@"or", lhs_ty, dst_mcv, src_mcv),
+            .bool_or,
+            .bit_or,
+            => try self.genBinOpMir(.@"or", lhs_ty, dst_mcv, src_mcv),
 
-        .bool_and,
-        .bit_and,
-        => try self.genBinOpMir(.@"and", lhs_ty, dst_mcv, src_mcv),
+            .bool_and,
+            .bit_and,
+            => try self.genBinOpMir(.@"and", lhs_ty, dst_mcv, src_mcv),
 
-        .xor => try self.genBinOpMir(.xor, lhs_ty, dst_mcv, src_mcv),
+            .xor => try self.genBinOpMir(.xor, lhs_ty, dst_mcv, src_mcv),
 
-        .min,
-        .max,
-        => switch (lhs_ty.zigTypeTag()) {
-            .Int => {
+            .min,
+            .max,
+            => {
                 const mat_src_mcv: MCValue = if (switch (src_mcv) {
                     .immediate,
                     .eflags,
@@ -6070,12 +5998,12 @@ fn genBinOp(
 
                 const int_info = lhs_ty.intInfo(self.target.*);
                 const cc: Condition = switch (int_info.signedness) {
-                    .unsigned => switch (tag) {
+                    .unsigned => switch (air_tag) {
                         .min => .a,
                         .max => .b,
                         else => unreachable,
                     },
-                    .signed => switch (tag) {
+                    .signed => switch (air_tag) {
                         .min => .g,
                         .max => .l,
                         else => unreachable,
@@ -6134,26 +6062,222 @@ fn genBinOp(
                 }
                 try self.genCopy(lhs_ty, dst_mcv, .{ .register = tmp_reg });
             },
-            .Float => try self.genBinOpMir(switch (lhs_ty.floatBits(self.target.*)) {
-                32 => switch (tag) {
-                    .min => .minss,
-                    .max => .maxss,
-                    else => unreachable,
-                },
-                64 => switch (tag) {
-                    .min => .minsd,
-                    .max => .maxsd,
-                    else => unreachable,
-                },
-                else => return self.fail("TODO implement genBinOp for {s} {}", .{
-                    @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
-                }),
-            }, lhs_ty, dst_mcv, src_mcv),
+
             else => return self.fail("TODO implement genBinOp for {s} {}", .{
-                @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
             }),
-        },
+        }
+        return dst_mcv;
+    }
 
+    const mir_tag = if (@as(?Mir.Inst.Tag, switch (lhs_ty.zigTypeTag()) {
+        else => unreachable,
+        .Float => switch (lhs_ty.floatBits(self.target.*)) {
+            32 => switch (air_tag) {
+                .add => if (self.hasFeature(.avx)) .vaddss else .addss,
+                .sub => if (self.hasFeature(.avx)) .vsubss else .subss,
+                .mul => if (self.hasFeature(.avx)) .vmulss else .mulss,
+                .div_float,
+                .div_trunc,
+                .div_floor,
+                .div_exact,
+                => if (self.hasFeature(.avx)) .vdivss else .divss,
+                .max => if (self.hasFeature(.avx)) .vmaxss else .maxss,
+                .min => if (self.hasFeature(.avx)) .vminss else .minss,
+                else => unreachable,
+            },
+            64 => switch (air_tag) {
+                .add => if (self.hasFeature(.avx)) .vaddsd else .addsd,
+                .sub => if (self.hasFeature(.avx)) .vsubsd else .subsd,
+                .mul => if (self.hasFeature(.avx)) .vmulsd else .mulsd,
+                .div_float,
+                .div_trunc,
+                .div_floor,
+                .div_exact,
+                => if (self.hasFeature(.avx)) .vdivsd else .divsd,
+                .max => if (self.hasFeature(.avx)) .vmaxsd else .maxsd,
+                .min => if (self.hasFeature(.avx)) .vminsd else .minsd,
+                else => unreachable,
+            },
+            16, 80, 128 => null,
+            else => unreachable,
+        },
+        .Vector => switch (lhs_ty.childType().zigTypeTag()) {
+            else => null,
+            .Float => switch (lhs_ty.childType().floatBits(self.target.*)) {
+                32 => switch (lhs_ty.vectorLen()) {
+                    1 => switch (air_tag) {
+                        .add => if (self.hasFeature(.avx)) .vaddss else .addss,
+                        .sub => if (self.hasFeature(.avx)) .vsubss else .subss,
+                        .mul => if (self.hasFeature(.avx)) .vmulss else .mulss,
+                        .div_float,
+                        .div_trunc,
+                        .div_floor,
+                        .div_exact,
+                        => if (self.hasFeature(.avx)) .vdivss else .divss,
+                        .max => if (self.hasFeature(.avx)) .vmaxss else .maxss,
+                        .min => if (self.hasFeature(.avx)) .vminss else .minss,
+                        else => unreachable,
+                    },
+                    2...4 => switch (air_tag) {
+                        .add => if (self.hasFeature(.avx)) .vaddps else .addps,
+                        .sub => if (self.hasFeature(.avx)) .vsubps else .subps,
+                        .mul => if (self.hasFeature(.avx)) .vmulps else .mulps,
+                        .div_float,
+                        .div_trunc,
+                        .div_floor,
+                        .div_exact,
+                        => if (self.hasFeature(.avx)) .vdivps else .divps,
+                        .max => if (self.hasFeature(.avx)) .vmaxps else .maxps,
+                        .min => if (self.hasFeature(.avx)) .vminps else .minps,
+                        else => unreachable,
+                    },
+                    5...8 => if (self.hasFeature(.avx)) switch (air_tag) {
+                        .add => .vaddps,
+                        .sub => .vsubps,
+                        .mul => .vmulps,
+                        .div_float, .div_trunc, .div_floor, .div_exact => .vdivps,
+                        .max => .vmaxps,
+                        .min => .vminps,
+                        else => unreachable,
+                    } else null,
+                    else => null,
+                },
+                64 => switch (lhs_ty.vectorLen()) {
+                    1 => switch (air_tag) {
+                        .add => if (self.hasFeature(.avx)) .vaddsd else .addsd,
+                        .sub => if (self.hasFeature(.avx)) .vsubsd else .subsd,
+                        .mul => if (self.hasFeature(.avx)) .vmulsd else .mulsd,
+                        .div_float,
+                        .div_trunc,
+                        .div_floor,
+                        .div_exact,
+                        => if (self.hasFeature(.avx)) .vdivsd else .divsd,
+                        .max => if (self.hasFeature(.avx)) .vmaxsd else .maxsd,
+                        .min => if (self.hasFeature(.avx)) .vminsd else .minsd,
+                        else => unreachable,
+                    },
+                    2 => switch (air_tag) {
+                        .add => if (self.hasFeature(.avx)) .vaddpd else .addpd,
+                        .sub => if (self.hasFeature(.avx)) .vsubpd else .subpd,
+                        .mul => if (self.hasFeature(.avx)) .vmulpd else .mulpd,
+                        .div_float,
+                        .div_trunc,
+                        .div_floor,
+                        .div_exact,
+                        => if (self.hasFeature(.avx)) .vdivpd else .divpd,
+                        .max => if (self.hasFeature(.avx)) .vmaxpd else .maxpd,
+                        .min => if (self.hasFeature(.avx)) .vminpd else .minpd,
+                        else => unreachable,
+                    },
+                    3...4 => if (self.hasFeature(.avx)) switch (air_tag) {
+                        .add => .vaddpd,
+                        .sub => .vsubpd,
+                        .mul => .vmulpd,
+                        .div_float, .div_trunc, .div_floor, .div_exact => .vdivpd,
+                        .max => .vmaxpd,
+                        .min => .vminpd,
+                        else => unreachable,
+                    } else null,
+                    else => null,
+                },
+                16, 80, 128 => null,
+                else => unreachable,
+            },
+        },
+    })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{
+        @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
+    });
+    const dst_alias = registerAlias(dst_mcv.getReg().?, abi_size);
+    if (self.hasFeature(.avx)) {
+        const src1_alias =
+            if (copied_to_dst) dst_alias else registerAlias(lhs_mcv.getReg().?, abi_size);
+        if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory(
+            mir_tag,
+            dst_alias,
+            src1_alias,
+            src_mcv.mem(Memory.PtrSize.fromSize(abi_size)),
+        ) else try self.asmRegisterRegisterRegister(
+            mir_tag,
+            dst_alias,
+            src1_alias,
+            registerAlias(if (src_mcv.isRegister())
+                src_mcv.getReg().?
+            else
+                try self.copyToTmpRegister(rhs_ty, src_mcv), abi_size),
+        );
+    } else {
+        assert(copied_to_dst);
+        if (src_mcv.isMemory()) try self.asmRegisterMemory(
+            mir_tag,
+            dst_alias,
+            src_mcv.mem(Memory.PtrSize.fromSize(abi_size)),
+        ) else try self.asmRegisterRegister(
+            mir_tag,
+            dst_alias,
+            registerAlias(if (src_mcv.isRegister())
+                src_mcv.getReg().?
+            else
+                try self.copyToTmpRegister(rhs_ty, src_mcv), abi_size),
+        );
+    }
+    switch (air_tag) {
+        .add, .sub, .mul, .div_float, .div_exact => {},
+        .div_trunc, .div_floor => if (self.hasFeature(.sse4_1)) {
+            const round_tag = if (@as(?Mir.Inst.Tag, switch (lhs_ty.zigTypeTag()) {
+                .Float => switch (lhs_ty.floatBits(self.target.*)) {
+                    32 => if (self.hasFeature(.avx)) .vroundss else .roundss,
+                    64 => if (self.hasFeature(.avx)) .vroundsd else .roundsd,
+                    16, 80, 128 => null,
+                    else => unreachable,
+                },
+                .Vector => switch (lhs_ty.childType().zigTypeTag()) {
+                    .Float => switch (lhs_ty.childType().floatBits(self.target.*)) {
+                        32 => switch (lhs_ty.vectorLen()) {
+                            1 => if (self.hasFeature(.avx)) .vroundss else .roundss,
+                            2...4 => if (self.hasFeature(.avx)) .vroundps else .roundps,
+                            5...8 => if (self.hasFeature(.avx)) .vroundps else null,
+                            else => null,
+                        },
+                        64 => switch (lhs_ty.vectorLen()) {
+                            1 => if (self.hasFeature(.avx)) .vroundsd else .roundsd,
+                            2 => if (self.hasFeature(.avx)) .vroundpd else .roundpd,
+                            3...4 => if (self.hasFeature(.avx)) .vroundpd else null,
+                            else => null,
+                        },
+                        16, 80, 128 => null,
+                        else => unreachable,
+                    },
+                    else => null,
+                },
+                else => unreachable,
+            })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{
+                @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
+            });
+            const round_mode = Immediate.u(switch (air_tag) {
+                .div_trunc => 0b1_0_11,
+                .div_floor => 0b1_0_01,
+                else => unreachable,
+            });
+            switch (round_tag) {
+                .vroundss, .vroundsd => try self.asmRegisterRegisterRegisterImmediate(
+                    round_tag,
+                    dst_alias,
+                    dst_alias,
+                    dst_alias,
+                    round_mode,
+                ),
+                else => try self.asmRegisterRegisterImmediate(
+                    round_tag,
+                    dst_alias,
+                    dst_alias,
+                    round_mode,
+                ),
+            }
+        } else return self.fail("TODO implement genBinOp for {s} {} without sse4_1", .{
+            @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
+        }),
+        .max, .min => {}, // TODO: unordered select
         else => unreachable,
     }
     return dst_mcv;
@@ -6186,20 +6310,11 @@ fn genBinOpMir(self: *Self, mir_tag: Mir.Inst.Tag, ty: Type, dst_mcv: MCValue, s
                 .register_overflow,
                 .reserved_frame,
                 => unreachable,
-                .register => |src_reg| switch (ty.zigTypeTag()) {
-                    .Float => {
-                        if (!Target.x86.featureSetHas(self.target.cpu.features, .sse))
-                            return self.fail("TODO genBinOpMir for {s} {} without sse", .{
-                                @tagName(mir_tag), ty.fmt(self.bin_file.options.module.?),
-                            });
-                        return self.asmRegisterRegister(mir_tag, dst_reg.to128(), src_reg.to128());
-                    },
-                    else => try self.asmRegisterRegister(
-                        mir_tag,
-                        dst_alias,
-                        registerAlias(src_reg, abi_size),
-                    ),
-                },
+                .register => |src_reg| try self.asmRegisterRegister(
+                    mir_tag,
+                    dst_alias,
+                    registerAlias(src_reg, abi_size),
+                ),
                 .immediate => |imm| switch (self.regBitSize(ty)) {
                     8 => try self.asmRegisterImmediate(
                         mir_tag,
@@ -9646,7 +9761,7 @@ fn airMulAdd(self: *Self, inst: Air.Inst.Index) !void {
         lock.* = self.register_manager.lockRegAssumeUnused(reg);
     }
 
-    const tag = if (@as(
+    const mir_tag = if (@as(
         ?Mir.Inst.Tag,
         if (mem.eql(u2, &order, &.{ 1, 3, 2 }) or mem.eql(u2, &order, &.{ 3, 1, 2 }))
             switch (ty.zigTypeTag()) {
@@ -9741,20 +9856,17 @@ fn airMulAdd(self: *Self, inst: Air.Inst.Index) !void {
     const abi_size = @intCast(u32, ty.abiSize(self.target.*));
     const mop1_reg = registerAlias(mops[0].getReg().?, abi_size);
     const mop2_reg = registerAlias(mops[1].getReg().?, abi_size);
-    if (mops[2].isRegister())
-        try self.asmRegisterRegisterRegister(
-            tag,
-            mop1_reg,
-            mop2_reg,
-            registerAlias(mops[2].getReg().?, abi_size),
-        )
-    else
-        try self.asmRegisterRegisterMemory(
-            tag,
-            mop1_reg,
-            mop2_reg,
-            mops[2].mem(Memory.PtrSize.fromSize(abi_size)),
-        );
+    if (mops[2].isRegister()) try self.asmRegisterRegisterRegister(
+        mir_tag,
+        mop1_reg,
+        mop2_reg,
+        registerAlias(mops[2].getReg().?, abi_size),
+    ) else try self.asmRegisterRegisterMemory(
+        mir_tag,
+        mop1_reg,
+        mop2_reg,
+        mops[2].mem(Memory.PtrSize.fromSize(abi_size)),
+    );
     return self.finishAir(inst, mops[0], ops);
 }
 
src/arch/x86_64/Encoding.zig
@@ -262,61 +262,69 @@ pub const Mnemonic = enum {
     // MMX
     movd,
     // SSE
-    addss,
+    addps, addss,
     andps,
     andnps,
     cmpss,
     cvtsi2ss,
-    divss,
-    maxss, minss,
+    divps, divss,
+    maxps, maxss,
+    minps, minss,
     movaps, movss, movups,
-    mulss,
+    mulps, mulss,
     orps,
     pextrw, pinsrw,
-    sqrtps,
-    sqrtss,
-    subss,
+    sqrtps, sqrtss,
+    subps, subss,
     ucomiss,
     xorps,
     // SSE2
-    addsd,
+    addpd, addsd,
     andpd,
     andnpd,
     //cmpsd,
     cvtsd2ss, cvtsi2sd, cvtss2sd,
-    divsd,
-    maxsd, minsd,
+    divpd, divsd,
+    maxpd, maxsd,
+    minpd, minsd,
     movapd,
     movq, //movd, movsd,
     movupd,
-    mulsd,
+    mulpd, mulsd,
     orpd,
     pshufhw, pshuflw,
     psrld, psrlq, psrlw,
     punpckhbw, punpckhdq, punpckhqdq, punpckhwd,
     punpcklbw, punpckldq, punpcklqdq, punpcklwd,
     sqrtpd, sqrtsd,
-    subsd,
+    subpd, subsd,
     ucomisd,
     xorpd,
     // SSE3
     movddup, movshdup, movsldup,
     // SSE4.1
-    roundsd, roundss,
+    roundpd, roundps, roundsd, roundss,
     // AVX
+    vaddpd, vaddps, vaddsd, vaddss,
     vcvtsd2ss, vcvtsi2sd, vcvtsi2ss, vcvtss2sd,
+    vdivpd, vdivps, vdivsd, vdivss,
+    vmaxpd, vmaxps, vmaxsd, vmaxss,
+    vminpd, vminps, vminsd, vminss,
     vmovapd, vmovaps,
     vmovddup,
     vmovsd,
     vmovshdup, vmovsldup,
     vmovss,
     vmovupd, vmovups,
+    vmulpd, vmulps, vmulsd, vmulss,
     vpextrw, vpinsrw,
     vpshufhw, vpshuflw,
     vpsrld, vpsrlq, vpsrlw,
     vpunpckhbw, vpunpckhdq, vpunpckhqdq, vpunpckhwd,
     vpunpcklbw, vpunpckldq, vpunpcklqdq, vpunpcklwd,
+    vroundpd, vroundps, vroundsd, vroundss,
     vsqrtpd, vsqrtps, vsqrtsd, vsqrtss,
+    vsubpd, vsubps, vsubsd, vsubss,
     // F16C
     vcvtph2ps, vcvtps2ph,
     // FMA
src/arch/x86_64/encodings.zig
@@ -837,6 +837,8 @@ pub const table = [_]Entry{
     .{ .xor, .rm, &.{ .r64,  .rm64   }, &.{ 0x33 }, 0, .long,  .none },
 
     // SSE
+    .{ .addps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x58 }, 0, .none, .sse },
+
     .{ .addss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x58 }, 0, .none, .sse },
 
     .{ .andnps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x55 }, 0, .none, .sse },
@@ -848,10 +850,16 @@ pub const table = [_]Entry{
     .{ .cvtsi2ss, .rm, &.{ .xmm, .rm32 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .none, .sse },
     .{ .cvtsi2ss, .rm, &.{ .xmm, .rm64 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .long, .sse },
 
+    .{ .divps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5e }, 0, .none, .sse },
+
     .{ .divss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5e }, 0, .none, .sse },
 
+    .{ .maxps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5f }, 0, .none, .sse },
+
     .{ .maxss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5f }, 0, .none, .sse },
 
+    .{ .minps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5d }, 0, .none, .sse },
+
     .{ .minss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5d }, 0, .none, .sse },
 
     .{ .movaps, .rm, &.{ .xmm,      .xmm_m128 }, &.{ 0x0f, 0x28 }, 0, .none, .sse },
@@ -863,10 +871,14 @@ pub const table = [_]Entry{
     .{ .movups, .rm, &.{ .xmm,      .xmm_m128 }, &.{ 0x0f, 0x10 }, 0, .none, .sse },
     .{ .movups, .mr, &.{ .xmm_m128, .xmm      }, &.{ 0x0f, 0x11 }, 0, .none, .sse },
 
+    .{ .mulps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x59 }, 0, .none, .sse },
+
     .{ .mulss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x59 }, 0, .none, .sse },
 
     .{ .orps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x56 }, 0, .none, .sse },
 
+    .{ .subps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5c }, 0, .none, .sse },
+
     .{ .subss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .none, .sse },
 
     .{ .sqrtps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x51 }, 0, .none, .sse },
@@ -878,6 +890,8 @@ pub const table = [_]Entry{
     .{ .xorps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x57 }, 0, .none, .sse },
 
     // SSE2
+    .{ .addpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x58 }, 0, .none, .sse2 },
+
     .{ .addsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x58 }, 0, .none, .sse2 },
 
     .{ .andnpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x55 }, 0, .none, .sse2 },
@@ -893,10 +907,16 @@ pub const table = [_]Entry{
 
     .{ .cvtss2sd, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5a }, 0, .none, .sse2 },
 
+    .{ .divpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5e }, 0, .none, .sse2 },
+
     .{ .divsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5e }, 0, .none, .sse2 },
 
+    .{ .maxpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5f }, 0, .none, .sse2 },
+
     .{ .maxsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5f }, 0, .none, .sse2 },
 
+    .{ .minpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5d }, 0, .none, .sse2 },
+
     .{ .minsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5d }, 0, .none, .sse2 },
 
     .{ .movapd, .rm, &.{ .xmm,      .xmm_m128 }, &.{ 0x66, 0x0f, 0x28 }, 0, .none, .sse2 },
@@ -914,6 +934,8 @@ pub const table = [_]Entry{
     .{ .movupd, .rm, &.{ .xmm,      .xmm_m128 }, &.{ 0x66, 0x0f, 0x10 }, 0, .none, .sse2 },
     .{ .movupd, .mr, &.{ .xmm_m128, .xmm      }, &.{ 0x66, 0x0f, 0x11 }, 0, .none, .sse2 },
 
+    .{ .mulpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x59 }, 0, .none, .sse2 },
+
     .{ .mulsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x59 }, 0, .none, .sse2 },
 
     .{ .orpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x56 }, 0, .none, .sse2 },
@@ -947,6 +969,8 @@ pub const table = [_]Entry{
 
     .{ .sqrtsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x51 }, 0, .none, .sse2 },
 
+    .{ .subpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5c }, 0, .none, .sse2 },
+
     .{ .subsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5c }, 0, .none, .sse2 },
 
     .{ .movsd, .rm, &.{ .xmm,     .xmm_m64 }, &.{ 0xf2, 0x0f, 0x10 }, 0, .none, .sse2 },
@@ -966,10 +990,25 @@ pub const table = [_]Entry{
     // SSE4.1
     .{ .pextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .none, .sse4_1 },
 
-    .{ .roundss, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0a }, 0, .none, .sse4_1 },
+    .{ .roundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .none, .sse4_1 },
+
+    .{ .roundps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x08 }, 0, .none, .sse4_1 },
+
     .{ .roundsd, .rmi, &.{ .xmm, .xmm_m64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0b }, 0, .none, .sse4_1 },
 
+    .{ .roundss, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0a }, 0, .none, .sse4_1 },
+
     // AVX
+    .{ .vaddpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x58 }, 0, .vex_128_wig, .avx },
+    .{ .vaddpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x58 }, 0, .vex_256_wig, .avx },
+
+    .{ .vaddps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x58 }, 0, .vex_128_wig, .avx },
+    .{ .vaddps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x58 }, 0, .vex_256_wig, .avx },
+
+    .{ .vaddsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x58 }, 0, .vex_lig_wig, .avx },
+
+    .{ .vaddss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x58 }, 0, .vex_lig_wig, .avx },
+
     .{ .vcvtsd2ss, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5a }, 0, .vex_lig_wig, .avx },
 
     .{ .vcvtsi2sd, .rvm, &.{ .xmm, .xmm, .rm32 }, &.{ 0xf2, 0x0f, 0x2a }, 0, .vex_lig_w0, .avx },
@@ -980,6 +1019,36 @@ pub const table = [_]Entry{
 
     .{ .vcvtss2sd, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5a }, 0, .vex_lig_wig, .avx }, // F3 prefix (single -> double); F2 is vcvtsd2ss
 
+    .{ .vdivpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5e }, 0, .vex_128_wig, .avx },
+    .{ .vdivpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5e }, 0, .vex_256_wig, .avx },
+
+    .{ .vdivps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x5e }, 0, .vex_128_wig, .avx },
+    .{ .vdivps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x5e }, 0, .vex_256_wig, .avx },
+
+    .{ .vdivsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5e }, 0, .vex_lig_wig, .avx },
+
+    .{ .vdivss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5e }, 0, .vex_lig_wig, .avx },
+
+    .{ .vmaxpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5f }, 0, .vex_128_wig, .avx },
+    .{ .vmaxpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5f }, 0, .vex_256_wig, .avx },
+
+    .{ .vmaxps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x5f }, 0, .vex_128_wig, .avx },
+    .{ .vmaxps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x5f }, 0, .vex_256_wig, .avx },
+
+    .{ .vmaxsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5f }, 0, .vex_lig_wig, .avx },
+
+    .{ .vmaxss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5f }, 0, .vex_lig_wig, .avx },
+
+    .{ .vminpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5d }, 0, .vex_128_wig, .avx },
+    .{ .vminpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5d }, 0, .vex_256_wig, .avx },
+
+    .{ .vminps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x5d }, 0, .vex_128_wig, .avx },
+    .{ .vminps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x5d }, 0, .vex_256_wig, .avx },
+
+    .{ .vminsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5d }, 0, .vex_lig_wig, .avx },
+
+    .{ .vminss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5d }, 0, .vex_lig_wig, .avx },
+
     .{ .vmovapd, .rm, &.{ .xmm,      .xmm_m128 }, &.{ 0x66, 0x0f, 0x28 }, 0, .vex_128_wig, .avx },
     .{ .vmovapd, .mr, &.{ .xmm_m128, .xmm      }, &.{ 0x66, 0x0f, 0x29 }, 0, .vex_128_wig, .avx },
     .{ .vmovapd, .rm, &.{ .ymm,      .ymm_m256 }, &.{ 0x66, 0x0f, 0x28 }, 0, .vex_256_wig, .avx },
@@ -1019,6 +1088,16 @@ pub const table = [_]Entry{
     .{ .vmovups, .rm, &.{ .ymm,      .ymm_m256 }, &.{ 0x0f, 0x10 }, 0, .vex_256_wig, .avx },
     .{ .vmovups, .mr, &.{ .ymm_m256, .ymm      }, &.{ 0x0f, 0x11 }, 0, .vex_256_wig, .avx },
 
+    .{ .vmulpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x59 }, 0, .vex_128_wig, .avx },
+    .{ .vmulpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x59 }, 0, .vex_256_wig, .avx },
+
+    .{ .vmulps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x59 }, 0, .vex_128_wig, .avx },
+    .{ .vmulps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x59 }, 0, .vex_256_wig, .avx },
+
+    .{ .vmulsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x59 }, 0, .vex_lig_wig, .avx },
+
+    .{ .vmulss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x59 }, 0, .vex_lig_wig, .avx },
+
     .{ .vpextrw, .rmi, &.{ .r32,     .xmm, .imm8 }, &.{ 0x66, 0x0f,       0xc5 }, 0, .vex_128_wig, .avx }, // register-destination form is 66 0F C5 /r ib; 0F 3A 15 is the r/m16 form below
     .{ .vpextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .vex_128_wig, .avx },
 
@@ -1041,6 +1120,16 @@ pub const table = [_]Entry{
     .{ .vpunpckldq,  .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x62 }, 0, .vex_128_wig, .avx },
     .{ .vpunpcklqdq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6c }, 0, .vex_128_wig, .avx },
 
+    .{ .vroundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .vex_128_wig, .avx },
+    .{ .vroundpd, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .vex_256_wig, .avx },
+
+    .{ .vroundps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x08 }, 0, .vex_128_wig, .avx },
+    .{ .vroundps, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x08 }, 0, .vex_256_wig, .avx },
+
+    .{ .vroundsd, .rvmi, &.{ .xmm, .xmm, .xmm_m64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0b }, 0, .vex_lig_wig, .avx },
+
+    .{ .vroundss, .rvmi, &.{ .xmm, .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0a }, 0, .vex_lig_wig, .avx },
+
     .{ .vsqrtpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x51 }, 0, .vex_128_wig, .avx },
     .{ .vsqrtpd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x51 }, 0, .vex_256_wig, .avx },
 
@@ -1051,6 +1140,16 @@ pub const table = [_]Entry{
 
     .{ .vsqrtss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x51 }, 0, .vex_lig_wig, .avx },
 
+    .{ .vsubpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5c }, 0, .vex_128_wig, .avx },
+    .{ .vsubpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5c }, 0, .vex_256_wig, .avx },
+
+    .{ .vsubps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x5c }, 0, .vex_128_wig, .avx },
+    .{ .vsubps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x5c }, 0, .vex_256_wig, .avx },
+
+    .{ .vsubsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5c }, 0, .vex_lig_wig, .avx },
+
+    .{ .vsubss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .vex_lig_wig, .avx },
+
     // F16C
     .{ .vcvtph2ps, .rm, &.{ .xmm, .xmm_m64  }, &.{ 0x66, 0x0f, 0x38, 0x13 }, 0, .vex_128_w0, .f16c },
     .{ .vcvtph2ps, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x13 }, 0, .vex_256_w0, .f16c },
src/arch/x86_64/Lower.zig
@@ -124,27 +124,34 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
         .xchg,
         .xor,
 
+        .addps,
         .addss,
         .andnps,
         .andps,
         .cmpss,
         .cvtsi2ss,
+        .divps,
         .divss,
+        .maxps,
         .maxss,
+        .minps,
         .minss,
         .movaps,
         .movss,
         .movups,
+        .mulps,
         .mulss,
         .orps,
         .pextrw,
         .pinsrw,
         .sqrtps,
         .sqrtss,
+        .subps,
         .subss,
         .ucomiss,
         .xorps,
 
+        .addpd,
         .addsd,
         .andnpd,
         .andpd,
@@ -152,10 +159,14 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
         .cvtsd2ss,
         .cvtsi2sd,
         .cvtss2sd,
+        .divpd,
         .divsd,
+        .maxpd,
         .maxsd,
+        .minpd,
         .minsd,
         .movsd,
+        .mulpd,
         .mulsd,
         .orpd,
         .pshufhw,
@@ -173,6 +184,7 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
         .punpcklwd,
         .sqrtpd,
         .sqrtsd,
+        .subpd,
         .subsd,
         .ucomisd,
         .xorpd,
@@ -181,13 +193,31 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
         .movshdup,
         .movsldup,
 
+        .roundpd,
+        .roundps,
         .roundsd,
         .roundss,
 
+        .vaddpd,
+        .vaddps,
+        .vaddsd,
+        .vaddss,
         .vcvtsd2ss,
         .vcvtsi2sd,
         .vcvtsi2ss,
         .vcvtss2sd,
+        .vdivpd,
+        .vdivps,
+        .vdivsd,
+        .vdivss,
+        .vmaxpd,
+        .vmaxps,
+        .vmaxsd,
+        .vmaxss,
+        .vminpd,
+        .vminps,
+        .vminsd,
+        .vminss,
         .vmovapd,
         .vmovaps,
         .vmovddup,
@@ -197,6 +227,10 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
         .vmovss,
         .vmovupd,
         .vmovups,
+        .vmulpd,
+        .vmulps,
+        .vmulsd,
+        .vmulss,
         .vpextrw,
         .vpinsrw,
         .vpshufhw,
@@ -212,10 +246,18 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
         .vpunpckldq,
         .vpunpcklqdq,
         .vpunpcklwd,
+        .vroundpd,
+        .vroundps,
+        .vroundsd,
+        .vroundss,
         .vsqrtpd,
         .vsqrtps,
         .vsqrtsd,
         .vsqrtss,
+        .vsubpd,
+        .vsubps,
+        .vsubsd,
+        .vsubss,
 
         .vcvtph2ps,
         .vcvtps2ph,
@@ -304,6 +346,7 @@ fn imm(lower: Lower, ops: Mir.Inst.Ops, i: u32) Immediate {
         .lock_mi_rip_s,
         => Immediate.s(@bitCast(i32, i)),
 
+        .rrri,
         .rri_u,
         .ri_u,
         .i_u,
@@ -429,6 +472,12 @@ fn mirGeneric(lower: *Lower, inst: Mir.Inst) Error!void {
             .{ .reg = inst.data.rrr.r2 },
             .{ .reg = inst.data.rrr.r3 },
         },
+        .rrri => &.{
+            .{ .reg = inst.data.rrri.r1 },
+            .{ .reg = inst.data.rrri.r2 },
+            .{ .reg = inst.data.rrri.r3 },
+            .{ .imm = lower.imm(inst.ops, inst.data.rrri.i) },
+        },
         .ri_s, .ri_u => &.{
             .{ .reg = inst.data.ri.r },
             .{ .imm = lower.imm(inst.ops, inst.data.ri.i) },
src/arch/x86_64/Mir.zig
@@ -166,7 +166,9 @@ pub const Inst = struct {
         /// Logical exclusive-or
         xor,
 
-        /// Add single precision floating point values
+        /// Add packed single-precision floating-point values
+        addps,
+        /// Add scalar single-precision floating-point values
         addss,
         /// Bitwise logical and of packed single precision floating-point values
         andps,
@@ -176,11 +178,17 @@ pub const Inst = struct {
         cmpss,
         /// Convert doubleword integer to scalar single-precision floating-point value
         cvtsi2ss,
+        /// Divide packed single-precision floating-point values
+        divps,
         /// Divide scalar single-precision floating-point values
         divss,
-        /// Return maximum single-precision floating-point value
+        /// Maximum of packed single-precision floating-point values
+        maxps,
+        /// Maximum of scalar single-precision floating-point values
         maxss,
-        /// Return minimum single-precision floating-point value
+        /// Minimum of packed single-precision floating-point values
+        minps,
+        /// Minimum of scalar single-precision floating-point values
         minss,
         /// Move aligned packed single-precision floating-point values
         movaps,
@@ -188,6 +196,8 @@ pub const Inst = struct {
         movss,
         /// Move unaligned packed single-precision floating-point values
         movups,
+        /// Multiply packed single-precision floating-point values
+        mulps,
         /// Multiply scalar single-precision floating-point values
         mulss,
         /// Bitwise logical or of packed single precision floating-point values
@@ -196,18 +206,22 @@ pub const Inst = struct {
         pextrw,
         /// Insert word
         pinsrw,
-        /// Square root of scalar single precision floating-point value
+        /// Square root of packed single-precision floating-point values
         sqrtps,
-        /// Subtract scalar single-precision floating-point values
+        /// Square root of scalar single-precision floating-point value
         sqrtss,
-        /// Square root of single precision floating-point values
+        /// Subtract packed single-precision floating-point values
+        subps,
+        /// Subtract scalar single-precision floating-point values
         subss,
         /// Unordered compare scalar single-precision floating-point values
         ucomiss,
         /// Bitwise logical xor of packed single precision floating-point values
         xorps,
 
-        /// Add double precision floating point values
+        /// Add packed double-precision floating-point values
+        addpd,
+        /// Add scalar double-precision floating-point values
         addsd,
         /// Bitwise logical and not of packed double precision floating-point values
         andnpd,
@@ -221,14 +235,22 @@ pub const Inst = struct {
         cvtsi2sd,
         /// Convert scalar single-precision floating-point value to scalar double-precision floating-point value
         cvtss2sd,
+        /// Divide packed double-precision floating-point values
+        divpd,
         /// Divide scalar double-precision floating-point values
         divsd,
-        /// Return maximum double-precision floating-point value
+        /// Maximum of packed double-precision floating-point values
+        maxpd,
+        /// Maximum of scalar double-precision floating-point values
         maxsd,
-        /// Return minimum double-precision floating-point value
+        /// Minimum of packed double-precision floating-point values
+        minpd,
+        /// Minimum of scalar double-precision floating-point values
         minsd,
         /// Move scalar double-precision floating-point value
         movsd,
+        /// Multiply packed double-precision floating-point values
+        mulpd,
         /// Multiply scalar double-precision floating-point values
         mulsd,
         /// Bitwise logical or of packed double precision floating-point values
@@ -263,6 +285,8 @@ pub const Inst = struct {
         sqrtpd,
         /// Square root of scalar double precision floating-point value
         sqrtsd,
+        /// Subtract packed double-precision floating-point values
+        subpd,
         /// Subtract scalar double-precision floating-point values
         subsd,
         /// Unordered compare scalar double-precision floating-point values
@@ -277,11 +301,23 @@ pub const Inst = struct {
         /// Replicate single floating-point values
         movsldup,
 
-        /// Round scalar double-precision floating-point values
+        /// Round packed double-precision floating-point values
+        roundpd,
+        /// Round packed single-precision floating-point values
+        roundps,
+        /// Round scalar double-precision floating-point value
         roundsd,
-        /// Round scalar single-precision floating-point values
+        /// Round scalar single-precision floating-point value
         roundss,
 
+        /// Add packed double-precision floating-point values
+        vaddpd,
+        /// Add packed single-precision floating-point values
+        vaddps,
+        /// Add scalar double-precision floating-point values
+        vaddsd,
+        /// Add scalar single-precision floating-point values
+        vaddss,
         /// Convert scalar double-precision floating-point value to scalar single-precision floating-point value
         vcvtsd2ss,
         /// Convert doubleword integer to scalar double-precision floating-point value
@@ -290,6 +326,30 @@ pub const Inst = struct {
         vcvtsi2ss,
         /// Convert scalar single-precision floating-point value to scalar double-precision floating-point value
         vcvtss2sd,
+        /// Divide packed double-precision floating-point values
+        vdivpd,
+        /// Divide packed single-precision floating-point values
+        vdivps,
+        /// Divide scalar double-precision floating-point values
+        vdivsd,
+        /// Divide scalar single-precision floating-point values
+        vdivss,
+        /// Maximum of packed double-precision floating-point values
+        vmaxpd,
+        /// Maximum of packed single-precision floating-point values
+        vmaxps,
+        /// Maximum of scalar double-precision floating-point values
+        vmaxsd,
+        /// Maximum of scalar single-precision floating-point values
+        vmaxss,
+        /// Minimum of packed double-precision floating-point values
+        vminpd,
+        /// Minimum of packed single-precision floating-point values
+        vminps,
+        /// Minimum of scalar double-precision floating-point values
+        vminsd,
+        /// Minimum of scalar single-precision floating-point values
+        vminss,
         /// Move aligned packed double-precision floating-point values
         vmovapd,
         /// Move aligned packed single-precision floating-point values
@@ -308,6 +368,14 @@ pub const Inst = struct {
         vmovupd,
         /// Move unaligned packed single-precision floating-point values
         vmovups,
+        /// Multiply packed double-precision floating-point values
+        vmulpd,
+        /// Multiply packed single-precision floating-point values
+        vmulps,
+        /// Multiply scalar double-precision floating-point values
+        vmulsd,
+        /// Multiply scalar single-precision floating-point values
+        vmulss,
         /// Extract word
         vpextrw,
         /// Insert word
@@ -338,6 +406,14 @@ pub const Inst = struct {
         vpunpcklqdq,
         /// Unpack low data
         vpunpcklwd,
+        /// Round packed double-precision floating-point values
+        vroundpd,
+        /// Round packed single-precision floating-point values
+        vroundps,
+        /// Round scalar double-precision floating-point value
+        vroundsd,
+        /// Round scalar single-precision floating-point value
+        vroundss,
         /// Square root of packed double-precision floating-point value
         vsqrtpd,
         /// Square root of packed single-precision floating-point value
@@ -346,6 +422,14 @@ pub const Inst = struct {
         vsqrtsd,
         /// Square root of scalar single-precision floating-point value
         vsqrtss,
+        /// Subtract packed double-precision floating-point values
+        vsubpd,
+        /// Subtract packed single-precision floating-point values
+        vsubps,
+        /// Subtract scalar double-precision floating-point values
+        vsubsd,
+        /// Subtract scalar single-precision floating-point values
+        vsubss,
 
         /// Convert 16-bit floating-point values to single-precision floating-point values
         vcvtph2ps,
@@ -442,6 +526,9 @@ pub const Inst = struct {
         /// Register, register, register operands.
         /// Uses `rrr` payload.
         rrr,
+        /// Register, register, register, immediate (byte) operands.
+        /// Uses `rrri` payload.
+        rrri,
         /// Register, register, immediate (sign-extended) operands.
         /// Uses `rri`  payload.
         rri_s,
@@ -625,6 +712,12 @@ pub const Inst = struct {
             r2: Register,
             r3: Register,
         },
+        rrri: struct {
+            r1: Register,
+            r2: Register,
+            r3: Register,
+            i: u8,
+        },
         rri: struct {
             r1: Register,
             r2: Register,