Commit 274654d73e

Jakub Konka <kubkon@jakubkonka.com>
2022-05-20 13:00:59
x64: implement matching SSE instructions for generic cross-comp target
1 parent 0e43d00
Changed files (4)
src/arch/x86_64/bits.zig
@@ -441,6 +441,17 @@ pub const Encoder = struct {
         self.code.appendAssumeCapacity(opcode);
     }
 
+    /// Encodes a 3 byte opcode by appending `prefix_1`, `prefix_2` and `opcode`
+    /// to the code buffer, in that order.
+    ///
+    /// e.g. MOVSD has the opcode 0xf2 0x0f 0x10:
+    /// encoder.opcode_3byte(0xf2, 0x0f, 0x10);
+    pub fn opcode_3byte(self: Self, prefix_1: u8, prefix_2: u8, opcode: u8) void {
+        self.code.appendAssumeCapacity(prefix_1); // NOTE(review): presumably the caller reserved capacity beforehand — confirm
+        self.code.appendAssumeCapacity(prefix_2);
+        self.code.appendAssumeCapacity(opcode);
+    }
+
     /// Encodes a 1 byte opcode with a reg field
     ///
     /// Remember to add a REX prefix byte if reg is extended!
src/arch/x86_64/CodeGen.zig
@@ -881,7 +881,7 @@ fn allocRegOrMem(self: *Self, inst: Air.Inst.Index, reg_ok: bool) !MCValue {
         switch (elem_ty.zigTypeTag()) {
             .Vector => return self.fail("TODO allocRegOrMem for Vector type", .{}),
             .Float => {
-                if (self.intrinsicsAllowed(elem_ty)) {
+                if (intrinsicsAllowed(self.target.*, elem_ty)) {
                     const ptr_bytes: u64 = 32;
                     if (abi_size <= ptr_bytes) {
                         if (self.register_manager.tryAllocReg(inst, sse)) |reg| {
@@ -970,7 +970,7 @@ pub fn spillRegisters(self: *Self, comptime count: comptime_int, registers: [cou
 fn copyToTmpRegister(self: *Self, ty: Type, mcv: MCValue) !Register {
     const reg_class: RegisterManager.RegisterBitSet = switch (ty.zigTypeTag()) {
         .Float => blk: {
-            if (self.intrinsicsAllowed(ty)) break :blk sse;
+            if (intrinsicsAllowed(self.target.*, ty)) break :blk sse;
             return self.fail("TODO copy {} to register", .{ty.fmtDebug()});
         },
         else => gp,
@@ -987,7 +987,7 @@ fn copyToTmpRegister(self: *Self, ty: Type, mcv: MCValue) !Register {
 fn copyToRegisterWithInstTracking(self: *Self, reg_owner: Air.Inst.Index, ty: Type, mcv: MCValue) !MCValue {
     const reg_class: RegisterManager.RegisterBitSet = switch (ty.zigTypeTag()) {
         .Float => blk: {
-            if (self.intrinsicsAllowed(ty)) break :blk sse;
+            if (intrinsicsAllowed(self.target.*, ty)) break :blk sse;
             return self.fail("TODO copy {} to register", .{ty.fmtDebug()});
         },
         else => gp,
@@ -3462,16 +3462,28 @@ fn genBinOpMir(self: *Self, mir_tag: Mir.Inst.Tag, dst_ty: Type, dst_mcv: MCValu
                 },
                 .register => |src_reg| switch (dst_ty.zigTypeTag()) {
                     .Float => {
-                        if (self.intrinsicsAllowed(dst_ty)) {
+                        if (intrinsicsAllowed(self.target.*, dst_ty)) {
                             const actual_tag: Mir.Inst.Tag = switch (dst_ty.tag()) {
                                 .f32 => switch (mir_tag) {
-                                    .add => Mir.Inst.Tag.add_f32_avx,
-                                    .cmp => Mir.Inst.Tag.cmp_f32_avx,
+                                    .add => if (hasAvxSupport(self.target.*))
+                                        Mir.Inst.Tag.add_f32_avx
+                                    else
+                                        Mir.Inst.Tag.add_f32_sse,
+                                    .cmp => if (hasAvxSupport(self.target.*))
+                                        Mir.Inst.Tag.cmp_f32_avx
+                                    else
+                                        Mir.Inst.Tag.cmp_f32_sse,
                                     else => return self.fail("TODO genBinOpMir for f32 register-register with MIR tag {}", .{mir_tag}),
                                 },
                                 .f64 => switch (mir_tag) {
-                                    .add => Mir.Inst.Tag.add_f64_avx,
-                                    .cmp => Mir.Inst.Tag.cmp_f64_avx,
+                                    .add => if (hasAvxSupport(self.target.*))
+                                        Mir.Inst.Tag.add_f64_avx
+                                    else
+                                        Mir.Inst.Tag.add_f64_sse,
+                                    .cmp => if (hasAvxSupport(self.target.*))
+                                        Mir.Inst.Tag.cmp_f64_avx
+                                    else
+                                        Mir.Inst.Tag.cmp_f64_sse,
                                     else => return self.fail("TODO genBinOpMir for f64 register-register with MIR tag {}", .{mir_tag}),
                                 },
                                 else => return self.fail("TODO genBinOpMir for float register-register and type {}", .{dst_ty.fmtDebug()}),
@@ -5324,10 +5336,16 @@ fn genSetStackArg(self: *Self, ty: Type, stack_offset: i32, mcv: MCValue) InnerE
         .register => |reg| {
             switch (ty.zigTypeTag()) {
                 .Float => {
-                    if (self.intrinsicsAllowed(ty)) {
+                    if (intrinsicsAllowed(self.target.*, ty)) {
                         const tag: Mir.Inst.Tag = switch (ty.tag()) {
-                            .f32 => .mov_f32_avx,
-                            .f64 => .mov_f64_avx,
+                            .f32 => if (hasAvxSupport(self.target.*))
+                                Mir.Inst.Tag.mov_f32_avx
+                            else
+                                Mir.Inst.Tag.mov_f32_sse,
+                            .f64 => if (hasAvxSupport(self.target.*))
+                                Mir.Inst.Tag.mov_f64_avx
+                            else
+                                Mir.Inst.Tag.mov_f64_sse,
                             else => return self.fail("TODO genSetStackArg for register for type {}", .{ty.fmtDebug()}),
                         };
                         _ = try self.addInst(.{
@@ -5508,10 +5526,16 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: i32, mcv: MCValue, opts: Inl
 
             switch (ty.zigTypeTag()) {
                 .Float => {
-                    if (self.intrinsicsAllowed(ty)) {
+                    if (intrinsicsAllowed(self.target.*, ty)) {
                         const tag: Mir.Inst.Tag = switch (ty.tag()) {
-                            .f32 => .mov_f32_avx,
-                            .f64 => .mov_f64_avx,
+                            .f32 => if (hasAvxSupport(self.target.*))
+                                Mir.Inst.Tag.mov_f32_avx
+                            else
+                                Mir.Inst.Tag.mov_f32_sse,
+                            .f64 => if (hasAvxSupport(self.target.*))
+                                Mir.Inst.Tag.mov_f64_avx
+                            else
+                                Mir.Inst.Tag.mov_f64_sse,
                             else => return self.fail("TODO genSetStack for register for type {}", .{ty.fmtDebug()}),
                         };
                         _ = try self.addInst(.{
@@ -6032,10 +6056,16 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
                     },
                 },
                 .Float => {
-                    if (self.intrinsicsAllowed(ty)) {
+                    if (intrinsicsAllowed(self.target.*, ty)) {
                         const tag: Mir.Inst.Tag = switch (ty.tag()) {
-                            .f32 => .mov_f32_avx,
-                            .f64 => .mov_f64_avx,
+                            .f32 => if (hasAvxSupport(self.target.*))
+                                Mir.Inst.Tag.mov_f32_avx
+                            else
+                                Mir.Inst.Tag.mov_f32_sse,
+                            .f64 => if (hasAvxSupport(self.target.*))
+                                Mir.Inst.Tag.mov_f64_avx
+                            else
+                                Mir.Inst.Tag.mov_f64_sse,
                             else => return self.fail("TODO genSetReg from register for {}", .{ty.fmtDebug()}),
                         };
                         _ = try self.addInst(.{
@@ -6072,10 +6102,16 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
                     const base_reg = try self.register_manager.allocReg(null, gp);
                     try self.loadMemPtrIntoRegister(base_reg, Type.usize, mcv);
 
-                    if (self.intrinsicsAllowed(ty)) {
+                    if (intrinsicsAllowed(self.target.*, ty)) {
                         const tag: Mir.Inst.Tag = switch (ty.tag()) {
-                            .f32 => .mov_f32_avx,
-                            .f64 => .mov_f64_avx,
+                            .f32 => if (hasAvxSupport(self.target.*))
+                                Mir.Inst.Tag.mov_f32_avx
+                            else
+                                Mir.Inst.Tag.mov_f32_sse,
+                            .f64 => if (hasAvxSupport(self.target.*))
+                                Mir.Inst.Tag.mov_f64_avx
+                            else
+                                Mir.Inst.Tag.mov_f64_sse,
                             else => return self.fail("TODO genSetReg from memory for {}", .{ty.fmtDebug()}),
                         };
 
@@ -6115,10 +6151,16 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
                 const base_reg = try self.register_manager.allocReg(null, gp);
                 try self.loadMemPtrIntoRegister(base_reg, Type.usize, mcv);
 
-                if (self.intrinsicsAllowed(ty)) {
+                if (intrinsicsAllowed(self.target.*, ty)) {
                     const tag: Mir.Inst.Tag = switch (ty.tag()) {
-                        .f32 => .mov_f32_avx,
-                        .f64 => .mov_f64_avx,
+                        .f32 => if (hasAvxSupport(self.target.*))
+                            Mir.Inst.Tag.mov_f32_avx
+                        else
+                            Mir.Inst.Tag.mov_f32_sse,
+                        .f64 => if (hasAvxSupport(self.target.*))
+                            Mir.Inst.Tag.mov_f64_avx
+                        else
+                            Mir.Inst.Tag.mov_f64_sse,
                         else => return self.fail("TODO genSetReg from memory for {}", .{ty.fmtDebug()}),
                     };
 
@@ -6230,10 +6272,16 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
                     },
                 },
                 .Float => {
-                    if (self.intrinsicsAllowed(ty)) {
+                    if (intrinsicsAllowed(self.target.*, ty)) {
                         const tag: Mir.Inst.Tag = switch (ty.tag()) {
-                            .f32 => .mov_f32_avx,
-                            .f64 => .mov_f64_avx,
+                            .f32 => if (hasAvxSupport(self.target.*))
+                                Mir.Inst.Tag.mov_f32_avx
+                            else
+                                Mir.Inst.Tag.mov_f32_sse,
+                            .f64 => if (hasAvxSupport(self.target.*))
+                                Mir.Inst.Tag.mov_f64_avx
+                            else
+                                Mir.Inst.Tag.mov_f64_sse,
                             else => return self.fail("TODO genSetReg from stack offset for {}", .{ty.fmtDebug()}),
                         };
                         _ = try self.addInst(.{
@@ -7046,11 +7094,15 @@ fn truncateRegister(self: *Self, ty: Type, reg: Register) !void {
     }
 }
 
-fn intrinsicsAllowed(self: *Self, ty: Type) bool {
+fn intrinsicsAllowed(target: Target, ty: Type) bool { // takes the target by value instead of reading it from `self`
     return switch (ty.tag()) {
         .f32,
         .f64,
-        => Target.x86.featureSetHasAny(self.target.cpu.features, .{ .avx, .avx2 }),
+        => Target.x86.featureSetHasAny(target.cpu.features, .{ .sse2, .avx, .avx2 }), // f32/f64 qualify when the target has any of SSE2, AVX or AVX2
         else => unreachable, // TODO finish this off
     };
 }
+
+fn hasAvxSupport(target: Target) bool { // reports whether the target CPU feature set includes AVX or AVX2
+    return Target.x86.featureSetHasAny(target.cpu.features, .{ .avx, .avx2 });
+}
src/arch/x86_64/Emit.zig
@@ -182,6 +182,16 @@ pub fn lowerMir(emit: *Emit) InnerError!void {
             .interrupt => try emit.mirInterrupt(inst),
             .nop => try emit.mirNop(),
 
+            // SSE instructions
+            .mov_f64_sse => try emit.mirMovFloatSse(.movsd, inst),
+            .mov_f32_sse => try emit.mirMovFloatSse(.movss, inst),
+
+            .add_f64_sse => try emit.mirAddFloatSse(.addsd, inst),
+            .add_f32_sse => try emit.mirAddFloatSse(.addss, inst),
+
+            .cmp_f64_sse => try emit.mirCmpFloatSse(.ucomisd, inst),
+            .cmp_f32_sse => try emit.mirCmpFloatSse(.ucomiss, inst),
+
             // AVX instructions
             .mov_f64_avx => try emit.mirMovFloatAvx(.vmovsd, inst),
             .mov_f32_avx => try emit.mirMovFloatAvx(.vmovss, inst),
@@ -536,6 +546,7 @@ fn mirArithMemImm(emit: *Emit, tag: Tag, inst: Mir.Inst.Index) InnerError!void {
 }
 
 inline fn setRexWRegister(reg: Register) bool {
+    if (reg.size() > 64) return false;
     if (reg.size() == 64) return true;
     return switch (reg) {
         .ah, .ch, .dh, .bh => true,
@@ -963,11 +974,55 @@ fn mirLeaPie(emit: *Emit, inst: Mir.Inst.Index) InnerError!void {
     }
 }
 
+// SSE instructions
+
+fn mirMovFloatSse(emit: *Emit, tag: Tag, inst: Mir.Inst.Index) InnerError!void { // lowers an SSE float move; `ops.flags` selects the operand form
+    const ops = emit.mir.instructions.items(.ops)[inst].decode();
+    switch (ops.flags) {
+        0b00 => { // RM encoding: register reg1 paired with memory operand [reg2 + imm]
+            const imm = emit.mir.instructions.items(.data)[inst].imm;
+            return lowerToRmEnc(tag, ops.reg1, RegisterOrMemory.mem(Memory.PtrSize.new(ops.reg2.size()), .{ // pointer size is derived from the base register's size
+                .disp = imm,
+                .base = ops.reg2,
+            }), emit.code);
+        },
+        0b01 => { // MR encoding: memory operand [reg1 + imm] paired with register reg2
+            const imm = emit.mir.instructions.items(.data)[inst].imm;
+            return lowerToMrEnc(tag, RegisterOrMemory.mem(Memory.PtrSize.new(ops.reg1.size()), .{ // pointer size is derived from the base register's size
+                .disp = imm,
+                .base = ops.reg1,
+            }), ops.reg2, emit.code);
+        },
+        0b10 => { // RM encoding, register-register form: reg1 and reg2
+            return lowerToRmEnc(tag, ops.reg1, RegisterOrMemory.reg(ops.reg2), emit.code);
+        },
+        else => return emit.fail("TODO unused variant 0b{b} for {}", .{ ops.flags, tag }), // any other flags value is reported through emit.fail
+    }
+}
+
+fn mirAddFloatSse(emit: *Emit, tag: Tag, inst: Mir.Inst.Index) InnerError!void { // lowers an SSE float add; only the register-register form is handled
+    const ops = emit.mir.instructions.items(.ops)[inst].decode();
+    switch (ops.flags) {
+        0b00 => { // RM encoding with both operands in registers: reg1 and reg2
+            return lowerToRmEnc(tag, ops.reg1, RegisterOrMemory.reg(ops.reg2), emit.code);
+        },
+        else => return emit.fail("TODO unused variant 0b{b} for {}", .{ ops.flags, tag }), // any other flags value is reported through emit.fail
+    }
+}
+
+fn mirCmpFloatSse(emit: *Emit, tag: Tag, inst: Mir.Inst.Index) InnerError!void { // lowers an SSE float compare; only the register-register form is handled
+    const ops = emit.mir.instructions.items(.ops)[inst].decode();
+    switch (ops.flags) {
+        0b00 => { // RM encoding with both operands in registers: reg1 and reg2
+            return lowerToRmEnc(tag, ops.reg1, RegisterOrMemory.reg(ops.reg2), emit.code);
+        },
+        else => return emit.fail("TODO unused variant 0b{b} for {}", .{ ops.flags, tag }), // any other flags value is reported through emit.fail
+    }
+}
 // AVX instructions
 
 fn mirMovFloatAvx(emit: *Emit, tag: Tag, inst: Mir.Inst.Index) InnerError!void {
     const ops = emit.mir.instructions.items(.ops)[inst].decode();
-
     switch (ops.flags) {
         0b00 => {
             const imm = emit.mir.instructions.items(.data)[inst].imm;
@@ -986,24 +1041,22 @@ fn mirMovFloatAvx(emit: *Emit, tag: Tag, inst: Mir.Inst.Index) InnerError!void {
         0b10 => {
             return lowerToRvmEnc(tag, ops.reg1, ops.reg1, RegisterOrMemory.reg(ops.reg2), emit.code);
         },
-        else => return emit.fail("TODO unused variant 0b{b} for mov_f64", .{ops.flags}),
+        else => return emit.fail("TODO unused variant 0b{b} for {}", .{ ops.flags, tag }),
     }
 }
 
 fn mirAddFloatAvx(emit: *Emit, tag: Tag, inst: Mir.Inst.Index) InnerError!void {
     const ops = emit.mir.instructions.items(.ops)[inst].decode();
-
     switch (ops.flags) {
         0b00 => {
             return lowerToRvmEnc(tag, ops.reg1, ops.reg1, RegisterOrMemory.reg(ops.reg2), emit.code);
         },
-        else => return emit.fail("TODO unused variant 0b{b} for mov_f64", .{ops.flags}),
+        else => return emit.fail("TODO unused variant 0b{b} for {}", .{ ops.flags, tag }),
     }
 }
 
 fn mirCmpFloatAvx(emit: *Emit, tag: Tag, inst: Mir.Inst.Index) InnerError!void {
     const ops = emit.mir.instructions.items(.ops)[inst].decode();
-
     switch (ops.flags) {
         0b00 => {
             return lowerToVmEnc(tag, ops.reg1, RegisterOrMemory.reg(ops.reg2), emit.code);
@@ -1247,6 +1300,14 @@ const Tag = enum {
     cmovng,
     cmovb,
     cmovnae,
+    movsd,
+    movss,
+    addsd,
+    addss,
+    cmpsd,
+    cmpss,
+    ucomisd,
+    ucomiss,
     vmovsd,
     vmovss,
     vaddsd,
@@ -1256,6 +1317,22 @@ const Tag = enum {
     vucomisd,
     vucomiss,
 
+    fn isSse(tag: Tag) bool { // returns true for the SSE tags listed below, false for every other tag
+        return switch (tag) {
+            .movsd,
+            .movss,
+            .addsd,
+            .addss,
+            .cmpsd, // listed here, although the compare lowering visible in this diff uses ucomisd/ucomiss
+            .cmpss,
+            .ucomisd,
+            .ucomiss,
+            => true,
+
+            else => false,
+        };
+    }
+
     fn isAvx(tag: Tag) bool {
         return switch (tag) {
             .vmovsd,
@@ -1369,190 +1446,256 @@ const Encoding = enum {
     rvmi,
 };
 
-const OpCode = union(enum) {
-    one_byte: u8,
-    two_byte: struct { _1: u8, _2: u8 },
-
-    fn oneByte(opc: u8) OpCode {
-        return .{ .one_byte = opc };
-    }
+const OpCode = struct { // fixed-capacity opcode: up to three bytes plus a count of how many are used
+    bytes: [3]u8, // opcode bytes; only the first `count` entries are written by init
+    count: usize, // number of meaningful entries in `bytes`
 
-    fn twoByte(opc1: u8, opc2: u8) OpCode {
-        return .{ .two_byte = .{ ._1 = opc1, ._2 = opc2 } };
+    fn init(comptime in_bytes: []const u8) OpCode { // builds an OpCode from a comptime-known byte slice
+        comptime assert(in_bytes.len <= 3); // at most three opcode bytes fit
+        comptime var bytes: [3]u8 = undefined; // entries past in_bytes.len stay undefined
+        inline for (in_bytes) |x, i| {
+            bytes[i] = x;
+        }
+        return .{ .bytes = bytes, .count = in_bytes.len };
     }
 
     fn encode(opc: OpCode, encoder: Encoder) void {
-        switch (opc) {
-            .one_byte => |v| encoder.opcode_1byte(v),
-            .two_byte => |v| encoder.opcode_2byte(v._1, v._2),
+        switch (opc.count) { // dispatch on byte count to the matching encoder helper
+            1 => encoder.opcode_1byte(opc.bytes[0]),
+            2 => encoder.opcode_2byte(opc.bytes[0], opc.bytes[1]),
+            3 => encoder.opcode_3byte(opc.bytes[0], opc.bytes[1], opc.bytes[2]),
+            else => unreachable, // NOTE(review): init does not reject an empty slice, so count == 0 would land here — confirm no caller passes one
         }
     }
 
     fn encodeWithReg(opc: OpCode, encoder: Encoder, reg: Register) void {
-        assert(opc == .one_byte);
-        encoder.opcode_withReg(opc.one_byte, reg.lowEnc());
+        assert(opc.count == 1); // only single-byte opcodes are accepted here
+        encoder.opcode_withReg(opc.bytes[0], reg.lowEnc());
     }
 };
 
 inline fn getOpCode(tag: Tag, enc: Encoding, is_one_byte: bool) OpCode {
+    // zig fmt: off
     switch (enc) {
         .zo => return switch (tag) {
-            .ret_near => OpCode.oneByte(0xc3),
-            .ret_far => OpCode.oneByte(0xcb),
-            .int3 => OpCode.oneByte(0xcc),
-            .nop => OpCode.oneByte(0x90),
-            .syscall => OpCode.twoByte(0x0f, 0x05),
-            .cbw => OpCode.oneByte(0x98),
-            .cwd, .cdq, .cqo => OpCode.oneByte(0x99),
-            else => unreachable,
+            .ret_near => OpCode.init(&.{0xc3}),
+            .ret_far  => OpCode.init(&.{0xcb}),
+            .int3     => OpCode.init(&.{0xcc}),
+            .nop      => OpCode.init(&.{0x90}),
+            .syscall  => OpCode.init(&.{ 0x0f, 0x05 }),
+            .cbw      => OpCode.init(&.{0x98}),
+            .cwd,
+            .cdq,
+            .cqo      => OpCode.init(&.{0x99}),
+            else      => unreachable,
         },
         .d => return switch (tag) {
-            .jmp_near => OpCode.oneByte(0xe9),
-            .call_near => OpCode.oneByte(0xe8),
-            .jo => if (is_one_byte) OpCode.oneByte(0x70) else OpCode.twoByte(0x0f, 0x80),
-            .jno => if (is_one_byte) OpCode.oneByte(0x71) else OpCode.twoByte(0x0f, 0x81),
-            .jb, .jc, .jnae => if (is_one_byte) OpCode.oneByte(0x72) else OpCode.twoByte(0x0f, 0x82),
-            .jnb, .jnc, .jae => if (is_one_byte) OpCode.oneByte(0x73) else OpCode.twoByte(0x0f, 0x83),
-            .je, .jz => if (is_one_byte) OpCode.oneByte(0x74) else OpCode.twoByte(0x0f, 0x84),
-            .jne, .jnz => if (is_one_byte) OpCode.oneByte(0x75) else OpCode.twoByte(0x0f, 0x85),
-            .jna, .jbe => if (is_one_byte) OpCode.oneByte(0x76) else OpCode.twoByte(0x0f, 0x86),
-            .jnbe, .ja => if (is_one_byte) OpCode.oneByte(0x77) else OpCode.twoByte(0x0f, 0x87),
-            .js => if (is_one_byte) OpCode.oneByte(0x78) else OpCode.twoByte(0x0f, 0x88),
-            .jns => if (is_one_byte) OpCode.oneByte(0x79) else OpCode.twoByte(0x0f, 0x89),
-            .jpe, .jp => if (is_one_byte) OpCode.oneByte(0x7a) else OpCode.twoByte(0x0f, 0x8a),
-            .jpo, .jnp => if (is_one_byte) OpCode.oneByte(0x7b) else OpCode.twoByte(0x0f, 0x8b),
-            .jnge, .jl => if (is_one_byte) OpCode.oneByte(0x7c) else OpCode.twoByte(0x0f, 0x8c),
-            .jge, .jnl => if (is_one_byte) OpCode.oneByte(0x7d) else OpCode.twoByte(0x0f, 0x8d),
-            .jle, .jng => if (is_one_byte) OpCode.oneByte(0x7e) else OpCode.twoByte(0x0f, 0x8e),
-            .jg, .jnle => if (is_one_byte) OpCode.oneByte(0x7f) else OpCode.twoByte(0x0f, 0x8f),
-            else => unreachable,
+            .jmp_near  =>                  OpCode.init(&.{0xe9}),
+            .call_near =>                  OpCode.init(&.{0xe8}),
+            .jo        => if (is_one_byte) OpCode.init(&.{0x70}) else OpCode.init(&.{0x0f,0x80}),
+            .jno       => if (is_one_byte) OpCode.init(&.{0x71}) else OpCode.init(&.{0x0f,0x81}),
+            .jb,
+            .jc,
+            .jnae      => if (is_one_byte) OpCode.init(&.{0x72}) else OpCode.init(&.{0x0f,0x82}),
+            .jnb,
+            .jnc, 
+            .jae       => if (is_one_byte) OpCode.init(&.{0x73}) else OpCode.init(&.{0x0f,0x83}),
+            .je, 
+            .jz        => if (is_one_byte) OpCode.init(&.{0x74}) else OpCode.init(&.{0x0f,0x84}),
+            .jne, 
+            .jnz       => if (is_one_byte) OpCode.init(&.{0x75}) else OpCode.init(&.{0x0f,0x85}),
+            .jna, 
+            .jbe       => if (is_one_byte) OpCode.init(&.{0x76}) else OpCode.init(&.{0x0f,0x86}),
+            .jnbe, 
+            .ja        => if (is_one_byte) OpCode.init(&.{0x77}) else OpCode.init(&.{0x0f,0x87}),
+            .js        => if (is_one_byte) OpCode.init(&.{0x78}) else OpCode.init(&.{0x0f,0x88}),
+            .jns       => if (is_one_byte) OpCode.init(&.{0x79}) else OpCode.init(&.{0x0f,0x89}),
+            .jpe, 
+            .jp        => if (is_one_byte) OpCode.init(&.{0x7a}) else OpCode.init(&.{0x0f,0x8a}),
+            .jpo, 
+            .jnp       => if (is_one_byte) OpCode.init(&.{0x7b}) else OpCode.init(&.{0x0f,0x8b}),
+            .jnge, 
+            .jl        => if (is_one_byte) OpCode.init(&.{0x7c}) else OpCode.init(&.{0x0f,0x8c}),
+            .jge, 
+            .jnl       => if (is_one_byte) OpCode.init(&.{0x7d}) else OpCode.init(&.{0x0f,0x8d}),
+            .jle, 
+            .jng       => if (is_one_byte) OpCode.init(&.{0x7e}) else OpCode.init(&.{0x0f,0x8e}),
+            .jg, 
+            .jnle      => if (is_one_byte) OpCode.init(&.{0x7f}) else OpCode.init(&.{0x0f,0x8f}),
+            else       => unreachable,
         },
         .m => return switch (tag) {
-            .jmp_near, .call_near, .push => OpCode.oneByte(0xff),
-            .pop => OpCode.oneByte(0x8f),
-            .seto => OpCode.twoByte(0x0f, 0x90),
-            .setno => OpCode.twoByte(0x0f, 0x91),
-            .setb, .setc, .setnae => OpCode.twoByte(0x0f, 0x92),
-            .setnb, .setnc, .setae => OpCode.twoByte(0x0f, 0x93),
-            .sete, .setz => OpCode.twoByte(0x0f, 0x94),
-            .setne, .setnz => OpCode.twoByte(0x0f, 0x95),
-            .setbe, .setna => OpCode.twoByte(0x0f, 0x96),
-            .seta, .setnbe => OpCode.twoByte(0x0f, 0x97),
-            .sets => OpCode.twoByte(0x0f, 0x98),
-            .setns => OpCode.twoByte(0x0f, 0x99),
-            .setp, .setpe => OpCode.twoByte(0x0f, 0x9a),
-            .setnp, .setop => OpCode.twoByte(0x0f, 0x9b),
-            .setl, .setnge => OpCode.twoByte(0x0f, 0x9c),
-            .setnl, .setge => OpCode.twoByte(0x0f, 0x9d),
-            .setle, .setng => OpCode.twoByte(0x0f, 0x9e),
-            .setnle, .setg => OpCode.twoByte(0x0f, 0x9f),
-            .idiv, .div, .imul, .mul => OpCode.oneByte(if (is_one_byte) 0xf6 else 0xf7),
-            .fisttp16 => OpCode.oneByte(0xdf),
-            .fisttp32 => OpCode.oneByte(0xdb),
-            .fisttp64 => OpCode.oneByte(0xdd),
-            .fld32 => OpCode.oneByte(0xd9),
-            .fld64 => OpCode.oneByte(0xdd),
-            else => unreachable,
+            .jmp_near,
+            .call_near,
+            .push       =>                  OpCode.init(&.{0xff}),
+            .pop        =>                  OpCode.init(&.{0x8f}),
+            .seto       =>                  OpCode.init(&.{0x0f,0x90}),
+            .setno      =>                  OpCode.init(&.{0x0f,0x91}),
+            .setb,
+            .setc,
+            .setnae     =>                  OpCode.init(&.{0x0f,0x92}),
+            .setnb,
+            .setnc,
+            .setae      =>                  OpCode.init(&.{0x0f,0x93}),
+            .sete,
+            .setz       =>                  OpCode.init(&.{0x0f,0x94}),
+            .setne,
+            .setnz      =>                  OpCode.init(&.{0x0f,0x95}),
+            .setbe,
+            .setna      =>                  OpCode.init(&.{0x0f,0x96}),
+            .seta,
+            .setnbe     =>                  OpCode.init(&.{0x0f,0x97}),
+            .sets       =>                  OpCode.init(&.{0x0f,0x98}),
+            .setns      =>                  OpCode.init(&.{0x0f,0x99}),
+            .setp,
+            .setpe      =>                  OpCode.init(&.{0x0f,0x9a}),
+            .setnp, 
+            .setop      =>                  OpCode.init(&.{0x0f,0x9b}),
+            .setl, 
+            .setnge     =>                  OpCode.init(&.{0x0f,0x9c}),
+            .setnl,
+            .setge      =>                  OpCode.init(&.{0x0f,0x9d}),
+            .setle,
+            .setng      =>                  OpCode.init(&.{0x0f,0x9e}),
+            .setnle,
+            .setg       =>                  OpCode.init(&.{0x0f,0x9f}),
+            .idiv,
+            .div,
+            .imul,
+            .mul        => if (is_one_byte) OpCode.init(&.{0xf6}) else OpCode.init(&.{0xf7}),
+            .fisttp16   =>                  OpCode.init(&.{0xdf}),
+            .fisttp32   =>                  OpCode.init(&.{0xdb}),
+            .fisttp64   =>                  OpCode.init(&.{0xdd}),
+            .fld32      =>                  OpCode.init(&.{0xd9}),
+            .fld64      =>                  OpCode.init(&.{0xdd}),
+            else        => unreachable,
         },
         .o => return switch (tag) {
-            .push => OpCode.oneByte(0x50),
-            .pop => OpCode.oneByte(0x58),
-            else => unreachable,
+            .push => OpCode.init(&.{0x50}),
+            .pop  => OpCode.init(&.{0x58}),
+            else  => unreachable,
         },
         .i => return switch (tag) {
-            .push => OpCode.oneByte(if (is_one_byte) 0x6a else 0x68),
-            .@"test" => OpCode.oneByte(if (is_one_byte) 0xa8 else 0xa9),
-            .ret_near => OpCode.oneByte(0xc2),
-            .ret_far => OpCode.oneByte(0xca),
-            else => unreachable,
+            .push     => if (is_one_byte) OpCode.init(&.{0x6a}) else OpCode.init(&.{0x68}),
+            .@"test"  => if (is_one_byte) OpCode.init(&.{0xa8}) else OpCode.init(&.{0xa9}),
+            .ret_near => OpCode.init(&.{0xc2}),
+            .ret_far  => OpCode.init(&.{0xca}),
+            else      => unreachable,
         },
         .m1 => return switch (tag) {
-            .shl, .sal, .shr, .sar => OpCode.oneByte(if (is_one_byte) 0xd0 else 0xd1),
-            else => unreachable,
+            .shl, .sal,
+            .shr, .sar  => if (is_one_byte) OpCode.init(&.{0xd0}) else OpCode.init(&.{0xd1}),
+            else        => unreachable,
         },
         .mc => return switch (tag) {
-            .shl, .sal, .shr, .sar => OpCode.oneByte(if (is_one_byte) 0xd2 else 0xd3),
-            else => unreachable,
+            .shl, .sal,
+            .shr, .sar  => if (is_one_byte) OpCode.init(&.{0xd2}) else OpCode.init(&.{0xd3}),
+            else        => unreachable,
         },
         .mi => return switch (tag) {
-            .adc, .add, .sub, .xor, .@"and", .@"or", .sbb, .cmp => OpCode.oneByte(if (is_one_byte) 0x80 else 0x81),
-            .mov => OpCode.oneByte(if (is_one_byte) 0xc6 else 0xc7),
-            .@"test" => OpCode.oneByte(if (is_one_byte) 0xf6 else 0xf7),
-            else => unreachable,
+            .adc, .add,
+            .sub, .xor,
+            .@"and", .@"or",
+            .sbb, .cmp       => if (is_one_byte) OpCode.init(&.{0x80}) else OpCode.init(&.{0x81}),
+            .mov             => if (is_one_byte) OpCode.init(&.{0xc6}) else OpCode.init(&.{0xc7}),
+            .@"test"         => if (is_one_byte) OpCode.init(&.{0xf6}) else OpCode.init(&.{0xf7}),
+            else             => unreachable,
         },
         .mi8 => return switch (tag) {
-            .adc, .add, .sub, .xor, .@"and", .@"or", .sbb, .cmp => OpCode.oneByte(0x83),
-            .shl, .sal, .shr, .sar => OpCode.oneByte(if (is_one_byte) 0xc0 else 0xc1),
-            else => unreachable,
+            .adc, .add,
+            .sub, .xor,
+            .@"and", .@"or",
+            .sbb, .cmp        =>                  OpCode.init(&.{0x83}),
+            .shl, .sal,
+            .shr, .sar        => if (is_one_byte) OpCode.init(&.{0xc0}) else OpCode.init(&.{0xc1}),
+            else              => unreachable,
         },
         .mr => return switch (tag) {
-            .adc => OpCode.oneByte(if (is_one_byte) 0x10 else 0x11),
-            .add => OpCode.oneByte(if (is_one_byte) 0x00 else 0x01),
-            .sub => OpCode.oneByte(if (is_one_byte) 0x28 else 0x29),
-            .xor => OpCode.oneByte(if (is_one_byte) 0x30 else 0x31),
-            .@"and" => OpCode.oneByte(if (is_one_byte) 0x20 else 0x21),
-            .@"or" => OpCode.oneByte(if (is_one_byte) 0x08 else 0x09),
-            .sbb => OpCode.oneByte(if (is_one_byte) 0x18 else 0x19),
-            .cmp => OpCode.oneByte(if (is_one_byte) 0x38 else 0x39),
-            .mov => OpCode.oneByte(if (is_one_byte) 0x88 else 0x89),
-            .@"test" => OpCode.oneByte(if (is_one_byte) 0x84 else 0x85),
-            else => unreachable,
+            .adc     => if (is_one_byte) OpCode.init(&.{0x10}) else OpCode.init(&.{0x11}),
+            .add     => if (is_one_byte) OpCode.init(&.{0x00}) else OpCode.init(&.{0x01}),
+            .sub     => if (is_one_byte) OpCode.init(&.{0x28}) else OpCode.init(&.{0x29}),
+            .xor     => if (is_one_byte) OpCode.init(&.{0x30}) else OpCode.init(&.{0x31}),
+            .@"and"  => if (is_one_byte) OpCode.init(&.{0x20}) else OpCode.init(&.{0x21}),
+            .@"or"   => if (is_one_byte) OpCode.init(&.{0x08}) else OpCode.init(&.{0x09}),
+            .sbb     => if (is_one_byte) OpCode.init(&.{0x18}) else OpCode.init(&.{0x19}),
+            .cmp     => if (is_one_byte) OpCode.init(&.{0x38}) else OpCode.init(&.{0x39}),
+            .mov     => if (is_one_byte) OpCode.init(&.{0x88}) else OpCode.init(&.{0x89}),
+            .@"test" => if (is_one_byte) OpCode.init(&.{0x84}) else OpCode.init(&.{0x85}),
+            .movsd   =>                  OpCode.init(&.{0xf2,0x0f,0x11}),
+            .movss   =>                  OpCode.init(&.{0xf3,0x0f,0x11}),
+            else     => unreachable,
         },
         .rm => return switch (tag) {
-            .adc => OpCode.oneByte(if (is_one_byte) 0x12 else 0x13),
-            .add => OpCode.oneByte(if (is_one_byte) 0x02 else 0x03),
-            .sub => OpCode.oneByte(if (is_one_byte) 0x2a else 0x2b),
-            .xor => OpCode.oneByte(if (is_one_byte) 0x32 else 0x33),
-            .@"and" => OpCode.oneByte(if (is_one_byte) 0x22 else 0x23),
-            .@"or" => OpCode.oneByte(if (is_one_byte) 0x0a else 0x0b),
-            .sbb => OpCode.oneByte(if (is_one_byte) 0x1a else 0x1b),
-            .cmp => OpCode.oneByte(if (is_one_byte) 0x3a else 0x3b),
-            .mov => OpCode.oneByte(if (is_one_byte) 0x8a else 0x8b),
-            .movsx => OpCode.twoByte(0x0f, if (is_one_byte) 0xbe else 0xbf),
-            .movsxd => OpCode.oneByte(0x63),
-            .movzx => OpCode.twoByte(0x0f, if (is_one_byte) 0xb6 else 0xb7),
-            .lea => OpCode.oneByte(if (is_one_byte) 0x8c else 0x8d),
-            .imul => OpCode.twoByte(0x0f, 0xaf),
-            .cmove, .cmovz => OpCode.twoByte(0x0f, 0x44),
-            .cmovb, .cmovnae => OpCode.twoByte(0x0f, 0x42),
-            .cmovl, .cmovng => OpCode.twoByte(0x0f, 0x4c),
+            .adc      => if (is_one_byte) OpCode.init(&.{0x12})      else OpCode.init(&.{0x13}),
+            .add      => if (is_one_byte) OpCode.init(&.{0x02})      else OpCode.init(&.{0x03}),
+            .sub      => if (is_one_byte) OpCode.init(&.{0x2a})      else OpCode.init(&.{0x2b}),
+            .xor      => if (is_one_byte) OpCode.init(&.{0x32})      else OpCode.init(&.{0x33}),
+            .@"and"   => if (is_one_byte) OpCode.init(&.{0x22})      else OpCode.init(&.{0x23}),
+            .@"or"    => if (is_one_byte) OpCode.init(&.{0x0a})      else OpCode.init(&.{0x0b}),
+            .sbb      => if (is_one_byte) OpCode.init(&.{0x1a})      else OpCode.init(&.{0x1b}),
+            .cmp      => if (is_one_byte) OpCode.init(&.{0x3a})      else OpCode.init(&.{0x3b}),
+            .mov      => if (is_one_byte) OpCode.init(&.{0x8a})      else OpCode.init(&.{0x8b}),
+            .movsx    => if (is_one_byte) OpCode.init(&.{0x0f,0xbe}) else OpCode.init(&.{0x0f,0xbf}),
+            .movsxd   =>                  OpCode.init(&.{0x63}),
+            .movzx    => if (is_one_byte) OpCode.init(&.{0x0f,0xb6}) else OpCode.init(&.{0x0f,0xb7}),
+            .lea      => if (is_one_byte) OpCode.init(&.{0x8c})      else OpCode.init(&.{0x8d}),
+            .imul     =>                  OpCode.init(&.{0x0f,0xaf}),
+            .cmove, 
+            .cmovz    =>                  OpCode.init(&.{0x0f,0x44}),
+            .cmovb,
+            .cmovnae  =>                  OpCode.init(&.{0x0f,0x42}),
+            .cmovl,
+            .cmovng   =>                  OpCode.init(&.{0x0f,0x4c}),
+            .movsd    =>                  OpCode.init(&.{0xf2,0x0f,0x10}),
+            .movss    =>                  OpCode.init(&.{0xf3,0x0f,0x10}),
+            .addsd    =>                  OpCode.init(&.{0xf2,0x0f,0x58}),
+            .addss    =>                  OpCode.init(&.{0xf3,0x0f,0x58}),
+            .ucomisd  =>                  OpCode.init(&.{0x66,0x0f,0x2e}),
+            .ucomiss  =>                  OpCode.init(&.{0x0f,0x2e}),
             else => unreachable,
         },
         .oi => return switch (tag) {
-            .mov => OpCode.oneByte(if (is_one_byte) 0xb0 else 0xb8),
+            .mov => if (is_one_byte) OpCode.init(&.{0xb0}) else OpCode.init(&.{0xb8}),
             else => unreachable,
         },
         .fd => return switch (tag) {
-            .mov => OpCode.oneByte(if (is_one_byte) 0xa0 else 0xa1),
+            .mov => if (is_one_byte) OpCode.init(&.{0xa0}) else OpCode.init(&.{0xa1}),
             else => unreachable,
         },
         .td => return switch (tag) {
-            .mov => OpCode.oneByte(if (is_one_byte) 0xa2 else 0xa3),
+            .mov => if (is_one_byte) OpCode.init(&.{0xa2}) else OpCode.init(&.{0xa3}),
             else => unreachable,
         },
         .rmi => return switch (tag) {
-            .imul => OpCode.oneByte(if (is_one_byte) 0x6b else 0x69),
-            else => unreachable,
+            .imul => if (is_one_byte) OpCode.init(&.{0x6b}) else OpCode.init(&.{0x69}),
+            else  => unreachable,
         },
         .mv => return switch (tag) {
-            .vmovsd, .vmovss => OpCode.oneByte(0x11),
+            .vmovsd,
+            .vmovss => OpCode.init(&.{0x11}),
             else => unreachable,
         },
         .vm => return switch (tag) {
-            .vmovsd, .vmovss => OpCode.oneByte(0x10),
-            .vucomisd, .vucomiss => OpCode.oneByte(0x2e),
+            .vmovsd, 
+            .vmovss   => OpCode.init(&.{0x10}),
+            .vucomisd,
+            .vucomiss => OpCode.init(&.{0x2e}),
             else => unreachable,
         },
         .rvm => return switch (tag) {
-            .vaddsd, .vaddss => OpCode.oneByte(0x58),
-            .vmovsd, .vmovss => OpCode.oneByte(0x10),
+            .vaddsd,
+            .vaddss  => OpCode.init(&.{0x58}),
+            .vmovsd,
+            .vmovss  => OpCode.init(&.{0x10}),
             else => unreachable,
         },
         .rvmi => return switch (tag) {
-            .vcmpsd, .vcmpss => OpCode.oneByte(0xc2),
-            else => unreachable,
+            .vcmpsd,
+            .vcmpss  => OpCode.init(&.{0xc2}),
+            else     => unreachable,
         },
     }
+    // zig fmt: on
 }
 
 inline fn getModRmExt(tag: Tag) u3 {
src/arch/x86_64/Mir.zig
@@ -345,11 +345,29 @@ pub const Inst = struct {
         /// Nop
         nop,
 
-        /// AVX instructions
+        /// SSE instructions
         /// ops flags:  form:
         ///       0b00  reg1, qword ptr [reg2 + imm32]
         ///       0b01  qword ptr [reg1 + imm32], reg2
         ///       0b10  reg1, reg2
+        mov_f64_sse,
+        mov_f32_sse,
+
+        /// ops flags:  form:
+        ///       0b00  reg1, reg2
+        add_f64_sse,
+        add_f32_sse,
+
+        /// ops flags:  form:
+        ///       0b00  reg1, reg2
+        cmp_f64_sse,
+        cmp_f32_sse,
+
+        /// AVX instructions
+        /// ops flags:  form:
+        ///       0b00  reg1, qword ptr [reg2 + imm32]
+        ///       0b01  qword ptr [reg1 + imm32], reg2
+        ///       0b10  reg1, reg1, reg2
         mov_f64_avx,
         mov_f32_avx,
 
@@ -359,7 +377,7 @@ pub const Inst = struct {
         add_f32_avx,
 
         /// ops flags:  form:
-        ///
+        ///       0b00  reg1, reg1, reg2
         cmp_f64_avx,
         cmp_f32_avx,