Commit edd63f9aba

Jacob Young <jacobly0@users.noreply.github.com>
2023-03-18 10:49:20
x86_64: reimplement inline memcpy and memset
1 parent c865c8f
src/arch/x86_64/bits.zig
@@ -6,6 +6,9 @@ const Allocator = std.mem.Allocator;
 const ArrayList = std.ArrayList;
 const DW = std.dwarf;
 
+pub const StringRepeat = enum(u3) { none, rep, repe, repz, repne, repnz };
+pub const StringWidth = enum(u2) { b, w, d, q };
+
 /// EFLAGS condition codes
 pub const Condition = enum(u5) {
     /// above
src/arch/x86_64/CodeGen.zig
@@ -1286,7 +1286,7 @@ pub fn spillEflagsIfOccupied(self: *Self) !void {
     }
 }
 
-pub fn spillRegisters(self: *Self, comptime count: comptime_int, registers: [count]Register) !void {
+pub fn spillRegisters(self: *Self, registers: []const Register) !void {
     for (registers) |reg| {
         try self.register_manager.getReg(reg, null);
     }
@@ -1540,7 +1540,7 @@ fn airMulDivBinOp(self: *Self, inst: Air.Inst.Index) !void {
             break :result try self.genBinOp(inst, tag, bin_op.lhs, bin_op.rhs);
         }
 
-        try self.spillRegisters(2, .{ .rax, .rdx });
+        try self.spillRegisters(&.{ .rax, .rdx });
 
         const lhs = try self.resolveInst(bin_op.lhs);
         const rhs = try self.resolveInst(bin_op.rhs);
@@ -1594,7 +1594,7 @@ fn airAddSubShlWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
                 try self.spillEflagsIfOccupied();
 
                 if (tag == .shl_with_overflow) {
-                    try self.spillRegisters(1, .{.rcx});
+                    try self.spillRegisters(&.{.rcx});
                 }
 
                 const partial: MCValue = switch (tag) {
@@ -1721,7 +1721,7 @@ fn airMulWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
                     try self.spillEflagsIfOccupied();
                     self.eflags_inst = inst;
 
-                    try self.spillRegisters(2, .{ .rax, .rdx });
+                    try self.spillRegisters(&.{ .rax, .rdx });
 
                     const lhs = try self.resolveInst(bin_op.lhs);
                     const rhs = try self.resolveInst(bin_op.rhs);
@@ -1774,7 +1774,7 @@ fn airMulWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
                             break :dst_reg dst_reg;
                         },
                         .unsigned => {
-                            try self.spillRegisters(2, .{ .rax, .rdx });
+                            try self.spillRegisters(&.{ .rax, .rdx });
 
                             const lhs = try self.resolveInst(bin_op.lhs);
                             const rhs = try self.resolveInst(bin_op.rhs);
@@ -1888,7 +1888,7 @@ fn airShlShrBinOp(self: *Self, inst: Air.Inst.Index) !void {
         return self.finishAir(inst, .dead, .{ bin_op.lhs, bin_op.rhs, .none });
     }
 
-    try self.spillRegisters(1, .{.rcx});
+    try self.spillRegisters(&.{.rcx});
 
     const tag = self.air.instructions.items(.tag)[inst];
     const lhs = try self.resolveInst(bin_op.lhs);
@@ -2832,6 +2832,7 @@ fn store(self: *Self, ptr: MCValue, value: MCValue, ptr_ty: Type, value_ty: Type
                 .unreach => unreachable,
                 .eflags => unreachable,
                 .undef => {
+                    if (!self.wantSafety()) return; // The already existing value will do just fine.
                     switch (abi_size) {
                         1 => try self.store(ptr, .{ .immediate = 0xaa }, ptr_ty, value_ty),
                         2 => try self.store(ptr, .{ .immediate = 0xaaaa }, ptr_ty, value_ty),
@@ -4035,11 +4036,40 @@ fn airCall(self: *Self, inst: Air.Inst.Index, modifier: std.builtin.CallModifier
     defer info.deinit(self);
 
     try self.spillEflagsIfOccupied();
+    try self.spillRegisters(abi.getCallerPreservedRegs(self.target.*));
 
-    for (abi.getCallerPreservedRegs(self.target.*)) |reg| {
-        try self.register_manager.getReg(reg, null);
+    // set stack arguments first because this can clobber registers
+    // also spill the registers used by register arguments as we go
+    if (info.return_value == .stack_offset) {
+        try self.spillRegisters(&.{abi.getCAbiIntParamRegs(self.target.*)[0]});
+    }
+    for (args, info.args) |arg, mc_arg| {
+        const arg_ty = self.air.typeOf(arg);
+        const arg_mcv = try self.resolveInst(arg);
+        // Here we do not use setRegOrMem even though the logic is similar, because
+        // the function call will move the stack pointer, so the offsets are different.
+        switch (mc_arg) {
+            .none => {},
+            .register => |reg| try self.spillRegisters(&.{reg}),
+            .stack_offset => |off| {
+                // TODO rewrite using `genSetStack`
+                try self.genSetStackArg(arg_ty, off, arg_mcv);
+            },
+            .ptr_stack_offset => {
+                return self.fail("TODO implement calling with MCValue.ptr_stack_offset arg", .{});
+            },
+            .undef => unreachable,
+            .immediate => unreachable,
+            .unreach => unreachable,
+            .dead => unreachable,
+            .memory => unreachable,
+            .linker_load => unreachable,
+            .eflags => unreachable,
+            .register_overflow => unreachable,
+        }
     }
 
+    // now we are free to set register arguments
     const ret_reg_lock: ?RegisterLock = blk: {
         if (info.return_value == .stack_offset) {
             const ret_ty = fn_ty.fnReturnType();
@@ -4049,7 +4079,6 @@ fn airCall(self: *Self, inst: Air.Inst.Index, modifier: std.builtin.CallModifier
             log.debug("airCall: return value on stack at offset {}", .{stack_offset});
 
             const ret_reg = abi.getCAbiIntParamRegs(self.target.*)[0];
-            try self.register_manager.getReg(ret_reg, null);
             try self.genSetReg(Type.usize, ret_reg, .{ .ptr_stack_offset = stack_offset });
             const ret_reg_lock = self.register_manager.lockRegAssumeUnused(ret_reg);
 
@@ -4061,25 +4090,12 @@ fn airCall(self: *Self, inst: Air.Inst.Index, modifier: std.builtin.CallModifier
     };
     defer if (ret_reg_lock) |lock| self.register_manager.unlockReg(lock);
 
-    for (args, info.args) |arg, info_arg| {
-        const mc_arg = info_arg;
+    for (args, info.args) |arg, mc_arg| {
         const arg_ty = self.air.typeOf(arg);
         const arg_mcv = try self.resolveInst(arg);
-        // Here we do not use setRegOrMem even though the logic is similar, because
-        // the function call will move the stack pointer, so the offsets are different.
         switch (mc_arg) {
-            .none => continue,
-            .register => |reg| {
-                try self.register_manager.getReg(reg, null);
-                try self.genSetReg(arg_ty, reg, arg_mcv);
-            },
-            .stack_offset => |off| {
-                // TODO rewrite using `genSetStack`
-                try self.genSetStackArg(arg_ty, off, arg_mcv);
-            },
-            .ptr_stack_offset => {
-                return self.fail("TODO implement calling with MCValue.ptr_stack_offset arg", .{});
-            },
+            .none, .stack_offset, .ptr_stack_offset => {},
+            .register => |reg| try self.genSetReg(arg_ty, reg, arg_mcv),
             .undef => unreachable,
             .immediate => unreachable,
             .unreach => unreachable,
@@ -5277,6 +5293,7 @@ fn genSetStackArg(self: *Self, ty: Type, stack_offset: i32, mcv: MCValue) InnerE
         .dead => unreachable,
         .unreach, .none => return,
         .undef => {
+            if (!self.wantSafety()) return; // The already existing value will do just fine.
             if (abi_size <= 8) {
                 const reg = try self.copyToTmpRegister(ty, mcv);
                 return self.genSetStackArg(ty, stack_offset, MCValue{ .register = reg });
@@ -5384,8 +5401,7 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: i32, mcv: MCValue, opts: Inl
         .dead => unreachable,
         .unreach, .none => return, // Nothing to do.
         .undef => {
-            if (!self.wantSafety())
-                return; // The already existing value will do just fine.
+            if (!self.wantSafety()) return; // The already existing value will do just fine.
             // TODO Upgrade this to a memset call when we have that available.
             switch (abi_size) {
                 1, 2, 4 => {
@@ -5607,19 +5623,14 @@ fn genInlineMemcpy(
         null;
     defer if (dsbase_lock) |lock| self.register_manager.unlockReg(lock);
 
-    const regs = try self.register_manager.allocRegs(5, .{ null, null, null, null, null }, gp);
-    const dst_addr_reg = regs[0];
-    const src_addr_reg = regs[1];
-    const index_reg = regs[2].to64();
-    const count_reg = regs[3].to64();
-    const tmp_reg = regs[4].to8();
+    try self.spillRegisters(&.{ .rdi, .rsi, .rcx });
 
     switch (dst_ptr) {
         .memory, .linker_load => {
-            try self.loadMemPtrIntoRegister(dst_addr_reg, Type.usize, dst_ptr);
+            try self.loadMemPtrIntoRegister(.rdi, Type.usize, dst_ptr);
         },
         .ptr_stack_offset, .stack_offset => |off| {
-            try self.asmRegisterMemory(.lea, dst_addr_reg.to64(), Memory.sib(.qword, .{
+            try self.asmRegisterMemory(.lea, .rdi, Memory.sib(.qword, .{
                 .base = opts.dest_stack_base orelse .rbp,
                 .disp = -off,
             }));
@@ -5627,7 +5638,7 @@ fn genInlineMemcpy(
         .register => |reg| {
             try self.asmRegisterRegister(
                 .mov,
-                registerAlias(dst_addr_reg, @intCast(u32, @divExact(reg.bitSize(), 8))),
+                registerAlias(.rdi, @intCast(u32, @divExact(reg.bitSize(), 8))),
                 reg,
             );
         },
@@ -5638,10 +5649,10 @@ fn genInlineMemcpy(
 
     switch (src_ptr) {
         .memory, .linker_load => {
-            try self.loadMemPtrIntoRegister(src_addr_reg, Type.usize, src_ptr);
+            try self.loadMemPtrIntoRegister(.rsi, Type.usize, src_ptr);
         },
         .ptr_stack_offset, .stack_offset => |off| {
-            try self.asmRegisterMemory(.lea, src_addr_reg.to64(), Memory.sib(.qword, .{
+            try self.asmRegisterMemory(.lea, .rsi, Memory.sib(.qword, .{
                 .base = opts.source_stack_base orelse .rbp,
                 .disp = -off,
             }));
@@ -5649,7 +5660,7 @@ fn genInlineMemcpy(
         .register => |reg| {
             try self.asmRegisterRegister(
                 .mov,
-                registerAlias(src_addr_reg, @intCast(u32, @divExact(reg.bitSize(), 8))),
+                registerAlias(.rsi, @intCast(u32, @divExact(reg.bitSize(), 8))),
                 reg,
             );
         },
@@ -5658,37 +5669,12 @@ fn genInlineMemcpy(
         },
     }
 
-    try self.genSetReg(Type.usize, count_reg, len);
-    try self.asmRegisterImmediate(.mov, index_reg, Immediate.u(0));
-    const loop_start = try self.addInst(.{
-        .tag = .cmp,
-        .ops = .ri_u,
-        .data = .{ .ri = .{
-            .r1 = count_reg,
-            .imm = 0,
-        } },
+    try self.genSetReg(Type.usize, .rcx, len);
+    _ = try self.addInst(.{
+        .tag = .movs,
+        .ops = .string,
+        .data = .{ .string = .{ .repeat = .rep, .width = .b } },
     });
-    const loop_reloc = try self.asmJccReloc(undefined, .e);
-    try self.asmRegisterMemory(.mov, tmp_reg.to8(), Memory.sib(.byte, .{
-        .base = src_addr_reg,
-        .scale_index = .{
-            .scale = 1,
-            .index = index_reg,
-        },
-        .disp = 0,
-    }));
-    try self.asmMemoryRegister(.mov, Memory.sib(.byte, .{
-        .base = dst_addr_reg,
-        .scale_index = .{
-            .scale = 1,
-            .index = index_reg,
-        },
-        .disp = 0,
-    }), tmp_reg.to8());
-    try self.asmRegisterImmediate(.add, index_reg, Immediate.u(1));
-    try self.asmRegisterImmediate(.sub, count_reg, Immediate.u(1));
-    _ = try self.asmJmpReloc(loop_start);
-    try self.performReloc(loop_reloc);
 }
 
 fn genInlineMemset(
@@ -5698,28 +5684,20 @@ fn genInlineMemset(
     len: MCValue,
     opts: InlineMemcpyOpts,
 ) InnerError!void {
-    const ssbase_lock: ?RegisterLock = if (opts.source_stack_base) |reg|
-        self.register_manager.lockReg(reg)
-    else
-        null;
-    defer if (ssbase_lock) |reg| self.register_manager.unlockReg(reg);
-
     const dsbase_lock: ?RegisterLock = if (opts.dest_stack_base) |reg|
         self.register_manager.lockReg(reg)
     else
         null;
     defer if (dsbase_lock) |lock| self.register_manager.unlockReg(lock);
 
-    const regs = try self.register_manager.allocRegs(2, .{ null, null }, gp);
-    const addr_reg = regs[0];
-    const index_reg = regs[1].to64();
+    try self.spillRegisters(&.{ .rdi, .al, .rcx });
 
     switch (dst_ptr) {
         .memory, .linker_load => {
-            try self.loadMemPtrIntoRegister(addr_reg, Type.usize, dst_ptr);
+            try self.loadMemPtrIntoRegister(.rdi, Type.usize, dst_ptr);
         },
         .ptr_stack_offset, .stack_offset => |off| {
-            try self.asmRegisterMemory(.lea, addr_reg.to64(), Memory.sib(.qword, .{
+            try self.asmRegisterMemory(.lea, .rdi, Memory.sib(.qword, .{
                 .base = opts.dest_stack_base orelse .rbp,
                 .disp = -off,
             }));
@@ -5727,48 +5705,22 @@ fn genInlineMemset(
         .register => |reg| {
             try self.asmRegisterRegister(
                 .mov,
-                registerAlias(addr_reg, @intCast(u32, @divExact(reg.bitSize(), 8))),
+                registerAlias(.rdi, @intCast(u32, @divExact(reg.bitSize(), 8))),
                 reg,
             );
         },
         else => {
-            return self.fail("TODO implement memcpy for setting stack when dest is {}", .{dst_ptr});
+            return self.fail("TODO implement memset for setting stack when dest is {}", .{dst_ptr});
         },
     }
 
-    try self.genSetReg(Type.usize, index_reg, len);
-    try self.genBinOpMir(.sub, Type.usize, .{ .register = index_reg }, .{ .immediate = 1 });
-
-    const loop_start = try self.addInst(.{
-        .tag = .cmp,
-        .ops = .ri_s,
-        .data = .{ .ri = .{
-            .r1 = index_reg,
-            .imm = @bitCast(u32, @as(i32, -1)),
-        } },
+    try self.genSetReg(Type.u8, .al, value);
+    try self.genSetReg(Type.usize, .rcx, len);
+    _ = try self.addInst(.{
+        .tag = .stos,
+        .ops = .string,
+        .data = .{ .string = .{ .repeat = .rep, .width = .b } },
     });
-    const loop_reloc = try self.asmJccReloc(undefined, .e);
-
-    switch (value) {
-        .immediate => |x| {
-            if (x > math.maxInt(i32)) {
-                return self.fail("TODO inline memset for value immediate larger than 32bits", .{});
-            }
-            try self.asmMemoryImmediate(.mov, Memory.sib(.byte, .{
-                .base = addr_reg,
-                .scale_index = .{
-                    .scale = 1,
-                    .index = index_reg,
-                },
-                .disp = 0,
-            }), Immediate.u(@intCast(u8, x)));
-        },
-        else => return self.fail("TODO inline memset for value of type {}", .{value}),
-    }
-
-    try self.asmRegisterImmediate(.sub, index_reg, Immediate.u(1));
-    _ = try self.asmJmpReloc(loop_start);
-    try self.performReloc(loop_reloc);
 }
 
 fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void {
@@ -5788,8 +5740,7 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
         },
         .unreach, .none => return, // Nothing to do.
         .undef => {
-            if (!self.wantSafety())
-                return; // The already existing value will do just fine.
+            if (!self.wantSafety()) return; // The already existing value will do just fine.
             // Write the debug undefined value.
             switch (registerAlias(reg, abi_size).bitSize()) {
                 8 => return self.genSetReg(ty, reg, .{ .immediate = 0xaa }),
@@ -5802,27 +5753,27 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
         .eflags => |cc| {
             return self.asmSetccRegister(reg.to8(), cc);
         },
-        .immediate => |x| {
-            if (x == 0) {
+        .immediate => |imm| {
+            if (imm == 0) {
                 // 32-bit moves zero-extend to 64-bit, so xoring the 32-bit
                 // register is the fastest way to zero a register.
-                return self.asmRegisterRegister(.xor, reg.to32(), reg.to32());
-            }
-            if (ty.isSignedInt()) {
-                const signed_x = @bitCast(i64, x);
-                if (math.minInt(i32) <= signed_x and signed_x <= math.maxInt(i32)) {
-                    return self.asmRegisterImmediate(
-                        .mov,
-                        registerAlias(reg, abi_size),
-                        Immediate.s(@intCast(i32, signed_x)),
-                    );
-                }
+                try self.asmRegisterRegister(.xor, reg.to32(), reg.to32());
+            } else if (abi_size > 4 and math.cast(u32, imm) != null) {
+                // 32-bit moves zero-extend to 64-bit.
+                try self.asmRegisterImmediate(.mov, reg.to32(), Immediate.u(imm));
+            } else if (abi_size <= 4 and @bitCast(i64, imm) < 0) {
+                try self.asmRegisterImmediate(
+                    .mov,
+                    registerAlias(reg, abi_size),
+                    Immediate.s(@intCast(i32, @bitCast(i64, imm))),
+                );
+            } else {
+                try self.asmRegisterImmediate(
+                    .mov,
+                    registerAlias(reg, abi_size),
+                    Immediate.u(imm),
+                );
             }
-            return self.asmRegisterImmediate(
-                .mov,
-                registerAlias(reg, abi_size),
-                Immediate.u(x),
-            );
         },
         .register => |src_reg| {
             // If the registers are the same, nothing to do.
@@ -6136,7 +6087,7 @@ fn airCmpxchg(self: *Self, inst: Air.Inst.Index) !void {
 
 fn airAtomicRmw(self: *Self, inst: Air.Inst.Index) !void {
     _ = inst;
-    return self.fail("TODO implement x86 airAtomicRaw", .{});
+    return self.fail("TODO implement x86 airAtomicRmw", .{});
 }
 
 fn airAtomicLoad(self: *Self, inst: Air.Inst.Index) !void {
@@ -6177,7 +6128,7 @@ fn airMemset(self: *Self, inst: Air.Inst.Index) !void {
 
     try self.genInlineMemset(dst_ptr, src_val, len, .{});
 
-    return self.finishAir(inst, .none, .{ pl_op.operand, .none, .none });
+    return self.finishAir(inst, .none, .{ pl_op.operand, extra.lhs, extra.rhs });
 }
 
 fn airMemcpy(self: *Self, inst: Air.Inst.Index) !void {
@@ -6229,7 +6180,7 @@ fn airMemcpy(self: *Self, inst: Air.Inst.Index) !void {
 
     try self.genInlineMemcpy(dst_ptr, src, len, .{});
 
-    return self.finishAir(inst, .none, .{ pl_op.operand, .none, .none });
+    return self.finishAir(inst, .none, .{ pl_op.operand, extra.lhs, extra.rhs });
 }
 
 fn airTagName(self: *Self, inst: Air.Inst.Index) !void {
src/arch/x86_64/Emit.zig
@@ -130,6 +130,13 @@ pub fn lowerMir(emit: *Emit) InnerError!void {
             .ucomisd,
             => try emit.mirEncodeGeneric(tag, inst),
 
+            .cmps,
+            .lods,
+            .movs,
+            .scas,
+            .stos,
+            => try emit.mirString(tag, inst),
+
             .jmp_reloc => try emit.mirJmpReloc(inst),
 
             .call_extern => try emit.mirCallExtern(inst),
@@ -183,18 +190,8 @@ fn fixupRelocs(emit: *Emit) InnerError!void {
     }
 }
 
-fn encode(emit: *Emit, mnemonic: Instruction.Mnemonic, ops: struct {
-    op1: Instruction.Operand = .none,
-    op2: Instruction.Operand = .none,
-    op3: Instruction.Operand = .none,
-    op4: Instruction.Operand = .none,
-}) InnerError!void {
-    const inst = try Instruction.new(mnemonic, .{
-        .op1 = ops.op1,
-        .op2 = ops.op2,
-        .op3 = ops.op3,
-        .op4 = ops.op4,
-    });
+fn encode(emit: *Emit, mnemonic: Instruction.Mnemonic, ops: Instruction.Init) InnerError!void {
+    const inst = try Instruction.new(mnemonic, ops);
     return inst.encode(emit.code.writer());
 }
 
@@ -318,6 +315,28 @@ fn mirEncodeGeneric(emit: *Emit, tag: Mir.Inst.Tag, inst: Mir.Inst.Index) InnerE
     });
 }
 
+fn mirString(emit: *Emit, tag: Mir.Inst.Tag, inst: Mir.Inst.Index) InnerError!void {
+    const ops = emit.mir.instructions.items(.ops)[inst];
+    switch (ops) {
+        .string => {
+            const data = emit.mir.instructions.items(.data)[inst].string;
+            const mnemonic = switch (tag) {
+                inline .cmps, .lods, .movs, .scas, .stos => |comptime_tag| switch (data.width) {
+                    inline else => |comptime_width| @field(
+                        Instruction.Mnemonic,
+                        @tagName(comptime_tag) ++ @tagName(comptime_width),
+                    ),
+                },
+                else => unreachable,
+            };
+            return emit.encode(mnemonic, .{ .prefix = switch (data.repeat) {
+                inline else => |comptime_repeat| @field(Instruction.Prefix, @tagName(comptime_repeat)),
+            } });
+        },
+        else => unreachable,
+    }
+}
+
 fn mirMovMoffs(emit: *Emit, inst: Mir.Inst.Index) InnerError!void {
     const ops = emit.mir.instructions.items(.ops)[inst];
     const payload = emit.mir.instructions.items(.data)[inst].payload;
@@ -377,10 +396,9 @@ fn mirMovsx(emit: *Emit, inst: Mir.Inst.Index) InnerError!void {
 }
 
 fn mnemonicFromConditionCode(comptime basename: []const u8, cc: bits.Condition) Instruction.Mnemonic {
-    inline for (@typeInfo(bits.Condition).Enum.fields) |field| {
-        if (mem.eql(u8, field.name, @tagName(cc)))
-            return @field(Instruction.Mnemonic, basename ++ field.name);
-    } else unreachable;
+    return switch (cc) {
+        inline else => |comptime_cc| @field(Instruction.Mnemonic, basename ++ @tagName(comptime_cc)),
+    };
 }
 
 fn mirCmovcc(emit: *Emit, inst: Mir.Inst.Index) InnerError!void {
src/arch/x86_64/encoder.zig
@@ -15,10 +15,21 @@ pub const Instruction = struct {
     op2: Operand = .none,
     op3: Operand = .none,
     op4: Operand = .none,
+    prefix: Prefix = .none,
     encoding: Encoding,
 
     pub const Mnemonic = Encoding.Mnemonic;
 
+    pub const Prefix = enum(u3) {
+        none,
+        lock,
+        rep,
+        repe,
+        repz,
+        repne,
+        repnz,
+    };
+
     pub const Operand = union(enum) {
         none,
         reg: Register,
@@ -96,18 +107,16 @@ pub const Instruction = struct {
         }
     };
 
-    pub fn new(mnemonic: Mnemonic, args: struct {
+    pub const Init = struct {
+        prefix: Prefix = .none,
         op1: Operand = .none,
         op2: Operand = .none,
         op3: Operand = .none,
         op4: Operand = .none,
-    }) !Instruction {
-        const encoding = (try Encoding.findByMnemonic(mnemonic, .{
-            .op1 = args.op1,
-            .op2 = args.op2,
-            .op3 = args.op3,
-            .op4 = args.op4,
-        })) orelse {
+    };
+
+    pub fn new(mnemonic: Mnemonic, args: Init) !Instruction {
+        const encoding = (try Encoding.findByMnemonic(mnemonic, args)) orelse {
             log.debug("no encoding found for: {s} {s} {s} {s} {s}", .{
                 @tagName(mnemonic),
                 @tagName(Encoding.Op.fromOperand(args.op1)),
@@ -119,6 +128,7 @@ pub const Instruction = struct {
         };
         log.debug("selected encoding: {}", .{encoding});
         return .{
+            .prefix = args.prefix,
             .op1 = args.op1,
             .op2 = args.op2,
             .op3 = args.op3,
@@ -128,6 +138,7 @@ pub const Instruction = struct {
     }
 
     pub fn fmtPrint(inst: Instruction, writer: anytype) !void {
+        if (inst.prefix != .none) try writer.print("{s} ", .{@tagName(inst.prefix)});
         try writer.print("{s}", .{@tagName(inst.encoding.mnemonic)});
         const ops = [_]struct { Operand, Encoding.Op }{
             .{ inst.op1, inst.encoding.op1 },
@@ -215,6 +226,14 @@ pub const Instruction = struct {
         const op_en = enc.op_en;
 
         var legacy = LegacyPrefixes{};
+
+        switch (inst.prefix) {
+            .none => {},
+            .lock => legacy.prefix_f0 = true,
+            .repne, .repnz => legacy.prefix_f2 = true,
+            .rep, .repe, .repz => legacy.prefix_f3 = true,
+        }
+
         if (enc.mode == .none) {
             const bit_size = enc.operandBitSize();
             if (bit_size == 16) {
@@ -811,15 +830,11 @@ const TestEncode = struct {
     buffer: [32]u8 = undefined,
     index: usize = 0,
 
-    fn encode(enc: *TestEncode, mnemonic: Instruction.Mnemonic, args: struct {
-        op1: Instruction.Operand = .none,
-        op2: Instruction.Operand = .none,
-        op3: Instruction.Operand = .none,
-        op4: Instruction.Operand = .none,
-    }) !void {
+    fn encode(enc: *TestEncode, mnemonic: Instruction.Mnemonic, args: Instruction.Init) !void {
         var stream = std.io.fixedBufferStream(&enc.buffer);
         var count_writer = std.io.countingWriter(stream.writer());
         const inst = try Instruction.new(mnemonic, .{
+            .prefix = args.prefix,
             .op1 = args.op1,
             .op2 = args.op2,
             .op3 = args.op3,
@@ -1447,18 +1462,8 @@ test "lower NP encoding" {
     try expectEqualHexStrings("\x0f\x05", enc.code(), "syscall");
 }
 
-fn invalidInstruction(mnemonic: Instruction.Mnemonic, args: struct {
-    op1: Instruction.Operand = .none,
-    op2: Instruction.Operand = .none,
-    op3: Instruction.Operand = .none,
-    op4: Instruction.Operand = .none,
-}) !void {
-    const err = Instruction.new(mnemonic, .{
-        .op1 = args.op1,
-        .op2 = args.op2,
-        .op3 = args.op3,
-        .op4 = args.op4,
-    });
+fn invalidInstruction(mnemonic: Instruction.Mnemonic, args: Instruction.Init) !void {
+    const err = Instruction.new(mnemonic, args);
     try testing.expectError(error.InvalidInstruction, err);
 }
 
@@ -1479,18 +1484,8 @@ test "invalid instruction" {
     try invalidInstruction(.push, .{ .op1 = .{ .imm = Immediate.u(0x1000000000000000) } });
 }
 
-fn cannotEncode(mnemonic: Instruction.Mnemonic, args: struct {
-    op1: Instruction.Operand = .none,
-    op2: Instruction.Operand = .none,
-    op3: Instruction.Operand = .none,
-    op4: Instruction.Operand = .none,
-}) !void {
-    try testing.expectError(error.CannotEncode, Instruction.new(mnemonic, .{
-        .op1 = args.op1,
-        .op2 = args.op2,
-        .op3 = args.op3,
-        .op4 = args.op4,
-    }));
+fn cannotEncode(mnemonic: Instruction.Mnemonic, args: Instruction.Init) !void {
+    try testing.expectError(error.CannotEncode, Instruction.new(mnemonic, args));
 }
 
 test "cannot encode" {
src/arch/x86_64/Encoding.zig
@@ -24,12 +24,7 @@ opc: [7]u8,
 modrm_ext: u3,
 mode: Mode,
 
-pub fn findByMnemonic(mnemonic: Mnemonic, args: struct {
-    op1: Instruction.Operand,
-    op2: Instruction.Operand,
-    op3: Instruction.Operand,
-    op4: Instruction.Operand,
-}) !?Encoding {
+pub fn findByMnemonic(mnemonic: Mnemonic, args: Instruction.Init) !?Encoding {
     const input_op1 = Op.fromOperand(args.op1);
     const input_op2 = Op.fromOperand(args.op2);
     const input_op3 = Op.fromOperand(args.op3);
@@ -109,17 +104,13 @@ pub fn findByMnemonic(mnemonic: Mnemonic, args: struct {
     if (count == 1) return candidates[0];
 
     const EncodingLength = struct {
-        fn estimate(encoding: Encoding, params: struct {
-            op1: Instruction.Operand,
-            op2: Instruction.Operand,
-            op3: Instruction.Operand,
-            op4: Instruction.Operand,
-        }) usize {
+        fn estimate(encoding: Encoding, params: Instruction.Init) usize {
             var inst = Instruction{
                 .op1 = params.op1,
                 .op2 = params.op2,
                 .op3 = params.op3,
                 .op4 = params.op4,
+                .prefix = params.prefix,
                 .encoding = encoding,
             };
             var cwriter = std.io.countingWriter(std.io.null_writer);
@@ -140,12 +131,7 @@ pub fn findByMnemonic(mnemonic: Mnemonic, args: struct {
             else => {},
         }
 
-        const len = EncodingLength.estimate(candidate, .{
-            .op1 = args.op1,
-            .op2 = args.op2,
-            .op3 = args.op3,
-            .op4 = args.op4,
-        });
+        const len = EncodingLength.estimate(candidate, args);
         const current = shortest_encoding orelse {
             shortest_encoding = .{ .index = i, .len = len };
             continue;
@@ -228,7 +214,11 @@ pub fn modRmExt(encoding: Encoding) u3 {
 }
 
 pub fn operandBitSize(encoding: Encoding) u64 {
-    if (encoding.mode == .long) return 64;
+    switch (encoding.mode) {
+        .short => return 16,
+        .long => return 64,
+        else => {},
+    }
     const bit_size: u64 = switch (encoding.op_en) {
         .np => switch (encoding.op1) {
             .o16 => 16,
@@ -317,10 +307,13 @@ pub const Mnemonic = enum {
     // zig fmt: off
     // General-purpose
     adc, add, @"and",
-    call, cbw, cwde, cdqe, cwd, cdq, cqo, cmp,
+    call, cbw, cdq, cdqe,
     cmova, cmovae, cmovb, cmovbe, cmovc, cmove, cmovg, cmovge, cmovl, cmovle, cmovna,
     cmovnae, cmovnb, cmovnbe, cmovnc, cmovne, cmovng, cmovnge, cmovnl, cmovnle, cmovno,
     cmovnp, cmovns, cmovnz, cmovo, cmovp, cmovpe, cmovpo, cmovs, cmovz,
+    cmp,
+    cmps, cmpsb, cmpsd, cmpsq, cmpsw,
+    cqo, cwd, cwde,
     div,
     fisttp, fld,
     idiv, imul, int3,
@@ -328,15 +321,21 @@ pub const Mnemonic = enum {
     jnc, jne, jng, jnge, jnl, jnle, jno, jnp, jns, jnz, jo, jp, jpe, jpo, js, jz,
     jmp, 
     lea,
-    mov, movsx, movsxd, movzx, mul,
+    lods, lodsb, lodsd, lodsq, lodsw,
+    mov,
+    movs, movsb, movsd, movsq, movsw,
+    movsx, movsxd, movzx, mul,
     nop,
     @"or",
     pop, push,
     ret,
-    sal, sar, sbb, shl, shr, sub, syscall,
+    sal, sar, sbb,
+    scas, scasb, scasd, scasq, scasw,
+    shl, shr, sub, syscall,
     seta, setae, setb, setbe, setc, sete, setg, setge, setl, setle, setna, setnae,
     setnb, setnbe, setnc, setne, setng, setnge, setnl, setnle, setno, setnp, setns,
     setnz, seto, setp, setpe, setpo, sets, setz,
+    stos, stosb, stosd, stosq, stosw,
     @"test",
     ud2,
     xor,
@@ -351,10 +350,10 @@ pub const Mnemonic = enum {
     ucomiss,
     // SSE2
     addsd,
-    cmpsd,
+    //cmpsd,
     divsd,
     maxsd, minsd,
-    movq, movsd,
+    movq, //movsd,
     mulsd,
     subsd,
     ucomisd,
@@ -591,6 +590,7 @@ pub const Op = enum {
 
 pub const Mode = enum {
     none,
+    short,
     fpu,
     rex,
     long,
src/arch/x86_64/encodings.zig
@@ -207,6 +207,15 @@ pub const table = &[_]Entry{
     .{ .cmp, .rm, .r32,  .rm32,   .none, .none, &.{ 0x3b }, 0, .none  },
     .{ .cmp, .rm, .r64,  .rm64,   .none, .none, &.{ 0x3b }, 0, .long  },
 
+    .{ .cmps,  .np, .m8,   .m8,   .none, .none, &.{ 0xa6 }, 0, .none  },
+    .{ .cmps,  .np, .m16,  .m16,  .none, .none, &.{ 0xa7 }, 0, .none  },
+    .{ .cmps,  .np, .m32,  .m32,  .none, .none, &.{ 0xa7 }, 0, .none  },
+    .{ .cmps,  .np, .m64,  .m64,  .none, .none, &.{ 0xa7 }, 0, .long  },
+    .{ .cmpsb, .np, .none, .none, .none, .none, &.{ 0xa6 }, 0, .none  },
+    .{ .cmpsw, .np, .none, .none, .none, .none, &.{ 0xa7 }, 0, .short },
+    .{ .cmpsd, .np, .none, .none, .none, .none, &.{ 0xa7 }, 0, .none  },
+    .{ .cmpsq, .np, .none, .none, .none, .none, &.{ 0xa7 }, 0, .long  },
+
     .{ .div, .m, .rm8,  .none, .none, .none, &.{ 0xf6 }, 6, .none  },
     .{ .div, .m, .rm8,  .none, .none, .none, &.{ 0xf6 }, 6, .rex   },
     .{ .div, .m, .rm16, .none, .none, .none, &.{ 0xf7 }, 6, .none  },
@@ -283,6 +292,15 @@ pub const table = &[_]Entry{
     .{ .lea, .rm, .r32, .m, .none, .none, &.{ 0x8d }, 0, .none  },
     .{ .lea, .rm, .r64, .m, .none, .none, &.{ 0x8d }, 0, .long  },
 
+    .{ .lods,  .np, .m8,   .none, .none, .none, &.{ 0xac }, 0, .none  },
+    .{ .lods,  .np, .m16,  .none, .none, .none, &.{ 0xad }, 0, .none  },
+    .{ .lods,  .np, .m32,  .none, .none, .none, &.{ 0xad }, 0, .none  },
+    .{ .lods,  .np, .m64,  .none, .none, .none, &.{ 0xad }, 0, .long  },
+    .{ .lodsb, .np, .none, .none, .none, .none, &.{ 0xac }, 0, .none  },
+    .{ .lodsw, .np, .none, .none, .none, .none, &.{ 0xad }, 0, .short },
+    .{ .lodsd, .np, .none, .none, .none, .none, &.{ 0xad }, 0, .none  },
+    .{ .lodsq, .np, .none, .none, .none, .none, &.{ 0xad }, 0, .long  },
+
     .{ .mov, .mr, .rm8,   .r8,     .none, .none, &.{ 0x88 }, 0, .none  },
     .{ .mov, .mr, .rm8,   .r8,     .none, .none, &.{ 0x88 }, 0, .rex   },
     .{ .mov, .mr, .rm16,  .r16,    .none, .none, &.{ 0x89 }, 0, .none  },
@@ -316,6 +334,15 @@ pub const table = &[_]Entry{
     .{ .mov, .mi, .rm32,  .imm32,  .none, .none, &.{ 0xc7 }, 0, .none  },
     .{ .mov, .mi, .rm64,  .imm32s, .none, .none, &.{ 0xc7 }, 0, .long  },
 
+    .{ .movs,  .np, .m8,   .m8,   .none, .none, &.{ 0xa4 }, 0, .none  },
+    .{ .movs,  .np, .m16,  .m16,  .none, .none, &.{ 0xa5 }, 0, .none  },
+    .{ .movs,  .np, .m32,  .m32,  .none, .none, &.{ 0xa5 }, 0, .none  },
+    .{ .movs,  .np, .m64,  .m64,  .none, .none, &.{ 0xa5 }, 0, .long  },
+    .{ .movsb, .np, .none, .none, .none, .none, &.{ 0xa4 }, 0, .none  },
+    .{ .movsw, .np, .none, .none, .none, .none, &.{ 0xa5 }, 0, .short },
+    .{ .movsd, .np, .none, .none, .none, .none, &.{ 0xa5 }, 0, .none  },
+    .{ .movsq, .np, .none, .none, .none, .none, &.{ 0xa5 }, 0, .long  },
+
     .{ .movsx, .rm, .r16, .rm8,  .none, .none, &.{ 0x0f, 0xbe }, 0, .none  },
     .{ .movsx, .rm, .r16, .rm8,  .none, .none, &.{ 0x0f, 0xbe }, 0, .rex   },
     .{ .movsx, .rm, .r32, .rm8,  .none, .none, &.{ 0x0f, 0xbe }, 0, .none  },
@@ -435,6 +462,15 @@ pub const table = &[_]Entry{
     .{ .sbb, .rm, .r32,  .rm32,   .none, .none, &.{ 0x1b }, 0, .none  },
     .{ .sbb, .rm, .r64,  .rm64,   .none, .none, &.{ 0x1b }, 0, .long  },
 
+    .{ .scas,  .np, .m8,   .none, .none, .none, &.{ 0xae }, 0, .none  },
+    .{ .scas,  .np, .m16,  .none, .none, .none, &.{ 0xaf }, 0, .none  },
+    .{ .scas,  .np, .m32,  .none, .none, .none, &.{ 0xaf }, 0, .none  },
+    .{ .scas,  .np, .m64,  .none, .none, .none, &.{ 0xaf }, 0, .long  },
+    .{ .scasb, .np, .none, .none, .none, .none, &.{ 0xae }, 0, .none  },
+    .{ .scasw, .np, .none, .none, .none, .none, &.{ 0xaf }, 0, .short },
+    .{ .scasd, .np, .none, .none, .none, .none, &.{ 0xaf }, 0, .none  },
+    .{ .scasq, .np, .none, .none, .none, .none, &.{ 0xaf }, 0, .long  },
+
     .{ .seta,   .m, .rm8, .none, .none, .none, &.{ 0x0f, 0x97 }, 0, .none },
     .{ .seta,   .m, .rm8, .none, .none, .none, &.{ 0x0f, 0x97 }, 0, .rex  },
     .{ .setae,  .m, .rm8, .none, .none, .none, &.{ 0x0f, 0x93 }, 0, .none },
@@ -528,6 +564,15 @@ pub const table = &[_]Entry{
     .{ .shr, .mi, .rm32, .imm8,  .none, .none, &.{ 0xc1 }, 5, .none  },
     .{ .shr, .mi, .rm64, .imm8,  .none, .none, &.{ 0xc1 }, 5, .long  },
 
+    .{ .stos,  .np, .m8,   .none, .none, .none, &.{ 0xaa }, 0, .none  },
+    .{ .stos,  .np, .m16,  .none, .none, .none, &.{ 0xab }, 0, .none  },
+    .{ .stos,  .np, .m32,  .none, .none, .none, &.{ 0xab }, 0, .none  },
+    .{ .stos,  .np, .m64,  .none, .none, .none, &.{ 0xab }, 0, .long  },
+    .{ .stosb, .np, .none, .none, .none, .none, &.{ 0xaa }, 0, .none  },
+    .{ .stosw, .np, .none, .none, .none, .none, &.{ 0xab }, 0, .short },
+    .{ .stosd, .np, .none, .none, .none, .none, &.{ 0xab }, 0, .none  },
+    .{ .stosq, .np, .none, .none, .none, .none, &.{ 0xab }, 0, .long  },
+
     .{ .sub, .zi, .al,   .imm8,   .none, .none, &.{ 0x2c }, 0, .none  },
     .{ .sub, .zi, .ax,   .imm16,  .none, .none, &.{ 0x2d }, 0, .none  },
     .{ .sub, .zi, .eax,  .imm32,  .none, .none, &.{ 0x2d }, 0, .none  },
src/arch/x86_64/Mir.zig
@@ -150,6 +150,17 @@ pub const Inst = struct {
         /// Unordered compare scalar double-precision floating-point values
         ucomisd,
 
+        /// Compare string operands
+        cmps,
+        /// Load string
+        lods,
+        /// Move data from string to string
+        movs,
+        /// Scan string
+        scas,
+        /// Store string
+        stos,
+
         /// Conditional move
         cmovcc,
         /// Conditional jump
@@ -268,6 +279,30 @@ pub const Inst = struct {
         /// Memory (RIP), register operands.
         /// Uses `rx` payload with extra data of type `MemoryRip`.
         mr_rip,
+        /// Single memory (SIB) operand with lock prefix.
+        /// Uses `payload` with extra data of type `MemorySib`.
+        lock_m_sib,
+        /// Single memory (RIP) operand with lock prefix.
+        /// Uses `payload` with extra data of type `MemoryRip`.
+        lock_m_rip,
+        /// Memory (SIB), immediate (unsigned) operands with lock prefix.
+        /// Uses `xi` payload with extra data of type `MemorySib`.
+        lock_mi_u_sib,
+        /// Memory (RIP), immediate (unsigned) operands with lock prefix.
+        /// Uses `xi` payload with extra data of type `MemoryRip`.
+        lock_mi_u_rip,
+        /// Memory (SIB), immediate (sign-extend) operands with lock prefix.
+        /// Uses `xi` payload with extra data of type `MemorySib`.
+        lock_mi_s_sib,
+        /// Memory (RIP), immediate (sign-extend) operands with lock prefix.
+        /// Uses `xi` payload with extra data of type `MemoryRip`.
+        lock_mi_s_rip,
+        /// Memory (SIB), register operands with lock prefix.
+        /// Uses `rx` payload with extra data of type `MemorySib`.
+        lock_mr_sib,
+        /// Memory (RIP), register operands with lock prefix.
+        /// Uses `rx` payload with extra data of type `MemoryRip`.
+        lock_mr_rip,
         /// Rax, Memory moffs.
         /// Uses `payload` with extra data of type `MemoryMoffs`.
         rax_moffs,
@@ -280,6 +315,9 @@ pub const Inst = struct {
         /// References another Mir instruction directly with condition code (CC).
         /// Uses `inst_cc` payload.
         inst_cc,
+        /// String repeat prefix and width.
+        /// Uses `string` payload.
+        string,
         /// Uses `reloc` payload.
         reloc,
         /// Linker relocation - GOT indirection.
@@ -353,6 +391,11 @@ pub const Inst = struct {
             payload: u32,
             imm: u32,
         },
+        /// String instruction prefix and width.
+        string: struct {
+            repeat: bits.StringRepeat,
+            width: bits.StringWidth,
+        },
         /// Relocation for the linker where:
         /// * `atom_index` is the index of the source
         /// * `sym_index` is the index of the target