Commit c58b5732f3

Jacob Young <jacobly0@users.noreply.github.com>
2023-03-21 04:02:31
x86_64: implement @byteSwap and @bitReverse
1 parent f316cb2
src/arch/x86_64/bits.zig
@@ -472,7 +472,7 @@ pub const Memory = union(enum) {
     }
 
     pub fn sib(ptr_size: PtrSize, args: struct {
-        disp: i32,
+        disp: i32 = 0,
         base: ?Register = null,
         scale_index: ?ScaleIndex = null,
     }) Memory {
src/arch/x86_64/CodeGen.zig
@@ -2595,10 +2595,7 @@ fn airPtrElemVal(self: *Self, inst: Air.Inst.Index) !void {
             try self.asmRegisterMemory(
                 .mov,
                 registerAlias(dst_mcv.register, elem_abi_size),
-                Memory.sib(Memory.PtrSize.fromSize(elem_abi_size), .{
-                    .base = dst_mcv.register,
-                    .disp = 0,
-                }),
+                Memory.sib(Memory.PtrSize.fromSize(elem_abi_size), .{ .base = dst_mcv.register }),
             );
             break :result .{ .register = registerAlias(dst_mcv.register, @intCast(u32, elem_abi_size)) };
         }
@@ -2956,21 +2953,197 @@ fn airPopcount(self: *Self, inst: Air.Inst.Index) !void {
     return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
 }
 
+fn byteSwap(self: *Self, inst: Air.Inst.Index, src_ty: Type, src_mcv: MCValue, mem_ok: bool) !MCValue {
+    const ty_op = self.air.instructions.items(.data)[inst].ty_op;
+
+    const src_bits = self.regBitSize(src_ty);
+    const src_lock = switch (src_mcv) {
+        .register => |reg| self.register_manager.lockRegAssumeUnused(reg),
+        else => null,
+    };
+    defer if (src_lock) |lock| self.register_manager.unlockReg(lock);
+
+    switch (src_bits) {
+        else => unreachable,
+        8 => return if ((mem_ok or src_mcv.isRegister()) and
+            self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
+            src_mcv
+        else
+            try self.copyToRegisterWithInstTracking(inst, src_ty, src_mcv),
+        16 => if ((mem_ok or src_mcv.isRegister()) and
+            self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
+        {
+            try self.genBinOpMir(.rol, src_ty, src_mcv, .{ .immediate = 8 });
+            return src_mcv;
+        },
+        32, 64 => if (src_mcv.isRegister() and self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) {
+            try self.genUnOpMir(.bswap, src_ty, src_mcv);
+            return src_mcv;
+        },
+    }
+
+    if (src_mcv.isRegister()) {
+        const dst_mcv: MCValue = if (mem_ok)
+            try self.allocRegOrMem(inst, true)
+        else
+            .{ .register = try self.register_manager.allocReg(inst, gp) };
+        if (dst_mcv.isRegister()) {
+            const dst_lock = self.register_manager.lockRegAssumeUnused(dst_mcv.register);
+            defer self.register_manager.unlockReg(dst_lock);
+
+            try self.genSetReg(src_ty, dst_mcv.register, src_mcv);
+            switch (src_bits) {
+                else => unreachable,
+                16 => try self.genBinOpMir(.rol, src_ty, dst_mcv, .{ .immediate = 8 }),
+                32, 64 => try self.genUnOpMir(.bswap, src_ty, dst_mcv),
+            }
+        } else try self.genBinOpMir(.movbe, src_ty, dst_mcv, src_mcv);
+        return dst_mcv;
+    }
+
+    const dst_reg = try self.register_manager.allocReg(inst, gp);
+    const dst_mcv = MCValue{ .register = dst_reg };
+    const dst_lock = self.register_manager.lockRegAssumeUnused(dst_reg);
+    defer self.register_manager.unlockReg(dst_lock);
+
+    try self.genBinOpMir(.movbe, src_ty, dst_mcv, src_mcv);
+    return dst_mcv;
+}
+
 fn airByteSwap(self: *Self, inst: Air.Inst.Index) !void {
     const ty_op = self.air.instructions.items(.data)[inst].ty_op;
-    const result: MCValue = if (self.liveness.isUnused(inst))
-        .dead
-    else
-        return self.fail("TODO implement airByteSwap for {}", .{self.target.cpu.arch});
+    const result = result: {
+        if (self.liveness.isUnused(inst)) break :result .dead;
+
+        const src_ty = self.air.typeOf(ty_op.operand);
+        const src_mcv = try self.resolveInst(ty_op.operand);
+
+        const dst_mcv = try self.byteSwap(inst, src_ty, src_mcv, true);
+        switch (self.regExtraBits(src_ty)) {
+            0 => {},
+            else => |extra| try self.genBinOpMir(
+                if (src_ty.isSignedInt()) .sar else .shr,
+                src_ty,
+                dst_mcv,
+                .{ .immediate = extra },
+            ),
+        }
+        break :result dst_mcv;
+    };
+
     return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
 }
 
 fn airBitReverse(self: *Self, inst: Air.Inst.Index) !void {
     const ty_op = self.air.instructions.items(.data)[inst].ty_op;
-    const result: MCValue = if (self.liveness.isUnused(inst))
-        .dead
-    else
-        return self.fail("TODO implement airBitReverse for {}", .{self.target.cpu.arch});
+    const result = result: {
+        if (self.liveness.isUnused(inst)) break :result .dead;
+
+        const src_ty = self.air.typeOf(ty_op.operand);
+        const src_abi_size = @intCast(u32, src_ty.abiSize(self.target.*));
+        const src_mcv = try self.resolveInst(ty_op.operand);
+
+        const dst_mcv = try self.byteSwap(inst, src_ty, src_mcv, false);
+        const dst_reg = dst_mcv.register;
+        const dst_lock = self.register_manager.lockRegAssumeUnused(dst_reg);
+        defer self.register_manager.unlockReg(dst_lock);
+
+        const tmp_reg = try self.register_manager.allocReg(null, gp);
+        const tmp_lock = self.register_manager.lockReg(tmp_reg);
+        defer if (tmp_lock) |lock| self.register_manager.unlockReg(lock);
+
+        {
+            const dst = registerAlias(dst_reg, src_abi_size);
+            const tmp = registerAlias(tmp_reg, src_abi_size);
+            const imm = if (src_abi_size > 4)
+                try self.register_manager.allocReg(null, gp)
+            else
+                undefined;
+
+            const mask = @as(u64, math.maxInt(u64)) >> @intCast(u6, 64 - src_abi_size * 8);
+            const imm_0000_1111 = Immediate.u(mask / 0b0001_0001);
+            const imm_00_11 = Immediate.u(mask / 0b01_01);
+            const imm_0_1 = Immediate.u(mask / 0b1_1);
+
+            // dst = temp1 = bswap(operand)
+            try self.asmRegisterRegister(.mov, tmp, dst);
+            // tmp = temp1
+            try self.asmRegisterImmediate(.shr, dst, Immediate.u(4));
+            // dst = temp1 >> 4
+            if (src_abi_size > 4) {
+                try self.asmRegisterImmediate(.mov, imm, imm_0000_1111);
+                try self.asmRegisterRegister(.@"and", tmp, imm);
+                try self.asmRegisterRegister(.@"and", dst, imm);
+            } else {
+                try self.asmRegisterImmediate(.@"and", tmp, imm_0000_1111);
+                try self.asmRegisterImmediate(.@"and", dst, imm_0000_1111);
+            }
+            // tmp = temp1 & 0x0F...0F
+            // dst = (temp1 >> 4) & 0x0F...0F
+            try self.asmRegisterImmediate(.shl, tmp, Immediate.u(4));
+            // tmp = (temp1 & 0x0F...0F) << 4
+            try self.asmRegisterRegister(.@"or", dst, tmp);
+            // dst = temp2 = ((temp1 >> 4) & 0x0F...0F) | ((temp1 & 0x0F...0F) << 4)
+            try self.asmRegisterRegister(.mov, tmp, dst);
+            // tmp = temp2
+            try self.asmRegisterImmediate(.shr, dst, Immediate.u(2));
+            // dst = temp2 >> 2
+            if (src_abi_size > 4) {
+                try self.asmRegisterImmediate(.mov, imm, imm_00_11);
+                try self.asmRegisterRegister(.@"and", tmp, imm);
+                try self.asmRegisterRegister(.@"and", dst, imm);
+            } else {
+                try self.asmRegisterImmediate(.@"and", tmp, imm_00_11);
+                try self.asmRegisterImmediate(.@"and", dst, imm_00_11);
+            }
+            // tmp = temp2 & 0x33...33
+            // dst = (temp2 >> 2) & 0x33...33
+            try self.asmRegisterMemory(
+                .lea,
+                if (src_abi_size > 4) tmp.to64() else tmp.to32(),
+                Memory.sib(.qword, .{
+                    .base = dst.to64(),
+                    .scale_index = .{ .index = tmp.to64(), .scale = 1 << 2 },
+                }),
+            );
+            // tmp = temp3 = ((temp2 >> 2) & 0x33...33) + ((temp2 & 0x33...33) << 2)
+            try self.asmRegisterRegister(.mov, dst, tmp);
+            // dst = temp3
+            try self.asmRegisterImmediate(.shr, tmp, Immediate.u(1));
+            // tmp = temp3 >> 1
+            if (src_abi_size > 4) {
+                try self.asmRegisterImmediate(.mov, imm, imm_0_1);
+                try self.asmRegisterRegister(.@"and", dst, imm);
+                try self.asmRegisterRegister(.@"and", tmp, imm);
+            } else {
+                try self.asmRegisterImmediate(.@"and", dst, imm_0_1);
+                try self.asmRegisterImmediate(.@"and", tmp, imm_0_1);
+            }
+            // dst = temp3 & 0x55...55
+            // tmp = (temp3 >> 1) & 0x55...55
+            try self.asmRegisterMemory(
+                .lea,
+                if (src_abi_size > 4) dst.to64() else dst.to32(),
+                Memory.sib(.qword, .{
+                    .base = tmp.to64(),
+                    .scale_index = .{ .index = dst.to64(), .scale = 1 << 1 },
+                }),
+            );
+            // dst = ((temp3 >> 1) & 0x55...55) + ((temp3 & 0x55...55) << 1)
+        }
+
+        switch (self.regExtraBits(src_ty)) {
+            0 => {},
+            else => |extra| try self.genBinOpMir(
+                if (src_ty.isSignedInt()) .sar else .shr,
+                src_ty,
+                dst_mcv,
+                .{ .immediate = extra },
+            ),
+        }
+        break :result dst_mcv;
+    };
+
     return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
 }
 
@@ -3052,7 +3225,7 @@ fn load(self: *Self, dst_mcv: MCValue, ptr: MCValue, ptr_ty: Type) InnerError!vo
                     try self.asmRegisterMemory(
                         .mov,
                         registerAlias(dst_reg, abi_size),
-                        Memory.sib(Memory.PtrSize.fromSize(abi_size), .{ .base = reg, .disp = 0 }),
+                        Memory.sib(Memory.PtrSize.fromSize(abi_size), .{ .base = reg }),
                     );
                 },
                 .stack_offset => |off| {
@@ -3167,7 +3340,7 @@ fn store(self: *Self, ptr: MCValue, value: MCValue, ptr_ty: Type, value_ty: Type
                 .eflags => |cc| {
                     try self.asmSetccMemory(Memory.sib(
                         Memory.PtrSize.fromSize(abi_size),
-                        .{ .base = reg.to64(), .disp = 0 },
+                        .{ .base = reg.to64() },
                     ), cc);
                 },
                 .undef => {
@@ -3187,10 +3360,10 @@ fn store(self: *Self, ptr: MCValue, value: MCValue, ptr_ty: Type, value_ty: Type
                                 Immediate.s(@intCast(i32, @bitCast(i64, imm)))
                             else
                                 Immediate.u(@truncate(u32, imm));
-                            try self.asmMemoryImmediate(.mov, Memory.sib(Memory.PtrSize.fromSize(abi_size), .{
-                                .base = reg.to64(),
-                                .disp = 0,
-                            }), immediate);
+                            try self.asmMemoryImmediate(.mov, Memory.sib(
+                                Memory.PtrSize.fromSize(abi_size),
+                                .{ .base = reg.to64() },
+                            ), immediate);
                         },
                         8 => {
                             // TODO: optimization: if the imm is only using the lower
@@ -3262,10 +3435,11 @@ fn store(self: *Self, ptr: MCValue, value: MCValue, ptr_ty: Type, value_ty: Type
             try self.loadMemPtrIntoRegister(addr_reg, ptr_ty, ptr);
 
             // To get the actual address of the value we want to modify we have to go through the GOT
-            try self.asmRegisterMemory(.mov, addr_reg.to64(), Memory.sib(.qword, .{
-                .base = addr_reg.to64(),
-                .disp = 0,
-            }));
+            try self.asmRegisterMemory(
+                .mov,
+                addr_reg.to64(),
+                Memory.sib(.qword, .{ .base = addr_reg.to64() }),
+            );
 
             const new_ptr = MCValue{ .register = addr_reg.to64() };
 
@@ -3287,10 +3461,11 @@ fn store(self: *Self, ptr: MCValue, value: MCValue, ptr_ty: Type, value_ty: Type
                             return self.fail("TODO imm64 would get incorrectly sign extended", .{});
                         }
                     }
-                    try self.asmMemoryImmediate(.mov, Memory.sib(Memory.PtrSize.fromSize(abi_size), .{
-                        .base = addr_reg.to64(),
-                        .disp = 0,
-                    }), Immediate.u(@intCast(u32, imm)));
+                    try self.asmMemoryImmediate(
+                        .mov,
+                        Memory.sib(Memory.PtrSize.fromSize(abi_size), .{ .base = addr_reg.to64() }),
+                        Immediate.u(@intCast(u32, imm)),
+                    );
                 },
                 .register => {
                     return self.store(new_ptr, value, ptr_ty, value_ty);
@@ -3302,10 +3477,11 @@ fn store(self: *Self, ptr: MCValue, value: MCValue, ptr_ty: Type, value_ty: Type
                         defer self.register_manager.unlockReg(tmp_reg_lock);
 
                         try self.loadMemPtrIntoRegister(tmp_reg, value_ty, value);
-                        try self.asmRegisterMemory(.mov, tmp_reg, Memory.sib(.qword, .{
-                            .base = tmp_reg,
-                            .disp = 0,
-                        }));
+                        try self.asmRegisterMemory(
+                            .mov,
+                            tmp_reg,
+                            Memory.sib(.qword, .{ .base = tmp_reg }),
+                        );
 
                         return self.store(new_ptr, .{ .register = tmp_reg }, ptr_ty, value_ty);
                     }
@@ -3604,15 +3780,16 @@ fn genUnOpMir(self: *Self, mir_tag: Mir.Inst.Tag, dst_ty: Type, dst_mcv: MCValue
             try self.loadMemPtrIntoRegister(addr_reg, Type.usize, dst_mcv);
 
             // To get the actual address of the value we want to modify we have to go through the GOT
-            try self.asmRegisterMemory(.mov, addr_reg, Memory.sib(.qword, .{
-                .base = addr_reg,
-                .disp = 0,
-            }));
+            try self.asmRegisterMemory(
+                .mov,
+                addr_reg,
+                Memory.sib(.qword, .{ .base = addr_reg }),
+            );
 
-            try self.asmMemory(mir_tag, Memory.sib(Memory.PtrSize.fromSize(abi_size), .{
-                .base = addr_reg,
-                .disp = 0,
-            }));
+            try self.asmMemory(
+                mir_tag,
+                Memory.sib(Memory.PtrSize.fromSize(abi_size), .{ .base = addr_reg }),
+            );
         },
     }
 }
@@ -4117,17 +4294,15 @@ fn genBinOp(
 
                             // To get the actual address of the value we want to modify
                             // we have to go through the GOT
-                            try self.asmRegisterMemory(.mov, addr_reg, Memory.sib(.qword, .{
-                                .base = addr_reg,
-                                .disp = 0,
-                            }));
+                            try self.asmRegisterMemory(
+                                .mov,
+                                addr_reg,
+                                Memory.sib(.qword, .{ .base = addr_reg }),
+                            );
 
                             try self.asmCmovccRegisterMemory(
                                 registerAlias(dst_reg, abi_size),
-                                Memory.sib(Memory.PtrSize.fromSize(abi_size), .{
-                                    .base = addr_reg,
-                                    .disp = 0,
-                                }),
+                                Memory.sib(Memory.PtrSize.fromSize(abi_size), .{ .base = addr_reg }),
                                 cc,
                             );
                         },
@@ -5175,10 +5350,11 @@ fn isNull(self: *Self, inst: Air.Inst.Index, opt_ty: Type, opt_mcv: MCValue) !MC
             try self.loadMemPtrIntoRegister(addr_reg, Type.usize, opt_mcv);
 
             // To get the actual address of the value we want to modify we have to go through the GOT
-            try self.asmRegisterMemory(.mov, addr_reg, Memory.sib(.qword, .{
-                .base = addr_reg,
-                .disp = 0,
-            }));
+            try self.asmRegisterMemory(
+                .mov,
+                addr_reg,
+                Memory.sib(.qword, .{ .base = addr_reg }),
+            );
 
             const some_abi_size = @intCast(u32, some_info.ty.abiSize(self.target.*));
             try self.asmMemoryImmediate(.cmp, Memory.sib(
@@ -6374,10 +6550,11 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
                             .f64 => .qword,
                             else => unreachable,
                         };
-                        return self.asmRegisterMemory(tag, reg.to128(), Memory.sib(ptr_size, .{
-                            .base = base_reg.to64(),
-                            .disp = 0,
-                        }));
+                        return self.asmRegisterMemory(
+                            tag,
+                            reg.to128(),
+                            Memory.sib(ptr_size, .{ .base = base_reg.to64() }),
+                        );
                     }
 
                     return self.fail("TODO genSetReg from memory for float with no intrinsics", .{});
@@ -6387,7 +6564,7 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
                     try self.asmRegisterMemory(
                         .mov,
                         registerAlias(reg, abi_size),
-                        Memory.sib(Memory.PtrSize.fromSize(abi_size), .{ .base = reg.to64(), .disp = 0 }),
+                        Memory.sib(Memory.PtrSize.fromSize(abi_size), .{ .base = reg.to64() }),
                     );
                 },
             }
@@ -6408,10 +6585,11 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
                         .f64 => .qword,
                         else => unreachable,
                     };
-                    return self.asmRegisterMemory(tag, reg.to128(), Memory.sib(ptr_size, .{
-                        .base = base_reg.to64(),
-                        .disp = 0,
-                    }));
+                    return self.asmRegisterMemory(
+                        tag,
+                        reg.to128(),
+                        Memory.sib(ptr_size, .{ .base = base_reg.to64() }),
+                    );
                 }
 
                 return self.fail("TODO genSetReg from memory for float with no intrinsics", .{});
@@ -6447,7 +6625,7 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
                         try self.asmRegisterMemory(
                             .mov,
                             registerAlias(reg, abi_size),
-                            Memory.sib(Memory.PtrSize.fromSize(abi_size), .{ .base = reg.to64(), .disp = 0 }),
+                            Memory.sib(Memory.PtrSize.fromSize(abi_size), .{ .base = reg.to64() }),
                         );
                     }
                 }
@@ -6638,12 +6816,9 @@ fn airCmpxchg(self: *Self, inst: Air.Inst.Index) !void {
     const val_abi_size = @intCast(u32, val_ty.abiSize(self.target.*));
     const ptr_size = Memory.PtrSize.fromSize(val_abi_size);
     const ptr_mem: Memory = switch (ptr_mcv) {
-        .register => |reg| Memory.sib(ptr_size, .{ .base = reg, .disp = 0 }),
+        .register => |reg| Memory.sib(ptr_size, .{ .base = reg }),
         .ptr_stack_offset => |off| Memory.sib(ptr_size, .{ .base = .rbp, .disp = -off }),
-        else => Memory.sib(ptr_size, .{
-            .base = try self.copyToTmpRegister(ptr_ty, ptr_mcv),
-            .disp = 0,
-        }),
+        else => Memory.sib(ptr_size, .{ .base = try self.copyToTmpRegister(ptr_ty, ptr_mcv) }),
     };
     const mem_lock = if (ptr_mem.base()) |reg| self.register_manager.lockReg(reg) else null;
     defer if (mem_lock) |lock| self.register_manager.unlockReg(lock);
@@ -6692,12 +6867,9 @@ fn atomicOp(
     const val_abi_size = @intCast(u32, val_ty.abiSize(self.target.*));
     const ptr_size = Memory.PtrSize.fromSize(val_abi_size);
     const ptr_mem: Memory = switch (ptr_mcv) {
-        .register => |reg| Memory.sib(ptr_size, .{ .base = reg, .disp = 0 }),
+        .register => |reg| Memory.sib(ptr_size, .{ .base = reg }),
         .ptr_stack_offset => |off| Memory.sib(ptr_size, .{ .base = .rbp, .disp = -off }),
-        else => Memory.sib(ptr_size, .{
-            .base = try self.copyToTmpRegister(ptr_ty, ptr_mcv),
-            .disp = 0,
-        }),
+        else => Memory.sib(ptr_size, .{ .base = try self.copyToTmpRegister(ptr_ty, ptr_mcv) }),
     };
     const mem_lock = if (ptr_mem.base()) |reg| self.register_manager.lockReg(reg) else null;
     defer if (mem_lock) |lock| self.register_manager.unlockReg(lock);
@@ -6861,10 +7033,7 @@ fn airMemcpy(self: *Self, inst: Air.Inst.Index) !void {
             .linker_load, .memory => {
                 const reg = try self.register_manager.allocReg(null, gp);
                 try self.loadMemPtrIntoRegister(reg, src_ty, src_ptr);
-                try self.asmRegisterMemory(.mov, reg, Memory.sib(.qword, .{
-                    .base = reg,
-                    .disp = 0,
-                }));
+                try self.asmRegisterMemory(.mov, reg, Memory.sib(.qword, .{ .base = reg }));
                 break :blk MCValue{ .register = reg };
             },
             else => break :blk src_ptr,
src/arch/x86_64/Emit.zig
@@ -75,6 +75,7 @@ pub fn lowerMir(emit: *Emit) InnerError!void {
             .@"and",
             .bsf,
             .bsr,
+            .bswap,
             .bt,
             .btc,
             .btr,
@@ -100,6 +101,7 @@ pub fn lowerMir(emit: *Emit) InnerError!void {
             .lzcnt,
             .mfence,
             .mov,
+            .movbe,
             .movzx,
             .mul,
             .neg,
@@ -109,7 +111,11 @@ pub fn lowerMir(emit: *Emit) InnerError!void {
             .pop,
             .popcnt,
             .push,
+            .rcl,
+            .rcr,
             .ret,
+            .rol,
+            .ror,
             .sal,
             .sar,
             .sbb,
src/arch/x86_64/encoder.zig
@@ -211,14 +211,12 @@ pub const Instruction = struct {
 
     fn encodeOpcode(inst: Instruction, encoder: anytype) !void {
         const opcode = inst.encoding.opcode();
+        const first = @boolToInt(inst.encoding.mandatoryPrefix() != null);
+        const final = opcode.len - 1;
+        for (opcode[first..final]) |byte| try encoder.opcode_1byte(byte);
         switch (inst.encoding.op_en) {
-            .o, .oi => try encoder.opcode_withReg(opcode[0], inst.op1.reg.lowEnc()),
-            else => {
-                const index: usize = if (inst.encoding.mandatoryPrefix()) |_| 1 else 0;
-                for (opcode[index..]) |byte| {
-                    try encoder.opcode_1byte(byte);
-                }
-            },
+            .o, .oi => try encoder.opcode_withReg(opcode[final], inst.op1.reg.lowEnc()),
+            else => try encoder.opcode_1byte(opcode[final]),
         }
     }
 
@@ -896,10 +894,10 @@ test "lower MI encoding" {
     try enc.encode(.mov, .{ .op1 = .{ .reg = .r12 }, .op2 = .{ .imm = Immediate.u(0x1000) } });
     try expectEqualHexStrings("\x49\xC7\xC4\x00\x10\x00\x00", enc.code(), "mov r12, 0x1000");
 
-    try enc.encode(.mov, .{ .op1 = .{ .mem = Memory.sib(.byte, .{
-        .base = .r12,
-        .disp = 0,
-    }) }, .op2 = .{ .imm = Immediate.u(0x10) } });
+    try enc.encode(.mov, .{
+        .op1 = .{ .mem = Memory.sib(.byte, .{ .base = .r12 }) },
+        .op2 = .{ .imm = Immediate.u(0x10) },
+    });
     try expectEqualHexStrings("\x41\xC6\x04\x24\x10", enc.code(), "mov BYTE PTR [r12], 0x10");
 
     try enc.encode(.mov, .{ .op1 = .{ .reg = .r12 }, .op2 = .{ .imm = Immediate.u(0x1000) } });
@@ -911,10 +909,10 @@ test "lower MI encoding" {
     try enc.encode(.mov, .{ .op1 = .{ .reg = .rax }, .op2 = .{ .imm = Immediate.u(0x10) } });
     try expectEqualHexStrings("\x48\xc7\xc0\x10\x00\x00\x00", enc.code(), "mov rax, 0x10");
 
-    try enc.encode(.mov, .{ .op1 = .{ .mem = Memory.sib(.dword, .{
-        .base = .r11,
-        .disp = 0,
-    }) }, .op2 = .{ .imm = Immediate.u(0x10) } });
+    try enc.encode(.mov, .{
+        .op1 = .{ .mem = Memory.sib(.dword, .{ .base = .r11 }) },
+        .op2 = .{ .imm = Immediate.u(0x10) },
+    });
     try expectEqualHexStrings("\x41\xc7\x03\x10\x00\x00\x00", enc.code(), "mov DWORD PTR [r11], 0x10");
 
     try enc.encode(.mov, .{
@@ -1030,10 +1028,10 @@ test "lower MI encoding" {
 test "lower RM encoding" {
     var enc = TestEncode{};
 
-    try enc.encode(.mov, .{ .op1 = .{ .reg = .rax }, .op2 = .{ .mem = Memory.sib(.qword, .{
-        .base = .r11,
-        .disp = 0,
-    }) } });
+    try enc.encode(.mov, .{
+        .op1 = .{ .reg = .rax },
+        .op2 = .{ .mem = Memory.sib(.qword, .{ .base = .r11 }) },
+    });
     try expectEqualHexStrings("\x49\x8b\x03", enc.code(), "mov rax, QWORD PTR [r11]");
 
     try enc.encode(.mov, .{ .op1 = .{ .reg = .rbx }, .op2 = .{ .mem = Memory.sib(.qword, .{
@@ -1116,20 +1114,16 @@ test "lower RM encoding" {
     try enc.encode(.movsx, .{ .op1 = .{ .reg = .ax }, .op2 = .{ .reg = .bl } });
     try expectEqualHexStrings("\x66\x0F\xBE\xC3", enc.code(), "movsx ax, bl");
 
-    try enc.encode(.movsx, .{ .op1 = .{ .reg = .eax }, .op2 = .{ .mem = Memory.sib(.word, .{
-        .base = .rbp,
-        .disp = 0,
-    }) } });
+    try enc.encode(.movsx, .{
+        .op1 = .{ .reg = .eax },
+        .op2 = .{ .mem = Memory.sib(.word, .{ .base = .rbp }) },
+    });
     try expectEqualHexStrings("\x0F\xBF\x45\x00", enc.code(), "movsx eax, WORD PTR [rbp]");
 
-    try enc.encode(.movsx, .{ .op1 = .{ .reg = .eax }, .op2 = .{ .mem = Memory.sib(.byte, .{
-        .base = null,
-        .scale_index = .{
-            .index = .rax,
-            .scale = 2,
-        },
-        .disp = 0,
-    }) } });
+    try enc.encode(.movsx, .{
+        .op1 = .{ .reg = .eax },
+        .op2 = .{ .mem = Memory.sib(.byte, .{ .scale_index = .{ .index = .rax, .scale = 2 } }) },
+    });
     try expectEqualHexStrings("\x0F\xBE\x04\x45\x00\x00\x00\x00", enc.code(), "movsx eax, BYTE PTR [rax * 2]");
 
     try enc.encode(.movsx, .{ .op1 = .{ .reg = .ax }, .op2 = .{ .mem = Memory.rip(.byte, 0x10) } });
@@ -1156,14 +1150,13 @@ test "lower RM encoding" {
     try enc.encode(.lea, .{ .op1 = .{ .reg = .ax }, .op2 = .{ .mem = Memory.rip(.byte, 0x10) } });
     try expectEqualHexStrings("\x66\x8D\x05\x10\x00\x00\x00", enc.code(), "lea ax, BYTE PTR [rip + 0x10]");
 
-    try enc.encode(.lea, .{ .op1 = .{ .reg = .rsi }, .op2 = .{ .mem = Memory.sib(.qword, .{
-        .base = .rbp,
-        .scale_index = .{
-            .scale = 1,
-            .index = .rcx,
-        },
-        .disp = 0,
-    }) } });
+    try enc.encode(.lea, .{
+        .op1 = .{ .reg = .rsi },
+        .op2 = .{ .mem = Memory.sib(.qword, .{
+            .base = .rbp,
+            .scale_index = .{ .scale = 1, .index = .rcx },
+        }) },
+    });
     try expectEqualHexStrings("\x48\x8D\x74\x0D\x00", enc.code(), "lea rsi, QWORD PTR [rbp + rcx*1 + 0]");
 
     try enc.encode(.add, .{ .op1 = .{ .reg = .r11 }, .op2 = .{ .mem = Memory.sib(.qword, .{
@@ -1319,51 +1312,35 @@ test "lower M encoding" {
     try enc.encode(.call, .{ .op1 = .{ .reg = .r12 } });
     try expectEqualHexStrings("\x41\xFF\xD4", enc.code(), "call r12");
 
-    try enc.encode(.call, .{ .op1 = .{ .mem = Memory.sib(.qword, .{
-        .base = .r12,
-        .disp = 0,
-    }) } });
+    try enc.encode(.call, .{ .op1 = .{ .mem = Memory.sib(.qword, .{ .base = .r12 }) } });
     try expectEqualHexStrings("\x41\xFF\x14\x24", enc.code(), "call QWORD PTR [r12]");
 
-    try enc.encode(.call, .{ .op1 = .{ .mem = Memory.sib(.qword, .{
-        .base = null,
-        .scale_index = .{
-            .index = .r11,
-            .scale = 2,
-        },
-        .disp = 0,
-    }) } });
+    try enc.encode(.call, .{
+        .op1 = .{ .mem = Memory.sib(.qword, .{
+            .base = null,
+            .scale_index = .{ .index = .r11, .scale = 2 },
+        }) },
+    });
     try expectEqualHexStrings("\x42\xFF\x14\x5D\x00\x00\x00\x00", enc.code(), "call QWORD PTR [r11 * 2]");
 
-    try enc.encode(.call, .{ .op1 = .{ .mem = Memory.sib(.qword, .{
-        .base = null,
-        .scale_index = .{
-            .index = .r12,
-            .scale = 2,
-        },
-        .disp = 0,
-    }) } });
+    try enc.encode(.call, .{
+        .op1 = .{ .mem = Memory.sib(.qword, .{
+            .base = null,
+            .scale_index = .{ .index = .r12, .scale = 2 },
+        }) },
+    });
     try expectEqualHexStrings("\x42\xFF\x14\x65\x00\x00\x00\x00", enc.code(), "call QWORD PTR [r12 * 2]");
 
-    try enc.encode(.call, .{ .op1 = .{ .mem = Memory.sib(.qword, .{
-        .base = .gs,
-        .disp = 0,
-    }) } });
+    try enc.encode(.call, .{ .op1 = .{ .mem = Memory.sib(.qword, .{ .base = .gs }) } });
     try expectEqualHexStrings("\x65\xFF\x14\x25\x00\x00\x00\x00", enc.code(), "call gs:0x0");
 
     try enc.encode(.call, .{ .op1 = .{ .imm = Immediate.s(0) } });
     try expectEqualHexStrings("\xE8\x00\x00\x00\x00", enc.code(), "call 0x0");
 
-    try enc.encode(.push, .{ .op1 = .{ .mem = Memory.sib(.qword, .{
-        .base = .rbp,
-        .disp = 0,
-    }) } });
+    try enc.encode(.push, .{ .op1 = .{ .mem = Memory.sib(.qword, .{ .base = .rbp }) } });
     try expectEqualHexStrings("\xFF\x75\x00", enc.code(), "push QWORD PTR [rbp]");
 
-    try enc.encode(.push, .{ .op1 = .{ .mem = Memory.sib(.word, .{
-        .base = .rbp,
-        .disp = 0,
-    }) } });
+    try enc.encode(.push, .{ .op1 = .{ .mem = Memory.sib(.word, .{ .base = .rbp }) } });
     try expectEqualHexStrings("\x66\xFF\x75\x00", enc.code(), "push WORD PTR [rbp]");
 
     try enc.encode(.pop, .{ .op1 = .{ .mem = Memory.rip(.qword, 0) } });
@@ -1491,7 +1468,7 @@ fn cannotEncode(mnemonic: Instruction.Mnemonic, args: Instruction.Init) !void {
 
 test "cannot encode" {
     try cannotEncode(.@"test", .{
-        .op1 = .{ .mem = Memory.sib(.byte, .{ .base = .r12, .disp = 0 }) },
+        .op1 = .{ .mem = Memory.sib(.byte, .{ .base = .r12 }) },
         .op2 = .{ .reg = .ah },
     });
     try cannotEncode(.@"test", .{
src/arch/x86_64/Encoding.zig
@@ -307,7 +307,7 @@ pub const Mnemonic = enum {
     // zig fmt: off
     // General-purpose
     adc, add, @"and",
-    bsf, bsr, bt, btc, btr, bts,
+    bsf, bsr, bswap, bt, btc, btr, bts,
     call, cbw, cdq, cdqe,
     cmova, cmovae, cmovb, cmovbe, cmovc, cmove, cmovg, cmovge, cmovl, cmovle, cmovna,
     cmovnae, cmovnb, cmovnbe, cmovnc, cmovne, cmovng, cmovnge, cmovnl, cmovnle, cmovno,
@@ -325,13 +325,13 @@ pub const Mnemonic = enum {
     lea, lfence,
     lods, lodsb, lodsd, lodsq, lodsw,
     lzcnt,
-    mfence, mov,
+    mfence, mov, movbe,
     movs, movsb, movsd, movsq, movsw,
     movsx, movsxd, movzx, mul,
     neg, nop, not,
     @"or",
     pop, popcnt, push,
-    ret,
+    rcl, rcr, ret, rol, ror,
     sal, sar, sbb,
     scas, scasb, scasd, scasq, scasw,
     shl, shr, sub, syscall,
src/arch/x86_64/encodings.zig
@@ -89,6 +89,9 @@ pub const table = &[_]Entry{
     .{ .bsr, .rm, .r32, .rm32, .none, .none, &.{ 0x0f, 0xbd }, 0, .none },
     .{ .bsr, .rm, .r64, .rm64, .none, .none, &.{ 0x0f, 0xbd }, 0, .long },
 
+    .{ .bswap, .o, .r32, .none, .none, .none, &.{ 0x0f, 0xc8 }, 0, .none },
+    .{ .bswap, .o, .r64, .none, .none, .none, &.{ 0x0f, 0xc8 }, 0, .long },
+
     .{ .bt, .mr, .rm16, .r16,  .none, .none, &.{ 0x0f, 0xa3 }, 0, .none },
     .{ .bt, .mr, .rm32, .r32,  .none, .none, &.{ 0x0f, 0xa3 }, 0, .none },
     .{ .bt, .mr, .rm64, .r64,  .none, .none, &.{ 0x0f, 0xa3 }, 0, .long },
@@ -387,6 +390,13 @@ pub const table = &[_]Entry{
     .{ .mov, .mi, .rm32,  .imm32,  .none, .none, &.{ 0xc7 }, 0, .none  },
     .{ .mov, .mi, .rm64,  .imm32s, .none, .none, &.{ 0xc7 }, 0, .long  },
 
+    .{ .movbe, .rm, .r16, .m16, .none, .none, &.{ 0x0f, 0x38, 0xf0 }, 0, .none },
+    .{ .movbe, .rm, .r32, .m32, .none, .none, &.{ 0x0f, 0x38, 0xf0 }, 0, .none },
+    .{ .movbe, .rm, .r64, .m64, .none, .none, &.{ 0x0f, 0x38, 0xf0 }, 0, .long },
+    .{ .movbe, .mr, .m16, .r16, .none, .none, &.{ 0x0f, 0x38, 0xf1 }, 0, .none },
+    .{ .movbe, .mr, .m32, .r32, .none, .none, &.{ 0x0f, 0x38, 0xf1 }, 0, .none },
+    .{ .movbe, .mr, .m64, .r64, .none, .none, &.{ 0x0f, 0x38, 0xf1 }, 0, .long },
+
     .{ .movs,  .np, .m8,   .m8,   .none, .none, &.{ 0xa4 }, 0, .none  },
     .{ .movs,  .np, .m16,  .m16,  .none, .none, &.{ 0xa5 }, 0, .none  },
     .{ .movs,  .np, .m32,  .m32,  .none, .none, &.{ 0xa5 }, 0, .none  },
@@ -476,6 +486,70 @@ pub const table = &[_]Entry{
 
     .{ .ret, .np, .none, .none, .none, .none, &.{ 0xc3 }, 0, .none },
 
+    .{ .rcl, .m1, .rm8,  .unity, .none, .none, &.{ 0xd0 }, 2, .none },
+    .{ .rcl, .m1, .rm8,  .unity, .none, .none, &.{ 0xd0 }, 2, .rex  },
+    .{ .rcl, .mc, .rm8,  .cl,    .none, .none, &.{ 0xd2 }, 2, .none },
+    .{ .rcl, .mc, .rm8,  .cl,    .none, .none, &.{ 0xd2 }, 2, .rex  },
+    .{ .rcl, .mi, .rm8,  .imm8,  .none, .none, &.{ 0xc0 }, 2, .none },
+    .{ .rcl, .mi, .rm8,  .imm8,  .none, .none, &.{ 0xc0 }, 2, .rex  },
+    .{ .rcl, .m1, .rm16, .unity, .none, .none, &.{ 0xd1 }, 2, .none },
+    .{ .rcl, .mc, .rm16, .cl,    .none, .none, &.{ 0xd3 }, 2, .none },
+    .{ .rcl, .mi, .rm16, .imm8,  .none, .none, &.{ 0xc1 }, 2, .none },
+    .{ .rcl, .m1, .rm32, .unity, .none, .none, &.{ 0xd1 }, 2, .none },
+    .{ .rcl, .m1, .rm64, .unity, .none, .none, &.{ 0xd1 }, 2, .long },
+    .{ .rcl, .mc, .rm32, .cl,    .none, .none, &.{ 0xd3 }, 2, .none },
+    .{ .rcl, .mc, .rm64, .cl,    .none, .none, &.{ 0xd3 }, 2, .long },
+    .{ .rcl, .mi, .rm32, .imm8,  .none, .none, &.{ 0xc1 }, 2, .none },
+    .{ .rcl, .mi, .rm64, .imm8,  .none, .none, &.{ 0xc1 }, 2, .long },
+
+    .{ .rcr, .m1, .rm8,  .unity, .none, .none, &.{ 0xd0 }, 3, .none },
+    .{ .rcr, .m1, .rm8,  .unity, .none, .none, &.{ 0xd0 }, 3, .rex  },
+    .{ .rcr, .mc, .rm8,  .cl,    .none, .none, &.{ 0xd2 }, 3, .none },
+    .{ .rcr, .mc, .rm8,  .cl,    .none, .none, &.{ 0xd2 }, 3, .rex  },
+    .{ .rcr, .mi, .rm8,  .imm8,  .none, .none, &.{ 0xc0 }, 3, .none },
+    .{ .rcr, .mi, .rm8,  .imm8,  .none, .none, &.{ 0xc0 }, 3, .rex  },
+    .{ .rcr, .m1, .rm16, .unity, .none, .none, &.{ 0xd1 }, 3, .none },
+    .{ .rcr, .mc, .rm16, .cl,    .none, .none, &.{ 0xd3 }, 3, .none },
+    .{ .rcr, .mi, .rm16, .imm8,  .none, .none, &.{ 0xc1 }, 3, .none },
+    .{ .rcr, .m1, .rm32, .unity, .none, .none, &.{ 0xd1 }, 3, .none },
+    .{ .rcr, .m1, .rm64, .unity, .none, .none, &.{ 0xd1 }, 3, .long },
+    .{ .rcr, .mc, .rm32, .cl,    .none, .none, &.{ 0xd3 }, 3, .none },
+    .{ .rcr, .mc, .rm64, .cl,    .none, .none, &.{ 0xd3 }, 3, .long },
+    .{ .rcr, .mi, .rm32, .imm8,  .none, .none, &.{ 0xc1 }, 3, .none },
+    .{ .rcr, .mi, .rm64, .imm8,  .none, .none, &.{ 0xc1 }, 3, .long },
+
+    .{ .rol, .m1, .rm8,  .unity, .none, .none, &.{ 0xd0 }, 0, .none },
+    .{ .rol, .m1, .rm8,  .unity, .none, .none, &.{ 0xd0 }, 0, .rex  },
+    .{ .rol, .mc, .rm8,  .cl,    .none, .none, &.{ 0xd2 }, 0, .none },
+    .{ .rol, .mc, .rm8,  .cl,    .none, .none, &.{ 0xd2 }, 0, .rex  },
+    .{ .rol, .mi, .rm8,  .imm8,  .none, .none, &.{ 0xc0 }, 0, .none },
+    .{ .rol, .mi, .rm8,  .imm8,  .none, .none, &.{ 0xc0 }, 0, .rex  },
+    .{ .rol, .m1, .rm16, .unity, .none, .none, &.{ 0xd1 }, 0, .none },
+    .{ .rol, .mc, .rm16, .cl,    .none, .none, &.{ 0xd3 }, 0, .none },
+    .{ .rol, .mi, .rm16, .imm8,  .none, .none, &.{ 0xc1 }, 0, .none },
+    .{ .rol, .m1, .rm32, .unity, .none, .none, &.{ 0xd1 }, 0, .none },
+    .{ .rol, .m1, .rm64, .unity, .none, .none, &.{ 0xd1 }, 0, .long },
+    .{ .rol, .mc, .rm32, .cl,    .none, .none, &.{ 0xd3 }, 0, .none },
+    .{ .rol, .mc, .rm64, .cl,    .none, .none, &.{ 0xd3 }, 0, .long },
+    .{ .rol, .mi, .rm32, .imm8,  .none, .none, &.{ 0xc1 }, 0, .none },
+    .{ .rol, .mi, .rm64, .imm8,  .none, .none, &.{ 0xc1 }, 0, .long },
+
+    .{ .ror, .m1, .rm8,  .unity, .none, .none, &.{ 0xd0 }, 1, .none },
+    .{ .ror, .m1, .rm8,  .unity, .none, .none, &.{ 0xd0 }, 1, .rex  },
+    .{ .ror, .mc, .rm8,  .cl,    .none, .none, &.{ 0xd2 }, 1, .none },
+    .{ .ror, .mc, .rm8,  .cl,    .none, .none, &.{ 0xd2 }, 1, .rex  },
+    .{ .ror, .mi, .rm8,  .imm8,  .none, .none, &.{ 0xc0 }, 1, .none },
+    .{ .ror, .mi, .rm8,  .imm8,  .none, .none, &.{ 0xc0 }, 1, .rex  },
+    .{ .ror, .m1, .rm16, .unity, .none, .none, &.{ 0xd1 }, 1, .none },
+    .{ .ror, .mc, .rm16, .cl,    .none, .none, &.{ 0xd3 }, 1, .none },
+    .{ .ror, .mi, .rm16, .imm8,  .none, .none, &.{ 0xc1 }, 1, .none },
+    .{ .ror, .m1, .rm32, .unity, .none, .none, &.{ 0xd1 }, 1, .none },
+    .{ .ror, .m1, .rm64, .unity, .none, .none, &.{ 0xd1 }, 1, .long },
+    .{ .ror, .mc, .rm32, .cl,    .none, .none, &.{ 0xd3 }, 1, .none },
+    .{ .ror, .mc, .rm64, .cl,    .none, .none, &.{ 0xd3 }, 1, .long },
+    .{ .ror, .mi, .rm32, .imm8,  .none, .none, &.{ 0xc1 }, 1, .none },
+    .{ .ror, .mi, .rm64, .imm8,  .none, .none, &.{ 0xc1 }, 1, .long },
+
     .{ .sal, .m1, .rm8,  .unity, .none, .none, &.{ 0xd0 }, 4, .none  },
     .{ .sal, .m1, .rm8,  .unity, .none, .none, &.{ 0xd0 }, 4, .rex   },
     .{ .sal, .m1, .rm16, .unity, .none, .none, &.{ 0xd1 }, 4, .none  },
src/arch/x86_64/Mir.zig
@@ -42,6 +42,8 @@ pub const Inst = struct {
         bsf,
         /// Bit scan reverse
         bsr,
+        /// Byte swap
+        bswap,
         /// Bit test
         bt,
         /// Bit test and complement
@@ -94,6 +96,8 @@ pub const Inst = struct {
         mfence,
         /// Move
         mov,
+        /// Move data after swapping bytes
+        movbe,
         /// Move with sign extension
         movsx,
         /// Move with zero extension
@@ -114,8 +118,16 @@ pub const Inst = struct {
         popcnt,
         /// Push
         push,
+        /// Rotate left through carry
+        rcl,
+        /// Rotate right through carry
+        rcr,
         /// Return
         ret,
+        /// Rotate left
+        rol,
+        /// Rotate right
+        ror,
         /// Arithmetic shift left
         sal,
         /// Arithmetic shift right