Commit 53ec2a955e

Jacob Young <jacobly0@users.noreply.github.com>
2023-03-18 13:02:17
x86_64: implement clz, ctz, and popCount
1 parent edd63f9
src/arch/x86_64/CodeGen.zig
@@ -2597,28 +2597,143 @@ fn airGetUnionTag(self: *Self, inst: Air.Inst.Index) !void {
 
 fn airClz(self: *Self, inst: Air.Inst.Index) !void {
     const ty_op = self.air.instructions.items(.data)[inst].ty_op;
-    const result: MCValue = if (self.liveness.isUnused(inst))
-        .dead
-    else
-        return self.fail("TODO implement airClz for {}", .{self.target.cpu.arch});
+    const result = result: {
+        if (self.liveness.isUnused(inst)) break :result .dead;
+
+        const dst_ty = self.air.typeOfIndex(inst);
+        const src_ty = self.air.typeOf(ty_op.operand);
+        const src_bits = src_ty.bitSize(self.target.*);
+
+        const src_mcv = try self.resolveInst(ty_op.operand);
+        const mat_src_mcv = switch (src_mcv) {
+            .immediate => MCValue{ .register = try self.copyToTmpRegister(src_ty, src_mcv) },
+            else => src_mcv,
+        };
+        const mat_src_lock = switch (mat_src_mcv) {
+            .register => |reg| self.register_manager.lockReg(reg),
+            else => null,
+        };
+        defer if (mat_src_lock) |lock| self.register_manager.unlockReg(lock);
+
+        const dst_reg = try self.register_manager.allocReg(inst, gp);
+        const dst_mcv = MCValue{ .register = dst_reg };
+        const dst_lock = self.register_manager.lockReg(dst_reg);
+        defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
+
+        if (Target.x86.featureSetHas(self.target.cpu.features, .lzcnt)) {
+            try self.genBinOpMir(.lzcnt, src_ty, dst_mcv, mat_src_mcv);
+            const src_abi_size = @intCast(u32, src_ty.abiSize(self.target.*));
+            const extra_bits = registerAlias(dst_reg, src_abi_size).bitSize() - src_bits;
+            if (extra_bits > 0) {
+                try self.genBinOpMir(.sub, dst_ty, dst_mcv, .{ .immediate = extra_bits });
+            }
+            break :result dst_mcv;
+        }
+
+        const width_reg = try self.copyToTmpRegister(dst_ty, .{ .immediate = src_bits });
+        const width_mcv = MCValue{ .register = width_reg };
+        try self.genBinOpMir(.bsr, src_ty, dst_mcv, mat_src_mcv);
+
+        const dst_abi_size = @intCast(u32, @max(dst_ty.abiSize(self.target.*), 2));
+        try self.asmCmovccRegisterRegister(
+            registerAlias(dst_reg, dst_abi_size),
+            registerAlias(width_reg, dst_abi_size),
+            .z,
+        );
+
+        try self.genBinOpMir(.sub, dst_ty, width_mcv, dst_mcv);
+        break :result width_mcv;
+    };
     return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
 }
 
 fn airCtz(self: *Self, inst: Air.Inst.Index) !void {
     const ty_op = self.air.instructions.items(.data)[inst].ty_op;
-    const result: MCValue = if (self.liveness.isUnused(inst))
-        .dead
-    else
-        return self.fail("TODO implement airCtz for {}", .{self.target.cpu.arch});
+    const result = result: {
+        if (self.liveness.isUnused(inst)) break :result .dead;
+
+        const dst_ty = self.air.typeOfIndex(inst);
+        const src_ty = self.air.typeOf(ty_op.operand);
+        const src_bits = src_ty.bitSize(self.target.*);
+
+        const src_mcv = try self.resolveInst(ty_op.operand);
+        const mat_src_mcv = switch (src_mcv) {
+            .immediate => MCValue{ .register = try self.copyToTmpRegister(src_ty, src_mcv) },
+            else => src_mcv,
+        };
+        const mat_src_lock = switch (mat_src_mcv) {
+            .register => |reg| self.register_manager.lockReg(reg),
+            else => null,
+        };
+        defer if (mat_src_lock) |lock| self.register_manager.unlockReg(lock);
+
+        const dst_reg = try self.register_manager.allocReg(inst, gp);
+        const dst_mcv = MCValue{ .register = dst_reg };
+        const dst_lock = self.register_manager.lockReg(dst_reg);
+        defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
+
+        if (Target.x86.featureSetHas(self.target.cpu.features, .bmi)) {
+            const src_abi_size = @intCast(u32, src_ty.abiSize(self.target.*));
+            const extra_bits = registerAlias(dst_reg, src_abi_size).bitSize() - src_bits;
+            const masked_mcv = if (extra_bits > 0) masked: {
+                const mask_mcv = MCValue{
+                    .immediate = ((@as(u64, 1) << @intCast(u6, extra_bits)) - 1) << @intCast(u6, src_bits),
+                };
+                const tmp_mcv = tmp: {
+                    if (src_mcv.isImmediate() or self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) {
+                        break :tmp src_mcv;
+                    }
+                    try self.genSetReg(src_ty, dst_reg, src_mcv);
+                    break :tmp dst_mcv;
+                };
+                try self.genBinOpMir(.@"or", src_ty, tmp_mcv, mask_mcv);
+                break :masked tmp_mcv;
+            } else mat_src_mcv;
+            try self.genBinOpMir(.tzcnt, src_ty, dst_mcv, masked_mcv);
+            break :result dst_mcv;
+        }
+
+        const width_reg = try self.copyToTmpRegister(dst_ty, .{ .immediate = src_bits });
+        try self.genBinOpMir(.bsf, src_ty, dst_mcv, mat_src_mcv);
+
+        const abi_size = @max(@intCast(u32, dst_ty.abiSize(self.target.*)), 2);
+        try self.asmCmovccRegisterRegister(
+            registerAlias(dst_reg, abi_size),
+            registerAlias(width_reg, abi_size),
+            .z,
+        );
+
+        break :result dst_mcv;
+    };
     return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
 }
 
 fn airPopcount(self: *Self, inst: Air.Inst.Index) !void {
     const ty_op = self.air.instructions.items(.data)[inst].ty_op;
-    const result: MCValue = if (self.liveness.isUnused(inst))
-        .dead
-    else
-        return self.fail("TODO implement airPopcount for {}", .{self.target.cpu.arch});
+    const result = result: {
+        if (self.liveness.isUnused(inst)) break :result .dead;
+
+        const op_ty = self.air.typeOf(ty_op.operand);
+
+        if (Target.x86.featureSetHas(self.target.cpu.features, .popcnt)) {
+            const op_mcv = try self.resolveInst(ty_op.operand);
+            const mat_op_mcv = switch (op_mcv) {
+                .immediate => MCValue{ .register = try self.copyToTmpRegister(op_ty, op_mcv) },
+                else => op_mcv,
+            };
+            const mat_op_lock = switch (mat_op_mcv) {
+                .register => |reg| self.register_manager.lockReg(reg),
+                else => null,
+            };
+            defer if (mat_op_lock) |lock| self.register_manager.unlockReg(lock);
+
+            const dst_mcv = MCValue{ .register = try self.register_manager.allocReg(inst, gp) };
+            try self.genBinOpMir(.popcnt, op_ty, dst_mcv, mat_op_mcv);
+            break :result dst_mcv;
+        }
+
+        return self.fail("TODO implement airPopcount for {}", .{op_ty.fmt(self.bin_file.options.module.?)});
+    };
     return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
 }
 
@@ -3491,7 +3606,7 @@ fn genBinOp(
             if (lhs.isRegister() and self.reuseOperand(inst, lhs_air, 0, lhs)) {
                 break :blk lhs;
             }
-            if (rhs.isRegister() and is_commutative and self.reuseOperand(inst, rhs_air, 1, rhs)) {
+            if (is_commutative and rhs.isRegister() and self.reuseOperand(inst, rhs_air, 1, rhs)) {
                 flipped = true;
                 break :blk rhs;
             }
src/arch/x86_64/Emit.zig
@@ -73,6 +73,8 @@ pub fn lowerMir(emit: *Emit) InnerError!void {
             .adc,
             .add,
             .@"and",
+            .bsf,
+            .bsr,
             .call,
             .cbw,
             .cwde,
@@ -89,12 +91,14 @@ pub fn lowerMir(emit: *Emit) InnerError!void {
             .int3,
             .jmp,
             .lea,
+            .lzcnt,
             .mov,
             .movzx,
             .mul,
             .nop,
             .@"or",
             .pop,
+            .popcnt,
             .push,
             .ret,
             .sal,
@@ -105,6 +109,7 @@ pub fn lowerMir(emit: *Emit) InnerError!void {
             .sub,
             .syscall,
             .@"test",
+            .tzcnt,
             .ud2,
             .xor,
 
src/arch/x86_64/Encoding.zig
@@ -307,6 +307,7 @@ pub const Mnemonic = enum {
     // zig fmt: off
     // General-purpose
     adc, add, @"and",
+    bsf, bsr,
     call, cbw, cdq, cdqe,
     cmova, cmovae, cmovb, cmovbe, cmovc, cmove, cmovg, cmovge, cmovl, cmovle, cmovna,
     cmovnae, cmovnb, cmovnbe, cmovnc, cmovne, cmovng, cmovnge, cmovnl, cmovnle, cmovno,
@@ -322,12 +323,13 @@ pub const Mnemonic = enum {
     jmp, 
     lea,
     lods, lodsb, lodsd, lodsq, lodsw,
+    lzcnt,
     mov,
     movs, movsb, movsd, movsq, movsw,
     movsx, movsxd, movzx, mul,
     nop,
     @"or",
-    pop, push,
+    pop, popcnt, push,
     ret,
     sal, sar, sbb,
     scas, scasb, scasd, scasq, scasw,
@@ -336,7 +338,7 @@ pub const Mnemonic = enum {
     setnb, setnbe, setnc, setne, setng, setnge, setnl, setnle, setno, setnp, setns,
     setnz, seto, setp, setpe, setpo, sets, setz,
     stos, stosb, stosd, stosq, stosw,
-    @"test",
+    @"test", tzcnt,
     ud2,
     xor,
     // SSE
src/arch/x86_64/encodings.zig
@@ -81,6 +81,14 @@ pub const table = &[_]Entry{
     .{ .@"and", .rm, .r32,  .rm32,   .none, .none, &.{ 0x23 }, 0, .none  },
     .{ .@"and", .rm, .r64,  .rm64,   .none, .none, &.{ 0x23 }, 0, .long  },
 
+    .{ .bsf, .rm, .r16, .rm16, .none, .none, &.{ 0x0f, 0xbc }, 0, .none },
+    .{ .bsf, .rm, .r32, .rm32, .none, .none, &.{ 0x0f, 0xbc }, 0, .none },
+    .{ .bsf, .rm, .r64, .rm64, .none, .none, &.{ 0x0f, 0xbc }, 0, .long },
+
+    .{ .bsr, .rm, .r16, .rm16, .none, .none, &.{ 0x0f, 0xbd }, 0, .none },
+    .{ .bsr, .rm, .r32, .rm32, .none, .none, &.{ 0x0f, 0xbd }, 0, .none },
+    .{ .bsr, .rm, .r64, .rm64, .none, .none, &.{ 0x0f, 0xbd }, 0, .long },
+
     // This is M encoding according to Intel, but D makes more sense here.
     .{ .call, .d, .rel32, .none, .none, .none, &.{ 0xe8 }, 0, .none },
     .{ .call, .m, .rm64,  .none, .none, .none, &.{ 0xff }, 2, .none },
@@ -301,6 +309,10 @@ pub const table = &[_]Entry{
     .{ .lodsd, .np, .none, .none, .none, .none, &.{ 0xad }, 0, .none  },
     .{ .lodsq, .np, .none, .none, .none, .none, &.{ 0xad }, 0, .long  },
 
+    .{ .lzcnt, .rm, .r16, .rm16, .none, .none, &.{ 0xf3, 0x0f, 0xbd }, 0, .none },
+    .{ .lzcnt, .rm, .r32, .rm32, .none, .none, &.{ 0xf3, 0x0f, 0xbd }, 0, .none },
+    .{ .lzcnt, .rm, .r64, .rm64, .none, .none, &.{ 0xf3, 0x0f, 0xbd }, 0, .long },
+
     .{ .mov, .mr, .rm8,   .r8,     .none, .none, &.{ 0x88 }, 0, .none  },
     .{ .mov, .mr, .rm8,   .r8,     .none, .none, &.{ 0x88 }, 0, .rex   },
     .{ .mov, .mr, .rm16,  .r16,    .none, .none, &.{ 0x89 }, 0, .none  },
@@ -397,6 +409,10 @@ pub const table = &[_]Entry{
     .{ .pop, .m, .rm16, .none, .none, .none, &.{ 0x8f }, 0, .none  },
     .{ .pop, .m, .rm64, .none, .none, .none, &.{ 0x8f }, 0, .none  },
 
+    .{ .popcnt, .rm, .r16, .rm16, .none, .none, &.{ 0xf3, 0x0f, 0xb8 }, 0, .none },
+    .{ .popcnt, .rm, .r32, .rm32, .none, .none, &.{ 0xf3, 0x0f, 0xb8 }, 0, .none },
+    .{ .popcnt, .rm, .r64, .rm64, .none, .none, &.{ 0xf3, 0x0f, 0xb8 }, 0, .long },
+
     .{ .push, .o, .r16,   .none, .none, .none, &.{ 0x50 }, 0, .none  },
     .{ .push, .o, .r64,   .none, .none, .none, &.{ 0x50 }, 0, .none  },
     .{ .push, .m, .rm16,  .none, .none, .none, &.{ 0xff }, 6, .none  },
@@ -596,8 +612,8 @@ pub const table = &[_]Entry{
     .{ .sub, .rm, .r32,  .rm32,   .none, .none, &.{ 0x2b }, 0, .none  },
     .{ .sub, .rm, .r64,  .rm64,   .none, .none, &.{ 0x2b }, 0, .long  },
 
-    .{ .syscall, .np, .none, .none, .none, .none, &.{ 0x0f, 0x05 }, 0, .none },
-
+    .{ .syscall, .np, .none, .none, .none, .none, &.{ 0x0f, 0x05 }, 0, .none },
+
     .{ .@"test", .zi, .al,   .imm8,   .none, .none, &.{ 0xa8 }, 0, .none  },
     .{ .@"test", .zi, .ax,   .imm16,  .none, .none, &.{ 0xa9 }, 0, .none  },
     .{ .@"test", .zi, .eax,  .imm32,  .none, .none, &.{ 0xa9 }, 0, .none  },
@@ -613,6 +629,10 @@ pub const table = &[_]Entry{
     .{ .@"test", .mr, .rm32, .r32,    .none, .none, &.{ 0x85 }, 0, .none  },
     .{ .@"test", .mr, .rm64, .r64,    .none, .none, &.{ 0x85 }, 0, .long  },
 
+    .{ .tzcnt, .rm, .r16, .rm16, .none, .none, &.{ 0xf3, 0x0f, 0xbc }, 0, .none },
+    .{ .tzcnt, .rm, .r32, .rm32, .none, .none, &.{ 0xf3, 0x0f, 0xbc }, 0, .none },
+    .{ .tzcnt, .rm, .r64, .rm64, .none, .none, &.{ 0xf3, 0x0f, 0xbc }, 0, .long },
+
     .{ .ud2, .np, .none, .none, .none, .none, &.{ 0x0f, 0x0b }, 0, .none  },
 
     .{ .xor, .zi, .al,   .imm8,   .none, .none, &.{ 0x34 }, 0, .none  },
src/arch/x86_64/Mir.zig
@@ -38,6 +38,10 @@ pub const Inst = struct {
         add,
         /// Logical and
         @"and",
+        /// Bit scan forward
+        bsf,
+        /// Bit scan reverse
+        bsr,
         /// Call
         call,
         /// Convert byte to word
@@ -70,6 +74,8 @@ pub const Inst = struct {
         jmp,
         /// Load effective address
         lea,
+        /// Count the number of leading zero bits
+        lzcnt,
         /// Move
         mov,
         /// Move with sign extension
@@ -84,6 +90,8 @@ pub const Inst = struct {
         @"or",
         /// Pop
         pop,
+        /// Return the count of number of bits set to 1
+        popcnt,
         /// Push
         push,
         /// Return
@@ -104,6 +112,8 @@ pub const Inst = struct {
         syscall,
         /// Test condition
         @"test",
+        /// Count the number of trailing zero bits
+        tzcnt,
         /// Undefined instruction
         ud2,
         /// Logical exclusive-or