Commit 9d0ea0e3f1

Jakub Konka <kubkon@jakubkonka.com>
2022-11-29 15:25:06
arm: implement CPU feature detection by parsing system registers
Also add an incomplete table implementing instruction fusions according to official optimisation programming manuals.
1 parent 988fff2
Changed files (2)
lib
std
lib/std/zig/system/arm.zig
@@ -1,4 +1,5 @@
 const std = @import("std");
+const Target = std.Target;
 
 pub const CoreInfo = struct {
     architecture: u8 = 0,
@@ -9,14 +10,14 @@ pub const CoreInfo = struct {
 
 pub const cpu_models = struct {
     // Shorthands to simplify the tables below.
-    const A32 = std.Target.arm.cpu;
-    const A64 = std.Target.aarch64.cpu;
+    const A32 = Target.arm.cpu;
+    const A64 = Target.aarch64.cpu;
 
     const E = struct {
         part: u16,
         variant: ?u8 = null, // null if matches any variant
-        m32: ?*const std.Target.Cpu.Model = null,
-        m64: ?*const std.Target.Cpu.Model = null,
+        m32: ?*const Target.Cpu.Model = null,
+        m64: ?*const Target.Cpu.Model = null,
     };
 
     // implementer = 0x41
@@ -59,7 +60,6 @@ pub const cpu_models = struct {
         E{ .part = 0xd21, .m32 = &A32.cortex_m33, .m64 = null },
         E{ .part = 0xd41, .m32 = &A32.cortex_a78, .m64 = &A64.cortex_a78 },
         E{ .part = 0xd4b, .m32 = &A32.cortex_a78c, .m64 = &A64.cortex_a78c },
-        // This is a guess based on https://www.notebookcheck.net/Qualcomm-Snapdragon-8cx-Gen-3-Processor-Benchmarks-and-Specs.652916.0.html
         E{ .part = 0xd4c, .m32 = &A32.cortex_x1c, .m64 = &A64.cortex_x1c },
         E{ .part = 0xd44, .m32 = &A32.cortex_x1, .m64 = &A64.cortex_x1 },
         E{ .part = 0xd02, .m64 = &A64.cortex_a34 },
@@ -111,7 +111,7 @@ pub const cpu_models = struct {
         E{ .part = 0xc01, .m64 = &A64.saphira },
     };
 
-    pub fn isKnown(core: CoreInfo, is_64bit: bool) ?*const std.Target.Cpu.Model {
+    pub fn isKnown(core: CoreInfo, is_64bit: bool) ?*const Target.Cpu.Model {
         const models = switch (core.implementer) {
             0x41 => &ARM,
             0x42 => &Broadcom,
@@ -132,3 +132,174 @@ pub const cpu_models = struct {
         return null;
     }
 };
+
+pub const aarch64 = struct {
+    /// Adds `feature` to `cpu.features` when `enabled` is true, removes it otherwise.
+    fn setFeature(cpu: *Target.Cpu, feature: Target.aarch64.Feature, enabled: bool) void {
+        // The feature set is indexed by the enum's integer value.
+        const index: Target.Cpu.Feature.Set.Index = @enumToInt(feature);
+
+        if (!enabled) {
+            cpu.features.removeFeature(index);
+            return;
+        }
+        cpu.features.addFeature(index);
+    }
+
+    /// Returns the 4-bit field of `input` whose least significant bit sits at bit `offset`.
+    inline fn bitField(input: u64, offset: u6) u4 {
+        const shifted = input >> offset;
+        // Keep only the low nibble of the shifted value.
+        return @truncate(u4, shifted);
+    }
+
+    /// Builds a `Target.Cpu` description from raw system register readouts.
+    /// Returns null when `cpu_models.isKnown` does not recognise the core
+    /// identified by MIDR_EL1.
+    ///
+    /// `registers` must hold readouts of 12 system registers in this order:
+    /// 0  -> MIDR_EL1
+    /// 1  -> ID_AA64PFR0_EL1
+    /// 2  -> ID_AA64PFR1_EL1
+    /// 3  -> ID_AA64DFR0_EL1
+    /// 4  -> ID_AA64DFR1_EL1
+    /// 5  -> ID_AA64AFR0_EL1
+    /// 6  -> ID_AA64AFR1_EL1
+    /// 7  -> ID_AA64ISAR0_EL1
+    /// 8  -> ID_AA64ISAR1_EL1
+    /// 9  -> ID_AA64MMFR0_EL1
+    /// 10 -> ID_AA64MMFR1_EL1
+    /// 11 -> ID_AA64MMFR2_EL1
+    pub fn detectNativeCpuAndFeatures(arch: Target.Cpu.Arch, registers: [12]u64) ?Target.Cpu {
+        const core = detectNativeCoreInfo(registers[0]);
+
+        if (cpu_models.isKnown(core, true)) |known_model| {
+            // Start from an empty feature set; features come from the ID
+            // registers and the fusion table, not from the model defaults.
+            var result = Target.Cpu{
+                .arch = arch,
+                .model = known_model,
+                .features = Target.Cpu.Feature.Set.empty,
+            };
+
+            // Everything after MIDR_EL1 is an ID_AA64* feature register.
+            detectNativeCpuFeatures(&result, registers[1..12]);
+            addInstructionFusions(&result, core);
+
+            return result;
+        }
+
+        return null;
+    }
+
+    /// Decodes a MIDR_EL1 readout into a `CoreInfo`.
+    /// Implementer and part number are always filled in. Variant, revision and
+    /// architecture are skipped for ARM Ltd. parts whose top part-number nibble
+    /// is 0x0 or 0x7.
+    fn detectNativeCoreInfo(midr: u64) CoreInfo {
+        var core = CoreInfo{
+            .implementer = @truncate(u8, midr >> 24),
+            .part = @truncate(u12, midr >> 4),
+        };
+
+        if (core.implementer == 0x41) {
+            // ARM Ltd.
+            const part_top_nibble = @truncate(u4, core.part >> 8);
+            // TODO Variant and arch encoded differently.
+            if (part_top_nibble == 0x0 or part_top_nibble == 0x7) return core;
+        }
+
+        // Pack the variant field into the high nibble and the revision field
+        // into the low nibble.
+        const variant_field: u8 = @truncate(u4, midr >> 20);
+        const revision_field: u8 = @truncate(u4, midr);
+        core.variant |= (variant_field << 4) | revision_field;
+        core.architecture = @truncate(u4, midr >> 16);
+
+        return core;
+    }
+
+    /// Sets or clears AArch64 features on `cpu` from the ID register readouts.
+    /// Each check reads one 4-bit field with `bitField` and compares it to the
+    /// minimum value that signals the feature. Every feature named here is
+    /// explicitly added or removed, so each one must be decided exactly once.
+    ///
+    /// Input array should consist of readouts from 11 system registers such that:
+    /// 0  -> ID_AA64PFR0_EL1
+    /// 1  -> ID_AA64PFR1_EL1
+    /// 2  -> ID_AA64DFR0_EL1
+    /// 3  -> ID_AA64DFR1_EL1
+    /// 4  -> ID_AA64AFR0_EL1
+    /// 5  -> ID_AA64AFR1_EL1
+    /// 6  -> ID_AA64ISAR0_EL1
+    /// 7  -> ID_AA64ISAR1_EL1
+    /// 8  -> ID_AA64MMFR0_EL1
+    /// 9  -> ID_AA64MMFR1_EL1
+    /// 10 -> ID_AA64MMFR2_EL1
+    fn detectNativeCpuFeatures(cpu: *Target.Cpu, registers: *const [11]u64) void {
+        // ID_AA64PFR0_EL1
+        setFeature(cpu, .dit, bitField(registers[0], 48) >= 1);
+        setFeature(cpu, .am, bitField(registers[0], 44) >= 1);
+        setFeature(cpu, .amvs, bitField(registers[0], 44) >= 2);
+        // MPAM is reported either through the major version field in
+        // ID_AA64PFR0_EL1 (MPAM v1.0 and later) or through the fractional field
+        // in ID_AA64PFR1_EL1 (MPAM v0.1). Both are checked in a single call so
+        // that a later `setFeature` cannot clear a previously detected MPAM.
+        setFeature(cpu, .mpam, bitField(registers[0], 40) >= 1 or bitField(registers[1], 16) >= 1);
+        setFeature(cpu, .sel2, bitField(registers[0], 36) >= 1);
+        setFeature(cpu, .sve, bitField(registers[0], 32) >= 1);
+        setFeature(cpu, .el3, bitField(registers[0], 12) >= 1);
+        setFeature(cpu, .ras, bitField(registers[0], 28) >= 1);
+
+        // A value of 0xF in the AdvSIMD field means "not implemented".
+        if (bitField(registers[0], 20) < 0xF) blk: {
+            if (bitField(registers[0], 16) != bitField(registers[0], 20)) break :blk; // This should never occur
+
+            setFeature(cpu, .neon, true);
+            setFeature(cpu, .fp_armv8, true);
+            setFeature(cpu, .fullfp16, bitField(registers[0], 20) > 0);
+        }
+
+        // ID_AA64PFR1_EL1 (the MPAM fractional field is handled together with ID_AA64PFR0_EL1)
+        setFeature(cpu, .mte, bitField(registers[1], 8) >= 1);
+        setFeature(cpu, .ssbs, bitField(registers[1], 4) >= 1);
+        setFeature(cpu, .bti, bitField(registers[1], 0) >= 1);
+
+        // ID_AA64DFR0_EL1
+        setFeature(cpu, .tracev8_4, bitField(registers[2], 40) >= 1);
+        setFeature(cpu, .spe, bitField(registers[2], 32) >= 1);
+        // 0xF is excluded here: it does not count as an implemented PMU.
+        setFeature(cpu, .perfmon, bitField(registers[2], 8) >= 1 and bitField(registers[2], 8) < 0xF);
+
+        // ID_AA64DFR1_EL1 reserved
+        // ID_AA64AFR0_EL1 reserved / implementation defined
+        // ID_AA64AFR1_EL1 reserved
+
+        // ID_AA64ISAR0_EL1
+        setFeature(cpu, .rand, bitField(registers[6], 60) >= 1);
+        setFeature(cpu, .tlb_rmi, bitField(registers[6], 56) >= 1);
+        setFeature(cpu, .flagm, bitField(registers[6], 52) >= 1);
+        setFeature(cpu, .fp16fml, bitField(registers[6], 48) >= 1);
+        setFeature(cpu, .dotprod, bitField(registers[6], 44) >= 1);
+        setFeature(cpu, .sm4, bitField(registers[6], 40) >= 1 and bitField(registers[6], 36) >= 1);
+        setFeature(cpu, .sha3, bitField(registers[6], 32) >= 1 and bitField(registers[6], 12) >= 2);
+        setFeature(cpu, .rdm, bitField(registers[6], 28) >= 1);
+        setFeature(cpu, .lse, bitField(registers[6], 20) >= 1);
+        setFeature(cpu, .crc, bitField(registers[6], 16) >= 1);
+        setFeature(cpu, .sha2, bitField(registers[6], 12) >= 1 and bitField(registers[6], 8) >= 1);
+        setFeature(cpu, .aes, bitField(registers[6], 4) >= 1);
+
+        // ID_AA64ISAR1_EL1
+        setFeature(cpu, .i8mm, bitField(registers[7], 52) >= 1);
+        setFeature(cpu, .bf16, bitField(registers[7], 44) >= 1);
+        setFeature(cpu, .predres, bitField(registers[7], 40) >= 1);
+        setFeature(cpu, .sb, bitField(registers[7], 36) >= 1);
+        setFeature(cpu, .fptoint, bitField(registers[7], 32) >= 1);
+        setFeature(cpu, .rcpc, bitField(registers[7], 20) >= 1);
+        setFeature(cpu, .rcpc_immo, bitField(registers[7], 20) >= 2);
+        setFeature(cpu, .complxnum, bitField(registers[7], 16) >= 1);
+        setFeature(cpu, .jsconv, bitField(registers[7], 12) >= 1);
+        setFeature(cpu, .pauth, bitField(registers[7], 8) >= 1 or bitField(registers[7], 4) >= 1);
+        setFeature(cpu, .ccpp, bitField(registers[7], 0) >= 1);
+        setFeature(cpu, .ccdp, bitField(registers[7], 0) >= 2);
+
+        // ID_AA64MMFR0_EL1
+        setFeature(cpu, .ecv, bitField(registers[8], 60) >= 1);
+        setFeature(cpu, .fgt, bitField(registers[8], 56) >= 1);
+
+        // ID_AA64MMFR1_EL1
+        setFeature(cpu, .pan, bitField(registers[9], 20) >= 1);
+        setFeature(cpu, .pan_rwv, bitField(registers[9], 20) >= 2);
+        setFeature(cpu, .lor, bitField(registers[9], 16) >= 1);
+        setFeature(cpu, .vh, bitField(registers[9], 8) >= 1);
+        setFeature(cpu, .contextidr_el2, bitField(registers[9], 8) >= 1);
+
+        // ID_AA64MMFR2_EL1
+        setFeature(cpu, .nv, bitField(registers[10], 24) >= 1);
+        setFeature(cpu, .ccidx, bitField(registers[10], 20) >= 1);
+        setFeature(cpu, .uaops, bitField(registers[10], 4) >= 1);
+    }
+
+    /// Enables instruction-fusion features for cores listed in the table below.
+    /// Only implementer 0x41 parts 0xd4b and 0xd4c are handled; any other core
+    /// leaves `cpu` untouched.
+    fn addInstructionFusions(cpu: *Target.Cpu, info: CoreInfo) void {
+        // The table only has entries for implementer 0x41 so far.
+        if (info.implementer != 0x41) return;
+
+        switch (info.part) {
+            // According to A78C/X1C Core Software Optimization Guide, CPU fuses certain instructions.
+            0xd4b, 0xd4c => {
+                setFeature(cpu, .cmp_bcc_fusion, true);
+                setFeature(cpu, .fuse_aes, true);
+            },
+            else => {},
+        }
+    }
+};
lib/std/zig/system/windows.zig
@@ -200,112 +200,6 @@ fn getCpuCount() usize {
     return std.os.windows.peb().NumberOfProcessors;
 }
 
-const ArmCpuInfoParser = struct {
-    cores: [4]CoreInfo = undefined,
-    core_no: usize = 0,
-    have_fields: usize = 0,
-
-    const CoreInfo = @import("arm.zig").CoreInfo;
-    const cpu_models = @import("arm.zig").cpu_models;
-
-    fn parseFeaturesFromRegisters(self: *ArmCpuInfoParser, registers: [12]u64) !void {
-        const info = &self.cores[self.core_no];
-        info.* = .{};
-
-        for (registers) |register| {
-            std.log.warn("{x}", .{register});
-        }
-
-        // // CPU part
-        // info.part = mem.readIntLittle(u16, data.cp_4000[0..2]) >> 4;
-        // self.have_fields += 1;
-
-        // // CPU implementer
-        // info.implementer = data.cp_4000[3];
-        // self.have_fields += 1;
-
-        // self.addOne();
-    }
-
-    fn addOne(self: *ArmCpuInfoParser) void {
-        if (self.have_fields == 3 and self.core_no < self.cores.len) {
-            if (self.core_no > 0) {
-                // Deduplicate the core info.
-                for (self.cores[0..self.core_no]) |it| {
-                    if (std.meta.eql(it, self.cores[self.core_no]))
-                        return;
-                }
-            }
-            self.core_no += 1;
-        }
-    }
-
-    fn finalize(self: ArmCpuInfoParser, arch: Target.Cpu.Arch) ?Target.Cpu {
-        if (self.core_no == 0) return null;
-
-        const is_64bit = switch (arch) {
-            .aarch64, .aarch64_be, .aarch64_32 => true,
-            else => false,
-        };
-
-        var known_models: [self.cores.len]?*const Target.Cpu.Model = undefined;
-        for (self.cores[0..self.core_no]) |core, i| {
-            known_models[i] = cpu_models.isKnown(core, is_64bit);
-        }
-
-        // XXX We pick the first core on big.LITTLE systems, hopefully the
-        // LITTLE one.
-        const model = known_models[0] orelse return null;
-        return Target.Cpu{
-            .arch = arch,
-            .model = model,
-            .features = model.features,
-        };
-    }
-
-    fn parse(arch: Target.Cpu.Arch) !?Target.Cpu {
-        var obj: ArmCpuInfoParser = .{};
-
-        // Backing datastore
-        var registers: [12]u64 = undefined;
-
-        var i: usize = 0;
-        while (i < getCpuCount()) : (i += 1) {
-            // Registry key to system ID register mapping
-            // CP 4000 -> MIDR_EL1
-            // CP 4020 -> ID_AA64PFR0_EL1
-            // CP 4021 -> ID_AA64PFR1_EL1
-            // CP 4028 -> ID_AA64DFR0_EL1
-            // CP 4029 -> ID_AA64DFR1_EL1
-            // CP 402C -> ID_AA64AFR0_EL1
-            // CP 402D -> ID_AA64AFR1_EL1
-            // CP 4030 -> ID_AA64ISAR0_EL1
-            // CP 4031 -> ID_AA64ISAR1_EL1
-            // CP 4038 -> ID_AA64MMFR0_EL1
-            // CP 4039 -> ID_AA64MMFR1_EL1
-            // CP 403A -> ID_AA64MMFR2_EL1
-            try getCpuInfoFromRegistry(i, .{
-                .{ .key = "CP 4000", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[0]) },
-                .{ .key = "CP 4020", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[1]) },
-                .{ .key = "CP 4021", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[2]) },
-                .{ .key = "CP 4028", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[3]) },
-                .{ .key = "CP 4029", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[4]) },
-                .{ .key = "CP 402C", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[5]) },
-                .{ .key = "CP 402D", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[6]) },
-                .{ .key = "CP 4030", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[7]) },
-                .{ .key = "CP 4031", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[8]) },
-                .{ .key = "CP 4038", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[9]) },
-                .{ .key = "CP 4039", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[10]) },
-                .{ .key = "CP 403A", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[11]) },
-            });
-
-            try obj.parseFeaturesFromRegisters(registers);
-        }
-
-        return obj.finalize(arch);
-    }
-};
-
 /// If the fine-grained detection of CPU features via Win registry fails,
 /// we fallback to a generic CPU model but we override the feature set
 /// using `SharedUserData` contents.
@@ -338,7 +232,52 @@ fn genericCpuAndNativeFeatures(arch: Target.Cpu.Arch) Target.Cpu {
 pub fn detectNativeCpuAndFeatures() ?Target.Cpu {
     const current_arch = builtin.cpu.arch;
     const cpu: ?Target.Cpu = switch (current_arch) {
-        .aarch64, .aarch64_be, .aarch64_32 => ArmCpuInfoParser.parse(current_arch) catch null,
+        .aarch64, .aarch64_be, .aarch64_32 => blk: {
+            var cores: [128]Target.Cpu = undefined;
+            const core_count = getCpuCount();
+
+            if (core_count > cores.len) break :blk null;
+
+            var i: usize = 0;
+            while (i < core_count) : (i += 1) {
+                // Backing datastore
+                var registers: [12]u64 = undefined;
+
+                // Registry key to system ID register mapping
+                // CP 4000 -> MIDR_EL1
+                // CP 4020 -> ID_AA64PFR0_EL1
+                // CP 4021 -> ID_AA64PFR1_EL1
+                // CP 4028 -> ID_AA64DFR0_EL1
+                // CP 4029 -> ID_AA64DFR1_EL1
+                // CP 402C -> ID_AA64AFR0_EL1
+                // CP 402D -> ID_AA64AFR1_EL1
+                // CP 4030 -> ID_AA64ISAR0_EL1
+                // CP 4031 -> ID_AA64ISAR1_EL1
+                // CP 4038 -> ID_AA64MMFR0_EL1
+                // CP 4039 -> ID_AA64MMFR1_EL1
+                // CP 403A -> ID_AA64MMFR2_EL1
+                getCpuInfoFromRegistry(i, .{
+                    .{ .key = "CP 4000", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[0]) },
+                    .{ .key = "CP 4020", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[1]) },
+                    .{ .key = "CP 4021", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[2]) },
+                    .{ .key = "CP 4028", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[3]) },
+                    .{ .key = "CP 4029", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[4]) },
+                    .{ .key = "CP 402C", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[5]) },
+                    .{ .key = "CP 402D", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[6]) },
+                    .{ .key = "CP 4030", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[7]) },
+                    .{ .key = "CP 4031", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[8]) },
+                    .{ .key = "CP 4038", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[9]) },
+                    .{ .key = "CP 4039", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[10]) },
+                    .{ .key = "CP 403A", .value_type = REG.QWORD, .value_buf = @ptrCast(*[8]u8, &registers[11]) },
+                }) catch break :blk null;
+
+                cores[i] = @import("arm.zig").aarch64.detectNativeCpuAndFeatures(current_arch, registers) orelse
+                    break :blk null;
+            }
+
+            // Pick the first core, usually LITTLE in big.LITTLE architecture.
+            break :blk cores[0];
+        },
         else => null,
     };
     return cpu orelse genericCpuAndNativeFeatures(current_arch);