master
  1//! To get started, run this tool with no args and read the help message.
  2//!
  3//! The build systems of glibc, musl, FreeBSD, and NetBSD require specifying a single target
  4//! architecture. Meanwhile, Zig supports out-of-the-box cross compilation for
  5//! every target. So the process to create libc headers that Zig ships is to use
  6//! this tool.
  7//!
  8//! First, use the glibc, musl, FreeBSD, and NetBSD build systems to create installations of all the
  9//! targets in the `glibc_targets`, `musl_targets`, `freebsd_targets`, and `netbsd_targets`
 10//! variables. Next, run this tool to create a new directory which puts .h files into
 11//! <arch> subdirectories, with `generic` being files that apply to all architectures.
//! You'll then have to manually update the Zig source repo with these new files.
 13
 14const std = @import("std");
 15const Arch = std.Target.Cpu.Arch;
 16const Abi = std.Target.Abi;
 17const OsTag = std.Target.Os.Tag;
 18const assert = std.debug.assert;
 19const Blake3 = std.crypto.hash.Blake3;
 20
/// One libc installation to scan: an architecture/ABI pair plus an optional
/// override for the name of the output directory its headers are filed under.
const LibCTarget = struct {
    /// CPU architecture of the installation.
    arch: Arch,
    /// ABI of the installation.
    abi: Abi,
    /// Output directory name for this target's non-generic headers. When null,
    /// `main` derives the name as `<arch>-<os>-<abi>` from the tag names. Several
    /// targets may share one `dest`, in which case their headers are merged.
    dest: ?[]const u8 = null,
};
 26
/// Targets scanned when `--abi glibc` is selected. Entries that set `dest` are
/// merged into that shared output directory (e.g. all `arm`/`armeb` variants end
/// up in `arm-linux-gnu`); entries without `dest` get their own directory.
const glibc_targets = [_]LibCTarget{
    .{ .arch = .arc, .abi = .gnu },
    .{ .arch = .arm, .abi = .gnueabi, .dest = "arm-linux-gnu" },
    .{ .arch = .arm, .abi = .gnueabihf, .dest = "arm-linux-gnu" },
    .{ .arch = .armeb, .abi = .gnueabi, .dest = "arm-linux-gnu" },
    .{ .arch = .armeb, .abi = .gnueabihf, .dest = "arm-linux-gnu" },
    .{ .arch = .aarch64, .abi = .gnu, .dest = "aarch64-linux-gnu" },
    .{ .arch = .aarch64_be, .abi = .gnu, .dest = "aarch64-linux-gnu" },
    .{ .arch = .csky, .abi = .gnueabi, .dest = "csky-linux-gnu" },
    .{ .arch = .csky, .abi = .gnueabihf, .dest = "csky-linux-gnu" },
    .{ .arch = .loongarch64, .abi = .gnu, .dest = "loongarch-linux-gnu" },
    .{ .arch = .loongarch64, .abi = .gnusf, .dest = "loongarch-linux-gnu" },
    .{ .arch = .m68k, .abi = .gnu },
    .{ .arch = .mips, .abi = .gnueabi, .dest = "mips-linux-gnu" },
    .{ .arch = .mips, .abi = .gnueabihf, .dest = "mips-linux-gnu" },
    .{ .arch = .mipsel, .abi = .gnueabi, .dest = "mips-linux-gnu" },
    .{ .arch = .mipsel, .abi = .gnueabihf, .dest = "mips-linux-gnu" },
    .{ .arch = .mips64, .abi = .gnuabi64, .dest = "mips-linux-gnu" },
    .{ .arch = .mips64, .abi = .gnuabin32, .dest = "mips-linux-gnu" },
    .{ .arch = .mips64el, .abi = .gnuabi64, .dest = "mips-linux-gnu" },
    .{ .arch = .mips64el, .abi = .gnuabin32, .dest = "mips-linux-gnu" },
    .{ .arch = .powerpc, .abi = .gnueabi, .dest = "powerpc-linux-gnu" },
    .{ .arch = .powerpc, .abi = .gnueabihf, .dest = "powerpc-linux-gnu" },
    .{ .arch = .powerpc64, .abi = .gnu, .dest = "powerpc-linux-gnu" },
    .{ .arch = .powerpc64le, .abi = .gnu, .dest = "powerpc-linux-gnu" },
    .{ .arch = .riscv32, .abi = .gnu, .dest = "riscv-linux-gnu" },
    .{ .arch = .riscv64, .abi = .gnu, .dest = "riscv-linux-gnu" },
    .{ .arch = .s390x, .abi = .gnu },
    .{ .arch = .sparc, .abi = .gnu, .dest = "sparc-linux-gnu" },
    .{ .arch = .sparc64, .abi = .gnu, .dest = "sparc-linux-gnu" },
    .{ .arch = .x86, .abi = .gnu, .dest = "x86-linux-gnu" },
    .{ .arch = .x86_64, .abi = .gnu, .dest = "x86-linux-gnu" },
    .{ .arch = .x86_64, .abi = .gnux32, .dest = "x86-linux-gnu" },
};
 61
/// Targets scanned when `--abi musl` is selected. None of them set `dest`, so
/// each one is written to its own `<arch>-linux-<abi>` output directory.
const musl_targets = [_]LibCTarget{
    .{ .arch = .arm, .abi = .musl },
    .{ .arch = .aarch64, .abi = .musl },
    .{ .arch = .hexagon, .abi = .musl },
    .{ .arch = .loongarch64, .abi = .musl },
    .{ .arch = .m68k, .abi = .musl },
    .{ .arch = .mips, .abi = .musl },
    .{ .arch = .mips64, .abi = .musl },
    .{ .arch = .mips64, .abi = .muslabin32 },
    .{ .arch = .powerpc, .abi = .musl },
    .{ .arch = .powerpc64, .abi = .musl },
    .{ .arch = .riscv32, .abi = .musl },
    .{ .arch = .riscv64, .abi = .musl },
    .{ .arch = .s390x, .abi = .musl },
    .{ .arch = .x86, .abi = .musl },
    .{ .arch = .x86_64, .abi = .musl },
    .{ .arch = .x86_64, .abi = .muslx32 },
};
 80
/// Targets scanned when `--abi freebsd` is selected. Every architecture listed
/// here must have a matching arm in the FreeBSD directory-name switch in `main`,
/// which is `unreachable` for anything else.
const freebsd_targets = [_]LibCTarget{
    .{ .arch = .arm, .abi = .eabihf },
    .{ .arch = .aarch64, .abi = .none },
    .{ .arch = .powerpc, .abi = .eabihf },
    .{ .arch = .powerpc64, .abi = .none },
    .{ .arch = .riscv64, .abi = .none },
    .{ .arch = .x86, .abi = .none },
    .{ .arch = .x86_64, .abi = .none },
};
 90
/// Targets scanned when `--abi netbsd` is selected. The soft- and hard-float
/// variants of an architecture share one `dest` directory. Every architecture
/// listed here must have a matching arm in the NetBSD directory-name switch in
/// `main`, which is `unreachable` for anything else.
const netbsd_targets = [_]LibCTarget{
    .{ .arch = .arm, .abi = .eabi, .dest = "arm-netbsd-eabi" },
    .{ .arch = .arm, .abi = .eabihf, .dest = "arm-netbsd-eabi" },
    .{ .arch = .aarch64, .abi = .none },
    .{ .arch = .m68k, .abi = .none },
    .{ .arch = .mips, .abi = .eabi, .dest = "mips-netbsd-eabi" },
    .{ .arch = .mips, .abi = .eabihf, .dest = "mips-netbsd-eabi" },
    .{ .arch = .powerpc, .abi = .eabi, .dest = "powerpc-netbsd-eabi" },
    .{ .arch = .powerpc, .abi = .eabihf, .dest = "powerpc-netbsd-eabi" },
    .{ .arch = .sparc, .abi = .none },
    .{ .arch = .sparc64, .abi = .none },
    .{ .arch = .x86, .abi = .none },
    .{ .arch = .x86_64, .abi = .none },
};
105
/// A distinct header (identified by the hash of its relative path and trimmed
/// contents) together with bookkeeping used to pick the generic version.
const Contents = struct {
    /// File contents with leading and trailing whitespace removed.
    bytes: []const u8,
    /// How many scanned files produced this hash.
    hit_count: usize,
    /// Digest that keys this entry in the hash-to-contents table.
    hash: []const u8,
    /// Set once this version has been written to the generic directory.
    is_generic: bool,

    /// Ordering predicate for `std.mem.sort`: true when `lhs` was seen strictly
    /// fewer times than `rhs`, so sorting puts the most common version last.
    fn hitCountLessThan(_: void, lhs: *const Contents, rhs: *const Contents) bool {
        return rhs.hit_count > lhs.hit_count;
    }
};
117
/// Maps a content hash to the unique `Contents` record for that hash.
const HashToContents = std.StringHashMap(Contents);
/// Maps a destination target directory name to the hash of the version of one
/// particular header that belongs to that target. Insertion order is preserved.
const TargetToHash = std.StringArrayHashMap([]const u8);
/// Maps a header path (relative to a target's include directory) to the table of
/// per-target hashes for that path.
const PathTable = std.StringHashMap(*TargetToHash);
121
/// The libc flavors this tool can process. The `--abi` command line value is
/// converted with `std.meta.stringToEnum`, so the tag names below are exactly
/// the strings accepted on the command line.
const LibCVendor = enum {
    musl,
    glibc,
    freebsd,
    netbsd,
};
128
/// Entry point: scans per-target libc installations and writes a deduplicated
/// header tree to the output directory.
///
/// Command line (see `usageAndExit`): `--search-path <dir>` (repeatable),
/// `--out <dir>` and `--abi <name>`, where `<name>` is a `LibCVendor` tag name.
/// `--out` and `--abi` are required and may each be given only once.
///
/// Phase 1 walks the include directory of every target of the selected vendor
/// (first search path that has it wins), hashes each file by relative path plus
/// whitespace-trimmed contents, and records which destination target uses which
/// version of each path.
///
/// Phase 2 visits every path: if its most frequently seen version was seen more
/// than once, that version is written to `generic-<abi>/`; every other version
/// is written under its destination target directory.
///
/// All memory comes from an arena that is never deinitialized. Progress and
/// diagnostics are printed with `std.debug.print`.
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    const allocator = arena.allocator();
    const args = try std.process.argsAlloc(allocator);
    var search_paths = std.array_list.Managed([]const u8).init(allocator);
    var opt_out_dir: ?[]const u8 = null;
    var opt_abi: ?[]const u8 = null;

    var arg_i: usize = 1;
    while (arg_i < args.len) : (arg_i += 1) {
        if (std.mem.eql(u8, args[arg_i], "--help"))
            usageAndExit(args[0]);
        // Every remaining option takes exactly one value.
        if (arg_i + 1 >= args.len) {
            std.debug.print("expected argument after '{s}'\n", .{args[arg_i]});
            usageAndExit(args[0]);
        }

        if (std.mem.eql(u8, args[arg_i], "--search-path")) {
            try search_paths.append(args[arg_i + 1]);
        } else if (std.mem.eql(u8, args[arg_i], "--out")) {
            // Duplicate options are a user error, not a program invariant, so they
            // are reported explicitly instead of being checked with `assert`
            // (which is undefined behavior in ReleaseFast when it fails).
            if (opt_out_dir != null) {
                std.debug.print("duplicate argument: {s}\n", .{args[arg_i]});
                usageAndExit(args[0]);
            }
            opt_out_dir = args[arg_i + 1];
        } else if (std.mem.eql(u8, args[arg_i], "--abi")) {
            if (opt_abi != null) {
                std.debug.print("duplicate argument: {s}\n", .{args[arg_i]});
                usageAndExit(args[0]);
            }
            opt_abi = args[arg_i + 1];
        } else {
            std.debug.print("unrecognized argument: {s}\n", .{args[arg_i]});
            usageAndExit(args[0]);
        }

        // Skip the value that was just consumed; the loop header skips the option itself.
        arg_i += 1;
    }

    const out_dir = opt_out_dir orelse usageAndExit(args[0]);
    const abi_name = opt_abi orelse usageAndExit(args[0]);
    const vendor = std.meta.stringToEnum(LibCVendor, abi_name) orelse {
        std.debug.print("unrecognized C ABI: {s}\n", .{abi_name});
        usageAndExit(args[0]);
    };

    const generic_name = try std.fmt.allocPrint(allocator, "generic-{s}", .{abi_name});
    const libc_targets = switch (vendor) {
        .glibc => &glibc_targets,
        .musl => &musl_targets,
        .freebsd => &freebsd_targets,
        .netbsd => &netbsd_targets,
    };

    var path_table = PathTable.init(allocator);
    var hash_to_contents = HashToContents.init(allocator);
    var max_bytes_saved: usize = 0;
    var total_bytes: usize = 0;

    var hasher = Blake3.init(.{});

    for (libc_targets) |libc_target| {
        // Name of the per-target subdirectory inside each search path.
        const libc_dir = switch (vendor) {
            .glibc => try std.zig.target.glibcRuntimeTriple(allocator, libc_target.arch, .linux, libc_target.abi),
            .musl => std.zig.target.muslArchName(libc_target.arch, libc_target.abi),
            .freebsd => switch (libc_target.arch) {
                .arm => "armv7",
                .x86 => "i386",
                .x86_64 => "amd64",

                .aarch64,
                .powerpc,
                .powerpc64,
                .riscv64,
                => |a| @tagName(a),

                // `freebsd_targets` only lists the architectures handled above.
                else => unreachable,
            },
            .netbsd => switch (libc_target.arch) {
                .arm => if (libc_target.abi == .eabihf) "evbarmv7hf" else "evbarmv7",
                .aarch64 => "evbarm64",
                .m68k => "mac68k",
                .mips => if (libc_target.abi == .eabihf) "evbmips" else "evbmipssf",
                .powerpc => if (libc_target.abi == .eabihf) "evbppc" else "evbppcsf",
                .x86 => "i386",
                .x86_64 => "amd64",

                .sparc,
                .sparc64,
                => |a| @tagName(a),

                // `netbsd_targets` only lists the architectures handled above.
                else => unreachable,
            },
        };

        // Name of the output directory for this target's non-generic headers.
        const dest_target = if (libc_target.dest) |dest| dest else try std.fmt.allocPrint(allocator, "{s}-{s}-{s}", .{
            @tagName(libc_target.arch),
            switch (vendor) {
                .musl, .glibc => "linux",
                .freebsd => "freebsd",
                .netbsd => "netbsd",
            },
            @tagName(libc_target.abi),
        });

        // Use the first search path that contains this target; the `else` branch
        // of the loop runs only when no search path had it.
        search: for (search_paths.items) |search_path| {
            const sub_path = switch (vendor) {
                .glibc,
                .freebsd,
                .netbsd,
                => &[_][]const u8{ search_path, libc_dir, "usr", "include" },
                .musl => &[_][]const u8{ search_path, libc_dir, "usr", "local", "musl", "include" },
            };
            const target_include_dir = try std.fs.path.join(allocator, sub_path);
            // Explicit stack of directories still to visit (iterative tree walk).
            var dir_stack = std.array_list.Managed([]const u8).init(allocator);
            try dir_stack.append(target_include_dir);

            while (dir_stack.pop()) |full_dir_name| {
                var dir = std.fs.cwd().openDir(full_dir_name, .{ .iterate = true }) catch |err| switch (err) {
                    error.FileNotFound => continue :search,
                    error.AccessDenied => continue :search,
                    else => return err,
                };
                defer dir.close();

                var dir_it = dir.iterate();

                while (try dir_it.next()) |entry| {
                    const full_path = try std.fs.path.join(allocator, &[_][]const u8{ full_dir_name, entry.name });
                    switch (entry.kind) {
                        .directory => try dir_stack.append(full_path),
                        .file, .sym_link => {
                            const rel_path = try std.fs.path.relative(allocator, target_include_dir, full_path);
                            const max_size = 2 * 1024 * 1024 * 1024;
                            const raw_bytes = try std.fs.cwd().readFileAlloc(full_path, allocator, .limited(max_size));
                            const trimmed = std.mem.trim(u8, raw_bytes, " \r\n\t");
                            total_bytes += raw_bytes.len;
                            // The hash covers the relative path as well as the contents, so
                            // identical bytes at different paths are distinct entries.
                            const hash = try allocator.alloc(u8, 32);
                            hasher = Blake3.init(.{});
                            hasher.update(rel_path);
                            hasher.update(trimmed);
                            hasher.final(hash);
                            const gop = try hash_to_contents.getOrPut(hash);
                            if (gop.found_existing) {
                                max_bytes_saved += raw_bytes.len;
                                gop.value_ptr.hit_count += 1;
                                std.debug.print("duplicate: {s} {s} ({B})\n", .{
                                    libc_dir,
                                    rel_path,
                                    raw_bytes.len,
                                });
                            } else {
                                gop.value_ptr.* = Contents{
                                    .bytes = trimmed,
                                    .hit_count = 1,
                                    .hash = hash,
                                    .is_generic = false,
                                };
                            }
                            const path_gop = try path_table.getOrPut(rel_path);
                            const target_to_hash = if (path_gop.found_existing) path_gop.value_ptr.* else blk: {
                                const ptr = try allocator.create(TargetToHash);
                                ptr.* = TargetToHash.init(allocator);
                                path_gop.value_ptr.* = ptr;
                                break :blk ptr;
                            };
                            // When `dest` is set, there are a few rare cases where we expect to overwrite a header. For
                            // example, `bits/long-double.h` differs very slightly between `powerpc64le-linux-gnu` and
                            // other `powerpc*-linux-gnu` targets, and we unify those targets as `powerpc-linux-gnu`. In
                            // such cases, we manually patch the affected header after processing, so it's fine that
                            // only one header wins here. Note that the entry recorded first is the one that is kept.
                            if (libc_target.dest != null) {
                                const hash_gop = try target_to_hash.getOrPut(dest_target);
                                if (hash_gop.found_existing) std.debug.print("overwrote: {s} {s} {s}\n", .{
                                    libc_dir,
                                    rel_path,
                                    dest_target,
                                }) else hash_gop.value_ptr.* = hash;
                            } else {
                                try target_to_hash.putNoClobber(dest_target, hash);
                            }
                        },
                        else => std.debug.print("warning: weird file: {s}\n", .{full_path}),
                    }
                }
            }
            break;
        } else {
            std.debug.print("warning: libc target not found: {s}\n", .{libc_dir});
        }
    }
    std.debug.print("summary: {B} could be reduced to {B}\n", .{
        total_bytes,
        total_bytes - max_bytes_saved,
    });
    try std.fs.cwd().makePath(out_dir);

    var missed_opportunity_bytes: usize = 0;
    // iterate path_table. for each path, put all the hashes into a list. sort by hit_count.
    // the hash with the highest hit_count gets to be the "generic" one. everybody else
    // gets their header in a separate arch directory.
    var path_it = path_table.iterator();
    while (path_it.next()) |path_kv| {
        // One entry per destination target, so the same `*Contents` can appear
        // several times when targets share a version of this header.
        var contents_list = std.array_list.Managed(*Contents).init(allocator);
        {
            var hash_it = path_kv.value_ptr.*.iterator();
            while (hash_it.next()) |hash_kv| {
                const contents = hash_to_contents.getPtr(hash_kv.value_ptr.*).?;
                try contents_list.append(contents);
            }
        }
        std.mem.sort(*Contents, contents_list.items, {}, Contents.hitCountLessThan);
        const best_contents = contents_list.pop().?;
        if (best_contents.hit_count > 1) {
            // worth it to make it generic
            const full_path = try std.fs.path.join(allocator, &[_][]const u8{ out_dir, generic_name, path_kv.key_ptr.* });
            try std.fs.cwd().makePath(std.fs.path.dirname(full_path).?);
            try std.fs.cwd().writeFile(.{ .sub_path = full_path, .data = best_contents.bytes });
            best_contents.is_generic = true;

            // Report the other versions that were also seen more than once. Each
            // distinct version is reported exactly once, and the version that was
            // just made generic is not a missed opportunity at all.
            var reported = std.array_list.Managed(*Contents).init(allocator);
            while (contents_list.pop()) |contender| {
                // The list is sorted by hit count, so nothing further down qualifies.
                if (contender.hit_count <= 1) break;
                if (contender == best_contents) continue;
                const already_reported = for (reported.items) |seen| {
                    if (seen == contender) break true;
                } else false;
                if (already_reported) continue;
                try reported.append(contender);

                const this_missed_bytes = contender.hit_count * contender.bytes.len;
                missed_opportunity_bytes += this_missed_bytes;
                std.debug.print("Missed opportunity ({B}): {s}\n", .{
                    this_missed_bytes,
                    path_kv.key_ptr.*,
                });
            }
        }
        // Write every non-generic version under its destination target directory.
        var hash_it = path_kv.value_ptr.*.iterator();
        while (hash_it.next()) |hash_kv| {
            const contents = hash_to_contents.get(hash_kv.value_ptr.*).?;
            if (contents.is_generic) continue;

            const dest_target = hash_kv.key_ptr.*;
            const full_path = try std.fs.path.join(allocator, &[_][]const u8{ out_dir, dest_target, path_kv.key_ptr.* });
            try std.fs.cwd().makePath(std.fs.path.dirname(full_path).?);
            try std.fs.cwd().writeFile(.{ .sub_path = full_path, .data = contents.bytes });
        }
    }
    std.debug.print("missed opportunities total: {B}\n", .{missed_opportunity_bytes});
}
365
/// Prints the command line synopsis and option descriptions with
/// `std.debug.print`, then terminates the process with exit code 1.
/// `arg0` is the program name shown in the synopsis. Never returns.
fn usageAndExit(arg0: []const u8) noreturn {
    std.debug.print("Usage: {s} [--search-path <dir>] --out <dir> --abi <name>\n", .{arg0});

    // Fixed explanatory lines, emitted verbatim in this order.
    const detail_lines = [_][]const u8{
        "--search-path can be used any number of times.\n",
        "    subdirectories of search paths look like, e.g. x86_64-linux-gnu\n",
        "--out is a dir that will be created, and populated with the results\n",
        "--abi is either glibc, musl, freebsd, or netbsd\n",
    };
    for (detail_lines) |line| {
        std.debug.print("{s}", .{line});
    }

    std.process.exit(1);
}