Commit `d6a88ed74d`

Andrew Kelley <andrew@ziglang.org>

2025-02-25 05:24:52

introduce package id and redo hash format again

Introduces the `id` field to `build.zig.zon`.

Together with name, this represents a globally unique package identifier. This field should be initialized with a 16-bit random number when the package is first created, and then *never change*. This allows Zig to unambiguously detect when one package is an updated version of another.

When forking a Zig project, this id should be regenerated with a new random number if the upstream project is still maintained. Otherwise, the fork is *hostile*, attempting to take control over the original project's identity.

`0x0000` is invalid because it obviously means a random number wasn't used.

`0xffff` is reserved to represent "naked" packages.

Tracking issue #14288

Additionally:

* Fix bad path in error messages regarding build.zig.zon file.
* Manifest validates that `name` and `version` field of build.zig.zon are maximum 32 bytes.
* Introduce an error for a root package that has not switched to an enum literal for `name`.
* Introduce an error for a root package that omits `id`.
* Update init template to generate `id`.
* Update init template to populate `minimum_zig_version`.
* New package hash format changes:
  - name and version limited to 32 bytes via error rather than truncation
  - truncate sha256 to 192 bits rather than 40 bits
  - include the package id

This means that, given only the package hashes for a complete dependency tree, it is possible to perform version selection and know the final size on disk, without doing any fetching whatsoever. This prevents wasted bandwidth since package versions not selected do not need to be fetched.

master

1 parent 9763dd2

Changed files (8)

doc

build.zig.zon.md

lib

init

src

main.zig

build.zig

build.zig.zon

src

Package

@@ -10,7 +10,7 @@ build.zig.
 
 ### `name`
 
-String. Required.
+Enum literal. Required.
 
 This is the default name used by packages depending on this one. For example,
 when a user runs `zig fetch --save <url>`, this field is used as the key in the
@@ -20,12 +20,31 @@ will stick with this provided value.
 It is redundant to include "zig" in this name because it is already within the
 Zig package namespace.
 
+Must be a valid bare Zig identifier (don't `@` me), limited to 32 bytes.
+
+### `id`
+
+Together with name, this represents a globally unique package identifier. This
+field should be initialized with a 16-bit random number when the package is
+first created, and then *never change*. This allows Zig to unambiguously detect
+when one package is an updated version of another.
+
+When forking a Zig project, this id should be regenerated with a new random
+number if the upstream project is still maintained. Otherwise, the fork is
+*hostile*, attempting to take control over the original project's identity.
+
+`0x0000` is invalid because it obviously means a random number wasn't used.
+
+`0xffff` is reserved to represent "naked" packages.
+
 ### `version`
 
 String. Required.
 
 [semver](https://semver.org/)
 
+Limited to 32 bytes.
+
 ### `minimum_zig_version`
 
 String. Optional.

@@ -43,4 +43,4 @@ test "fuzz example" {
 const std = @import("std");
 
 /// This imports the separate module containing `root.zig`. Take a look in `build.zig` for details.
-const lib = @import("$_lib");
+const lib = @import("$n_lib");

@@ -42,14 +42,14 @@ pub fn build(b: *std.Build) void {
     // Modules can depend on one another using the `std.Build.Module.addImport` function.
     // This is what allows Zig source code to use `@import("foo")` where 'foo' is not a
     // file path. In this case, we set up `exe_mod` to import `lib_mod`.
-    exe_mod.addImport("$_lib", lib_mod);
+    exe_mod.addImport("$n_lib", lib_mod);
 
     // Now, we will create a static library based on the module we created above.
     // This creates a `std.Build.Step.Compile`, which is the build step responsible
     // for actually invoking the compiler.
     const lib = b.addLibrary(.{
         .linkage = .static,
-        .name = "$",
+        .name = "$n",
         .root_module = lib_mod,
     });
 
@@ -61,7 +61,7 @@ pub fn build(b: *std.Build) void {
     // This creates another `std.Build.Step.Compile`, but this one builds an executable
     // rather than a static library.
     const exe = b.addExecutable(.{
-        .name = "$",
+        .name = "$n",
         .root_module = exe_mod,
     });

@@ -6,12 +6,29 @@
     //
     // It is redundant to include "zig" in this name because it is already
     // within the Zig package namespace.
-    .name = "$",
+    .name = .$n,
 
     // This is a [Semantic Version](https://semver.org/).
     // In a future version of Zig it will be used for package deduplication.
     .version = "0.0.0",
 
+    // Together with name, this represents a globally unique package
+    // identifier. This field should be initialized with a 16-bit random number
+    // when the package is first created, and then *never change*. This allows
+    // unambiguous detection when one package is an updated version of another.
+    //
+    // When forking a Zig project, this id should be regenerated with a new
+    // random number if the upstream project is still maintained. Otherwise,
+    // the fork is *hostile*, attempting to take control over the original
+    // project's identity. Thus it is recommended to leave the comment on the
+    // following line intact, so that it shows up in code reviews that modify
+    // the field.
+    .id = $i, // Changing this has security and trust implications.
+
+    // Tracks the earliest Zig version that the package considers to be a
+    // supported use case.
+    .minimum_zig_version = "$v",
+
     // This field is optional.
     // This is currently advisory only; Zig does not yet do anything
     // with this value.

@@ -586,9 +586,11 @@ pub fn computedPackageHash(f: *const Fetch) Package.Hash {
     if (f.manifest) |man| {
         var version_buffer: [32]u8 = undefined;
         const version: []const u8 = std.fmt.bufPrint(&version_buffer, "{}", .{man.version}) catch &version_buffer;
-        return .init(f.computed_hash.digest, man.name, version, saturated_size);
+        return .init(f.computed_hash.digest, man.name, version, man.id, saturated_size);
     }
-    return .initNaked(f.computed_hash.digest, saturated_size);
+    // In the future build.zig.zon fields will be added to allow overriding these values
+    // for naked tarballs.
+    return .init(f.computed_hash.digest, "N", "V", 0xffff, saturated_size);
 }
 
 /// `computeHash` gets a free check for the existence of `build.zig`, but when
@@ -645,11 +647,13 @@ fn loadManifest(f: *Fetch, pkg_root: Cache.Path) RunError!void {
 
     f.manifest = try Manifest.parse(arena, ast.*, .{
         .allow_missing_paths_field = f.allow_missing_paths_field,
+        .allow_missing_id = f.allow_missing_paths_field,
+        .allow_name_string = f.allow_missing_paths_field,
     });
     const manifest = &f.manifest.?;
 
     if (manifest.errors.len > 0) {
-        const src_path = try eb.printString("{}{s}", .{ pkg_root, Manifest.basename });
+        const src_path = try eb.printString("{}" ++ fs.path.sep_str ++ "{s}", .{ pkg_root, Manifest.basename });
         try manifest.copyErrorsIntoBundle(ast.*, src_path, eb);
         return error.FetchFailed;
     }

@@ -36,6 +36,7 @@ pub const ErrorMessage = struct {
 };
 
 name: []const u8,
+id: u16,
 version: std.SemanticVersion,
 version_node: Ast.Node.Index,
 dependencies: std.StringArrayHashMapUnmanaged(Dependency),
@@ -50,6 +51,8 @@ pub const ParseOptions = struct {
     allow_missing_paths_field: bool = false,
     /// Deprecated, to be removed after 0.14.0 is tagged.
     allow_name_string: bool = true,
+    /// Deprecated, to be removed after 0.14.0 is tagged.
+    allow_missing_id: bool = true,
 };
 
 pub const Error = Allocator.Error;
@@ -70,6 +73,7 @@ pub fn parse(gpa: Allocator, ast: Ast, options: ParseOptions) Error!Manifest {
         .errors = .{},
 
         .name = undefined,
+        .id = 0,
         .version = undefined,
         .version_node = 0,
         .dependencies = .{},
@@ -77,6 +81,7 @@ pub fn parse(gpa: Allocator, ast: Ast, options: ParseOptions) Error!Manifest {
         .paths = .{},
         .allow_missing_paths_field = options.allow_missing_paths_field,
         .allow_name_string = options.allow_name_string,
+        .allow_missing_id = options.allow_missing_id,
         .minimum_zig_version = null,
         .buf = .{},
     };
@@ -92,6 +97,7 @@ pub fn parse(gpa: Allocator, ast: Ast, options: ParseOptions) Error!Manifest {
 
     return .{
         .name = p.name,
+        .id = p.id,
         .version = p.version,
         .version_node = p.version_node,
         .dependencies = try p.dependencies.clone(p.arena),
@@ -143,6 +149,7 @@ const Parse = struct {
     errors: std.ArrayListUnmanaged(ErrorMessage),
 
     name: []const u8,
+    id: u16,
     version: std.SemanticVersion,
     version_node: Ast.Node.Index,
     dependencies: std.StringArrayHashMapUnmanaged(Dependency),
@@ -150,6 +157,7 @@ const Parse = struct {
     paths: std.StringArrayHashMapUnmanaged(void),
     allow_missing_paths_field: bool,
     allow_name_string: bool,
+    allow_missing_id: bool,
     minimum_zig_version: ?std.SemanticVersion,
 
     const InnerError = error{ ParseFailure, OutOfMemory };
@@ -167,6 +175,7 @@ const Parse = struct {
         var have_name = false;
         var have_version = false;
         var have_included_paths = false;
+        var have_id = false;
 
         for (struct_init.ast.fields) |field_init| {
             const name_token = ast.firstToken(field_init) - 2;
@@ -183,6 +192,9 @@ const Parse = struct {
             } else if (mem.eql(u8, field_name, "name")) {
                 p.name = try parseName(p, field_init);
                 have_name = true;
+            } else if (mem.eql(u8, field_name, "id")) {
+                p.id = try parseId(p, field_init);
+                have_id = true;
             } else if (mem.eql(u8, field_name, "version")) {
                 p.version_node = field_init;
                 const version_text = try parseString(p, field_init);
@@ -206,6 +218,12 @@ const Parse = struct {
             }
         }
 
+        if (!have_id and !p.allow_missing_id) {
+            try appendError(p, main_token, "missing top-level 'id' field; suggested value: 0x{x}", .{
+                Package.randomId(),
+            });
+        }
+
         if (!have_name) {
             try appendError(p, main_token, "missing top-level 'name' field", .{});
         }
@@ -359,6 +377,33 @@ const Parse = struct {
         }
     }
 
+    /// Parses the value of the top-level `id` field.
+    ///
+    /// Only an integer literal that fits in `u16` and is neither `0x0000` nor
+    /// `0xffff` is accepted. Anything else is reported as an error located at
+    /// the value's main token; for the two reserved values the message also
+    /// proposes a freshly generated replacement id.
+    fn parseId(p: *Parse, node: Ast.Node.Index) !u16 {
+        const ast = p.ast;
+        const id_token = ast.nodes.items(.main_token)[node];
+
+        // Reject strings, enum literals, etc. before looking at the token text.
+        if (ast.nodes.items(.tag)[node] != .number_literal) {
+            return fail(p, id_token, "expected integer literal", .{});
+        }
+
+        const literal = std.zig.parseNumberLiteral(ast.tokenSlice(id_token));
+        const value = switch (literal) {
+            .int => |int| int,
+            .big_int, .float => return fail(p, id_token, "expected u16 integer literal, found {s}", .{
+                @tagName(literal),
+            }),
+            .failure => |err| return fail(p, id_token, "bad integer literal: {s}", .{@tagName(err)}),
+        };
+
+        const id = std.math.cast(u16, value) orelse
+            return fail(p, id_token, "integer value {d} does not fit into u16", .{value});
+
+        // Both extremes are off limits; suggest a usable id in the diagnostic.
+        if (id == 0x0000 or id == 0xffff) {
+            return fail(p, id_token, "id value 0x{x} reserved; use 0x{x} instead", .{
+                id, Package.randomId(),
+            });
+        }
+        return id;
+    }
+
     fn parseName(p: *Parse, node: Ast.Node.Index) ![]const u8 {
         const ast = p.ast;
         const node_tags = ast.nodes.items(.tag);
@@ -371,7 +416,7 @@ const Parse = struct {
                 return fail(p, main_token, "name must be a valid bare zig identifier (hint: switch from string to enum literal)", .{});
 
             if (name.len > max_name_len)
-                return fail(p, main_token, "name '{s}' exceeds max length of {d}", .{
+                return fail(p, main_token, "name '{}' exceeds max length of {d}", .{
                     std.zig.fmtId(name), max_name_len,
                 });
 
@@ -386,7 +431,7 @@ const Parse = struct {
             return fail(p, main_token, "name must be a valid bare zig identifier", .{});
 
         if (ident_name.len > max_name_len)
-            return fail(p, main_token, "name '{s}' exceeds max length of {d}", .{
+            return fail(p, main_token, "name '{}' exceeds max length of {d}", .{
                 std.zig.fmtId(ident_name), max_name_len,
             });

@@ -4751,8 +4751,10 @@ fn cmdInit(gpa: Allocator, arena: Allocator, args: []const []const u8) !void {
     };
     var ok_count: usize = 0;
 
+    const id = Package.randomId();
+
     for (template_paths) |template_path| {
-        if (templates.write(arena, fs.cwd(), cwd_basename, template_path)) |_| {
+        if (templates.write(arena, fs.cwd(), cwd_basename, template_path, id)) |_| {
             std.log.info("created {s}", .{template_path});
             ok_count += 1;
         } else |err| switch (err) {
@@ -7430,10 +7432,10 @@ fn loadManifest(
             0,
         ) catch |err| switch (err) {
             error.FileNotFound => {
+                const id = Package.randomId();
                 var templates = findTemplates(gpa, arena);
                 defer templates.deinit();
-
-                templates.write(arena, options.dir, options.root_name, Package.Manifest.basename) catch |e| {
+                templates.write(arena, options.dir, options.root_name, Package.Manifest.basename, id) catch |e| {
                     fatal("unable to write {s}: {s}", .{
                         Package.Manifest.basename, @errorName(e),
                     });
@@ -7491,6 +7493,7 @@ const Templates = struct {
         out_dir: fs.Dir,
         root_name: []const u8,
         template_path: []const u8,
+        id: u16,
     ) !void {
         if (fs.path.dirname(template_path)) |dirname| {
             out_dir.makePath(dirname) catch |err| {
@@ -7504,13 +7507,28 @@ const Templates = struct {
         };
         templates.buffer.clearRetainingCapacity();
         try templates.buffer.ensureUnusedCapacity(contents.len);
-        for (contents) |c| {
-            if (c == '$') {
-                try templates.buffer.appendSlice(root_name);
-            } else {
-                try templates.buffer.append(c);
-            }
-        }
+        var state: enum { start, dollar } = .start;
+        for (contents) |c| switch (state) {
+            .start => switch (c) {
+                '$' => state = .dollar,
+                else => try templates.buffer.append(c),
+            },
+            .dollar => switch (c) {
+                'n' => {
+                    try templates.buffer.appendSlice(root_name);
+                    state = .start;
+                },
+                'i' => {
+                    try templates.buffer.writer().print("0x{x}", .{id});
+                    state = .start;
+                },
+                'v' => {
+                    try templates.buffer.appendSlice(build_options.version);
+                    state = .start;
+                },
+                else => fatal("unknown substitution: ${c}", .{c}),
+            },
+        };
 
         return out_dir.writeFile(.{
             .sub_path = template_path,

@@ -10,9 +10,17 @@ pub const multihash_len = 1 + 1 + Hash.Algo.digest_length;
 pub const multihash_hex_digest_len = 2 * multihash_len;
 pub const MultiHashHexDigest = [multihash_hex_digest_len]u8;
 
+/// Generates a package id suitable for a newly created package.
+///
+/// The result is never `0x0000` (invalid) and never `0xffff` (reserved for
+/// "naked" packages).
+pub fn randomId() u16 {
+    const lowest_valid: u16 = 0x0001;
+    // Used as an exclusive upper bound, so the reserved value is never produced.
+    const reserved_naked: u16 = 0xffff;
+    return std.crypto.random.intRangeLessThan(u16, lowest_valid, reserved_naked);
+}
+
 /// A user-readable, file system safe hash that identifies an exact package
 /// snapshot, including file contents.
 ///
+/// The hash is not only to prevent collisions but must resist attacks where
+/// the adversary fully controls the contents being hashed. Thus, the current
+/// format embeds the first 192 bits (24 bytes) of a SHA-256 digest.
+///
 /// This data structure can be used to store the legacy hash format too. Legacy
 /// hash format is scheduled to be removed after 0.14.0 is tagged.
 ///
@@ -26,7 +34,8 @@ pub const Hash = struct {
     pub const Algo = std.crypto.hash.sha2.Sha256;
     pub const Digest = [Algo.digest_length]u8;
 
-    pub const max_len = 32 + 1 + 32 + 1 + 12;
+    /// Example: "nnnn-vvvv-hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh"
+    pub const max_len = 32 + 1 + 32 + 1 + (16 + 32 + 192) / 6;
 
     pub fn fromSlice(s: []const u8) Hash {
         assert(s.len <= max_len);
@@ -62,48 +71,35 @@ pub const Hash = struct {
         try std.testing.expect(h.isOld());
     }
 
-    /// Produces "$name-$semver-$sizedhash".
+    /// Produces "$name-$semver-$hashplus".
     /// * name is the name field from build.zig.zon, asserted to be at most 32
     ///   bytes and must be a valid zig identifier
     /// * semver is the version field from build.zig.zon, asserted to be at most
     ///   32 bytes
-    /// * sizedhash is the following 9-byte array, base64 encoded using -_ to make
+    /// * hashplus is the following 30-byte array, base64 encoded using -_ to make
     ///   it filesystem safe:
-    ///   - (4 bytes) LE u32 total decompressed size in bytes
-    ///   - (5 bytes) truncated SHA-256 of hashed files of the package
+    ///   - (2 bytes) LE u16 Package ID
+    ///   - (4 bytes) LE u32 total decompressed size in bytes, overflow saturated
+    ///   - (24 bytes) truncated SHA-256 digest of hashed files of the package
     ///
-    /// example: "nasm-2.16.1-2-BWdcABvF_jM1"
-    pub fn init(digest: Digest, name: []const u8, ver: []const u8, size: u32) Hash {
+    /// example: "nasm-2.16.1-3-AAD_ZlwACpGU-c3QXp_yNyn07Q5U9Rq-Cb1ur2G1"
+    pub fn init(digest: Digest, name: []const u8, ver: []const u8, id: u16, size: u32) Hash {
+        assert(name.len <= 32); // the 32-byte limits are a caller obligation; nothing is truncated below
+        assert(ver.len <= 32);
         var result: Hash = undefined;
         var buf: std.ArrayListUnmanaged(u8) = .initBuffer(&result.bytes);
-        buf.appendSliceAssumeCapacity(name[0..@min(name.len, 32)]);
+        buf.appendSliceAssumeCapacity(name);
         buf.appendAssumeCapacity('-');
-        buf.appendSliceAssumeCapacity(ver[0..@min(ver.len, 32)]);
+        buf.appendSliceAssumeCapacity(ver);
         buf.appendAssumeCapacity('-');
-        var sizedhash: [9]u8 = undefined;
-        std.mem.writeInt(u32, sizedhash[0..4], size, .little);
-        sizedhash[4..].* = digest[0..5].*;
-        _ = std.base64.url_safe_no_pad.Encoder.encode(buf.addManyAsArrayAssumeCapacity(12), &sizedhash);
+        var hashplus: [30]u8 = undefined; // 2-byte id + 4-byte size + 24-byte digest prefix
+        std.mem.writeInt(u16, hashplus[0..2], id, .little);
+        std.mem.writeInt(u32, hashplus[2..6], size, .little);
+        hashplus[6..].* = digest[0..24].*;
+        _ = std.base64.url_safe_no_pad.Encoder.encode(buf.addManyAsArrayAssumeCapacity(40), &hashplus); // 30 bytes encode to exactly 40 unpadded base64 characters
         @memset(buf.unusedCapacitySlice(), 0); // zero the unused tail of result.bytes
         return result;
     }
 
-    /// Produces "$hashiname-N-$sizedhash". For packages that lack "build.zig.zon" metadata.
-    /// * hashiname is [5..][0..24] bytes of the SHA-256, urlsafe-base64-encoded, for a total of 32 bytes encoded
-    /// * the semver section is replaced with a hardcoded N which stands for
-    ///   "naked". It acts as a version number so that any future updates to the
-    ///   hash format can tell this hash format apart. Note that "N" is an
-    ///   invalid semver.
-    /// * sizedhash is the same as in `init`.
-    ///
-    /// The hash is broken up this way so that "sizedhash" can be calculated
-    /// exactly the same way in both cases, and so that "name" and "hashiname" can
-    /// be used interchangeably in both cases.
-    pub fn initNaked(digest: Digest, size: u32) Hash {
-        var name: [32]u8 = undefined;
-        _ = std.base64.url_safe_no_pad.Encoder.encode(&name, digest[5..][0..24]);
-        return init(digest, &name, "N", size);
-    }
-
     /// Produces a unique hash based on the path provided. The result should
     /// not be user-visible.
     pub fn initPath(sub_path: []const u8, is_global: bool) Hash {
@@ -144,7 +140,7 @@ pub const MultihashFunction = enum(u16) {
 
 pub const multihash_function: MultihashFunction = switch (Hash.Algo) {
     std.crypto.hash.sha2.Sha256 => .@"sha2-256",
-    else => @compileError("unreachable"),
+    else => unreachable, // initializer is evaluated at comptime; an Algo with no mapping is expected to surface as a compile error
 };
 
 pub fn multiHashHexDigest(digest: Hash.Digest) MultiHashHexDigest {

Commit d6a88ed74d

Commit `d6a88ed74d`