Commit cf88cf2657

Andrew Kelley <andrew@ziglang.org>
2022-01-31 09:10:50
std: make ArrayHashMap eql function accept an additional param
which is the index of the key that already exists in the hash map. This enables the use case of using `AutoArrayHashMap(void, void)` which may seem surprising at first, but is actually pretty handy! This commit includes a proof-of-concept of how I want to use it, with a new InternArena abstraction for stage2 that provides a compact way to store values (and types) in an "internment arena", thus making types stored exactly once (per arena), representable with a single u32 as a reference to a type within an InternArena, and comparable with a simple u32 integer comparison. If both types are in the same InternArena, you can check if they are equal by seeing if their index is the same. What's neat about `AutoArrayHashMap(void, void)` is that it allows us to look up the indexes by key, *without actually storing the keys*. Instead, keys are treated as ephemeral values that are constructed as needed. As a result, we have an extremely efficient encoding of types and values, represented only by three arrays, which has no pointers, and can therefore be serialized and deserialized by a single writev/readv call. The `map` field is denormalized data and can be computed from the other two fields. This is in contrast to our current Type/Value system which makes extensive use of pointers. The test at the bottom of InternArena.zig passes in this commit.
1 parent 91ad96b
lib/std/hash/auto_hash.zig
@@ -81,7 +81,6 @@ pub fn hash(hasher: anytype, key: anytype, comptime strat: HashStrategy) void {
         .NoReturn,
         .Opaque,
         .Undefined,
-        .Void,
         .Null,
         .ComptimeFloat,
         .ComptimeInt,
@@ -91,6 +90,8 @@ pub fn hash(hasher: anytype, key: anytype, comptime strat: HashStrategy) void {
         .Float,
         => @compileError("unable to hash type " ++ @typeName(Key)),
 
+        .Void => return,
+
         // Help the optimizer see that hashing an int is easy by inlining!
         // TODO Check if the situation is better after #561 is resolved.
         .Int => {
lib/std/array_hash_map.zig
@@ -37,8 +37,9 @@ pub const StringContext = struct {
         _ = self;
         return hashString(s);
     }
-    pub fn eql(self: @This(), a: []const u8, b: []const u8) bool {
+    pub fn eql(self: @This(), a: []const u8, b: []const u8, b_index: usize) bool {
         _ = self;
+        _ = b_index;
         return eqlString(a, b);
     }
 };
@@ -76,7 +77,7 @@ pub fn ArrayHashMap(
     comptime Context: type,
     comptime store_hash: bool,
 ) type {
-    comptime std.hash_map.verifyContext(Context, K, K, u32);
+    comptime std.hash_map.verifyContext(Context, K, K, u32, true);
     return struct {
         unmanaged: Unmanaged,
         allocator: Allocator,
@@ -462,7 +463,7 @@ pub fn ArrayHashMapUnmanaged(
     comptime Context: type,
     comptime store_hash: bool,
 ) type {
-    comptime std.hash_map.verifyContext(Context, K, K, u32);
+    comptime std.hash_map.verifyContext(Context, K, K, u32, true);
     return struct {
         /// It is permitted to access this field directly.
         entries: DataList = .{},
@@ -700,7 +701,7 @@ pub fn ArrayHashMapUnmanaged(
                 const hashes_array = slice.items(.hash);
                 const keys_array = slice.items(.key);
                 for (keys_array) |*item_key, i| {
-                    if (hashes_array[i] == h and checkedEql(ctx, key, item_key.*)) {
+                    if (hashes_array[i] == h and checkedEql(ctx, key, item_key.*, i)) {
                         return GetOrPutResult{
                             .key_ptr = item_key,
                             // workaround for #6974
@@ -933,7 +934,7 @@ pub fn ArrayHashMapUnmanaged(
                 const hashes_array = slice.items(.hash);
                 const keys_array = slice.items(.key);
                 for (keys_array) |*item_key, i| {
-                    if (hashes_array[i] == h and checkedEql(ctx, key, item_key.*)) {
+                    if (hashes_array[i] == h and checkedEql(ctx, key, item_key.*, i)) {
                         return i;
                     }
                 }
@@ -1245,7 +1246,7 @@ pub fn ArrayHashMapUnmanaged(
                 const keys_array = slice.items(.key);
                 for (keys_array) |*item_key, i| {
                     const hash_match = if (store_hash) hashes_array[i] == key_hash else true;
-                    if (hash_match and key_ctx.eql(key, item_key.*)) {
+                    if (hash_match and key_ctx.eql(key, item_key.*, i)) {
                         const removed_entry: KV = .{
                             .key = keys_array[i],
                             .value = slice.items(.value)[i],
@@ -1286,7 +1287,7 @@ pub fn ArrayHashMapUnmanaged(
                 const keys_array = slice.items(.key);
                 for (keys_array) |*item_key, i| {
                     const hash_match = if (store_hash) hashes_array[i] == key_hash else true;
-                    if (hash_match and key_ctx.eql(key, item_key.*)) {
+                    if (hash_match and key_ctx.eql(key, item_key.*, i)) {
                         switch (removal_type) {
                             .swap => self.entries.swapRemove(i),
                             .ordered => self.entries.orderedRemove(i),
@@ -1483,8 +1484,9 @@ pub fn ArrayHashMapUnmanaged(
 
                 // This pointer survives the following append because we call
                 // entries.ensureTotalCapacity before getOrPutInternal.
-                const hash_match = if (store_hash) h == hashes_array[slot_data.entry_index] else true;
-                if (hash_match and checkedEql(ctx, key, keys_array[slot_data.entry_index])) {
+                const i = slot_data.entry_index;
+                const hash_match = if (store_hash) h == hashes_array[i] else true;
+                if (hash_match and checkedEql(ctx, key, keys_array[i], i)) {
                     return .{
                         .found_existing = true,
                         .key_ptr = &keys_array[slot_data.entry_index],
@@ -1571,8 +1573,9 @@ pub fn ArrayHashMapUnmanaged(
                 if (slot_data.isEmpty() or slot_data.distance_from_start_index < distance_from_start_index)
                     return null;
 
-                const hash_match = if (store_hash) h == hashes_array[slot_data.entry_index] else true;
-                if (hash_match and checkedEql(ctx, key, keys_array[slot_data.entry_index]))
+                const i = slot_data.entry_index;
+                const hash_match = if (store_hash) h == hashes_array[i] else true;
+                if (hash_match and checkedEql(ctx, key, keys_array[i], i))
                     return slot;
             }
             unreachable;
@@ -1624,7 +1627,7 @@ pub fn ArrayHashMapUnmanaged(
         }
 
         inline fn checkedHash(ctx: anytype, key: anytype) u32 {
-            comptime std.hash_map.verifyContext(@TypeOf(ctx), @TypeOf(key), K, u32);
+            comptime std.hash_map.verifyContext(@TypeOf(ctx), @TypeOf(key), K, u32, true);
             // If you get a compile error on the next line, it means that
             const hash = ctx.hash(key); // your generic hash function doesn't accept your key
             if (@TypeOf(hash) != u32) {
@@ -1633,10 +1636,10 @@ pub fn ArrayHashMapUnmanaged(
             }
             return hash;
         }
-        inline fn checkedEql(ctx: anytype, a: anytype, b: K) bool {
-            comptime std.hash_map.verifyContext(@TypeOf(ctx), @TypeOf(a), K, u32);
+        inline fn checkedEql(ctx: anytype, a: anytype, b: K, b_index: usize) bool {
+            comptime std.hash_map.verifyContext(@TypeOf(ctx), @TypeOf(a), K, u32, true);
             // If you get a compile error on the next line, it means that
-            const eql = ctx.eql(a, b); // your generic eql function doesn't accept (self, adapt key, K)
+            const eql = ctx.eql(a, b, b_index); // your generic eql function doesn't accept (self, adapt key, K, index)
             if (@TypeOf(eql) != bool) {
                 @compileError("Context " ++ @typeName(@TypeOf(ctx)) ++ " has a generic eql function that returns the wrong type!\n" ++
                     @typeName(bool) ++ " was expected, but found " ++ @typeName(@TypeOf(eql)));
@@ -2255,9 +2258,10 @@ pub fn getAutoHashFn(comptime K: type, comptime Context: type) (fn (Context, K)
     }.hash;
 }
 
-pub fn getAutoEqlFn(comptime K: type, comptime Context: type) (fn (Context, K, K) bool) {
+pub fn getAutoEqlFn(comptime K: type, comptime Context: type) (fn (Context, K, K, usize) bool) {
     return struct {
-        fn eql(ctx: Context, a: K, b: K) bool {
+        fn eql(ctx: Context, a: K, b: K, b_index: usize) bool {
+            _ = b_index;
             _ = ctx;
             return meta.eql(a, b);
         }
lib/std/builtin.zig
@@ -202,12 +202,14 @@ pub const TypeInfo = union(enum) {
     /// therefore must be kept in sync with the compiler implementation.
     pub const Int = struct {
         signedness: Signedness,
+        /// TODO make this u16 instead of comptime_int
         bits: comptime_int,
     };
 
     /// This data structure is used by the Zig language code generation and
     /// therefore must be kept in sync with the compiler implementation.
     pub const Float = struct {
+        /// TODO make this u16 instead of comptime_int
         bits: comptime_int,
     };
 
@@ -217,6 +219,7 @@ pub const TypeInfo = union(enum) {
         size: Size,
         is_const: bool,
         is_volatile: bool,
+        /// TODO make this u16 instead of comptime_int
         alignment: comptime_int,
         address_space: AddressSpace,
         child: type,
lib/std/hash_map.zig
@@ -131,7 +131,13 @@ pub const default_max_load_percentage = 80;
 /// If you are passing a context to a *Adapted function, PseudoKey is the type
 /// of the key parameter.  Otherwise, when creating a HashMap or HashMapUnmanaged
 /// type, PseudoKey = Key = K.
-pub fn verifyContext(comptime RawContext: type, comptime PseudoKey: type, comptime Key: type, comptime Hash: type) void {
+pub fn verifyContext(
+    comptime RawContext: type,
+    comptime PseudoKey: type,
+    comptime Key: type,
+    comptime Hash: type,
+    comptime is_array: bool,
+) void {
     comptime {
         var allow_const_ptr = false;
         var allow_mutable_ptr = false;
@@ -166,7 +172,9 @@ pub fn verifyContext(comptime RawContext: type, comptime PseudoKey: type, compti
             const prefix = "\n  ";
             const deep_prefix = prefix ++ "  ";
             const hash_signature = "fn (self, " ++ @typeName(PseudoKey) ++ ") " ++ @typeName(Hash);
-            const eql_signature = "fn (self, " ++ @typeName(PseudoKey) ++ ", " ++ @typeName(Key) ++ ") bool";
+            const index_param = if (is_array) ", b_index: usize" else "";
+            const eql_signature = "fn (self, " ++ @typeName(PseudoKey) ++ ", " ++
+                @typeName(Key) ++ index_param ++ ") bool";
             const err_invalid_hash_signature = prefix ++ @typeName(Context) ++ ".hash must be " ++ hash_signature ++
                 deep_prefix ++ "but is actually " ++ @typeName(@TypeOf(Context.hash));
             const err_invalid_eql_signature = prefix ++ @typeName(Context) ++ ".eql must be " ++ eql_signature ++
@@ -255,7 +263,8 @@ pub fn verifyContext(comptime RawContext: type, comptime PseudoKey: type, compti
             const info = @typeInfo(@TypeOf(eql));
             if (info == .Fn) {
                 const func = info.Fn;
-                if (func.args.len != 3) {
+                const args_len = if (is_array) 4 else 3;
+                if (func.args.len != args_len) {
                     errors = errors ++ lazy.err_invalid_eql_signature;
                 } else {
                     var emitted_signature = false;
@@ -360,7 +369,7 @@ pub fn HashMap(
     comptime Context: type,
     comptime max_load_percentage: u64,
 ) type {
-    comptime verifyContext(Context, K, K, u64);
+    comptime verifyContext(Context, K, K, u64, false);
     return struct {
         unmanaged: Unmanaged,
         allocator: Allocator,
@@ -683,7 +692,7 @@ pub fn HashMapUnmanaged(
 ) type {
     if (max_load_percentage <= 0 or max_load_percentage >= 100)
         @compileError("max_load_percentage must be between 0 and 100.");
-    comptime verifyContext(Context, K, K, u64);
+    comptime verifyContext(Context, K, K, u64, false);
 
     return struct {
         const Self = @This();
@@ -1108,7 +1117,7 @@ pub fn HashMapUnmanaged(
         /// from this function.  To encourage that, this function is
         /// marked as inline.
         inline fn getIndex(self: Self, key: anytype, ctx: anytype) ?usize {
-            comptime verifyContext(@TypeOf(ctx), @TypeOf(key), K, Hash);
+            comptime verifyContext(@TypeOf(ctx), @TypeOf(key), K, Hash, false);
 
             if (self.size == 0) {
                 return null;
@@ -1291,7 +1300,7 @@ pub fn HashMapUnmanaged(
             return result;
         }
         pub fn getOrPutAssumeCapacityAdapted(self: *Self, key: anytype, ctx: anytype) GetOrPutResult {
-            comptime verifyContext(@TypeOf(ctx), @TypeOf(key), K, Hash);
+            comptime verifyContext(@TypeOf(ctx), @TypeOf(key), K, Hash, false);
 
             // If you get a compile error on this line, it means that your generic hash
             // function is invalid for these parameters.
src/InternArena.zig
@@ -0,0 +1,315 @@
+map: std.AutoArrayHashMapUnmanaged(void, void) = .{},
+items: std.MultiArrayList(Item) = .{},
+extra: std.ArrayListUnmanaged(u32) = .{},
+
+const InternArena = @This();
+const std = @import("std");
+const Allocator = std.mem.Allocator;
+const assert = std.debug.assert;
+
+const KeyAdapter = struct {
+    intern_arena: *const InternArena,
+
+    pub fn eql(ctx: @This(), a: Key, b_void: void, b_map_index: usize) bool {
+        _ = b_void;
+        return ctx.intern_arena.indexToKey(@intToEnum(Index, b_map_index)).eql(a);
+    }
+
+    pub fn hash(ctx: @This(), a: Key) u32 {
+        _ = ctx;
+        return a.hash();
+    }
+};
+
+pub const Key = union(enum) {
+    int_type: struct {
+        signedness: std.builtin.Signedness,
+        bits: u16,
+    },
+    ptr_type: struct {
+        elem_type: Index,
+        sentinel: Index,
+        alignment: u16,
+        size: std.builtin.TypeInfo.Pointer.Size,
+        is_const: bool,
+        is_volatile: bool,
+        is_allowzero: bool,
+        address_space: std.builtin.AddressSpace,
+    },
+    array_type: struct {
+        len: u64,
+        child: Index,
+        sentinel: Index,
+    },
+    vector_type: struct {
+        len: u32,
+        child: Index,
+    },
+    optional_type: struct {
+        payload_type: Index,
+    },
+    error_union_type: struct {
+        error_set_type: Index,
+        payload_type: Index,
+    },
+    simple: Simple,
+
+    pub fn hash(key: Key) u32 {
+        var hasher = std.hash.Wyhash.init(0);
+        switch (key) {
+            .int_type => |int_type| {
+                std.hash.autoHash(&hasher, int_type);
+            },
+            .array_type => |array_type| {
+                std.hash.autoHash(&hasher, array_type);
+            },
+            else => @panic("TODO"),
+        }
+        return @truncate(u32, hasher.final());
+    }
+
+    pub fn eql(a: Key, b: Key) bool {
+        const KeyTag = std.meta.Tag(Key);
+        const a_tag: KeyTag = a;
+        const b_tag: KeyTag = b;
+        if (a_tag != b_tag) return false;
+        switch (a) {
+            .int_type => |a_info| {
+                const b_info = b.int_type;
+                return std.meta.eql(a_info, b_info);
+            },
+            .array_type => |a_info| {
+                const b_info = b.array_type;
+                return std.meta.eql(a_info, b_info);
+            },
+            else => @panic("TODO"),
+        }
+    }
+};
+
+pub const Item = struct {
+    tag: Tag,
+    /// The doc comments on the respective Tag explain how to interpret this.
+    data: u32,
+};
+
+/// Represents an index into `map`. It represents the canonical index
+/// of a `Value` within this `InternArena`. The values are typed.
+/// Two values which have the same type can be equality compared simply
+/// by checking if their indexes are equal, provided they are both in
+/// the same `InternArena`.
+pub const Index = enum(u32) {
+    none = std.math.maxInt(u32),
+    _,
+};
+
+pub const Tag = enum(u8) {
+    /// An integer type.
+    /// data is number of bits
+    type_int_signed,
+    /// An integer type.
+    /// data is number of bits
+    type_int_unsigned,
+    /// An array type.
+    /// data is payload to Array.
+    type_array,
+    /// A type or value that can be represented with only an enum tag.
+    /// data is Simple enum value
+    simple,
+    /// An unsigned integer value that can be represented by u32.
+    /// data is integer value
+    int_u32,
+    /// An unsigned integer value that can be represented by i32.
+    /// data is integer value bitcasted to u32.
+    int_i32,
+    /// A positive integer value that does not fit in 32 bits.
+    /// data is a extra index to BigInt.
+    int_big_positive,
+    /// A negative integer value that does not fit in 32 bits.
+    /// data is a extra index to BigInt.
+    int_big_negative,
+    /// A float value that can be represented by f32.
+    /// data is float value bitcasted to u32.
+    float_f32,
+    /// A float value that can be represented by f64.
+    /// data is payload index to Float64.
+    float_f64,
+    /// A float value that can be represented by f128.
+    /// data is payload index to Float128.
+    float_f128,
+};
+
+pub const Simple = enum(u32) {
+    f16,
+    f32,
+    f64,
+    f80,
+    f128,
+    usize,
+    isize,
+    c_short,
+    c_ushort,
+    c_int,
+    c_uint,
+    c_long,
+    c_ulong,
+    c_longlong,
+    c_ulonglong,
+    c_longdouble,
+    anyopaque,
+    bool,
+    void,
+    type,
+    anyerror,
+    comptime_int,
+    comptime_float,
+    noreturn,
+    @"anyframe",
+    null_type,
+    undefined_type,
+    enum_literal_type,
+    @"undefined",
+    void_value,
+    @"null",
+    bool_true,
+    bool_false,
+};
+
+pub const Array = struct {
+    len: u32,
+    child: Index,
+};
+
+pub fn deinit(ia: *InternArena, gpa: Allocator) void {
+    ia.map.deinit(gpa);
+    ia.items.deinit(gpa);
+    ia.extra.deinit(gpa);
+}
+
+pub fn indexToKey(ia: InternArena, index: Index) Key {
+    const data = ia.items.items(.data)[@enumToInt(index)];
+    return switch (ia.items.items(.tag)[@enumToInt(index)]) {
+        .type_int_signed => .{
+            .int_type = .{
+                .signedness = .signed,
+                .bits = @intCast(u16, data),
+            },
+        },
+        .type_int_unsigned => .{
+            .int_type = .{
+                .signedness = .unsigned,
+                .bits = @intCast(u16, data),
+            },
+        },
+        .type_array => {
+            const array_info = ia.extraData(Array, data);
+            return .{ .array_type = .{
+                .len = array_info.len,
+                .child = array_info.child,
+                .sentinel = .none,
+            } };
+        },
+        .simple => .{ .simple = @intToEnum(Simple, data) },
+
+        else => @panic("TODO"),
+    };
+}
+
+pub fn get(ia: *InternArena, gpa: Allocator, key: Key) Allocator.Error!Index {
+    const adapter: KeyAdapter = .{ .intern_arena = ia };
+    const gop = try ia.map.getOrPutAdapted(gpa, key, adapter);
+    if (gop.found_existing) {
+        return @intToEnum(Index, gop.index);
+    }
+    switch (key) {
+        .int_type => |int_type| {
+            const tag: Tag = switch (int_type.signedness) {
+                .signed => .type_int_signed,
+                .unsigned => .type_int_unsigned,
+            };
+            try ia.items.append(gpa, .{
+                .tag = tag,
+                .data = int_type.bits,
+            });
+        },
+        .array_type => |array_type| {
+            const len = @intCast(u32, array_type.len); // TODO have a big_array encoding
+            assert(array_type.sentinel == .none); // TODO have a sentinel_array encoding
+            try ia.items.append(gpa, .{
+                .tag = .type_array,
+                .data = try ia.addExtra(gpa, Array{
+                    .len = len,
+                    .child = array_type.child,
+                }),
+            });
+        },
+        else => @panic("TODO"),
+    }
+    return @intToEnum(Index, ia.items.len - 1);
+}
+
+fn addExtra(ia: *InternArena, gpa: Allocator, extra: anytype) Allocator.Error!u32 {
+    const fields = std.meta.fields(@TypeOf(extra));
+    try ia.extra.ensureUnusedCapacity(gpa, fields.len);
+    return ia.addExtraAssumeCapacity(extra);
+}
+
+fn addExtraAssumeCapacity(ia: *InternArena, extra: anytype) u32 {
+    const fields = std.meta.fields(@TypeOf(extra));
+    const result = @intCast(u32, ia.extra.items.len);
+    inline for (fields) |field| {
+        ia.extra.appendAssumeCapacity(switch (field.field_type) {
+            u32 => @field(extra, field.name),
+            Index => @enumToInt(@field(extra, field.name)),
+            i32 => @bitCast(u32, @field(extra, field.name)),
+            else => @compileError("bad field type"),
+        });
+    }
+    return result;
+}
+
+fn extraData(ia: InternArena, comptime T: type, index: usize) T {
+    const fields = std.meta.fields(T);
+    var i: usize = index;
+    var result: T = undefined;
+    inline for (fields) |field| {
+        @field(result, field.name) = switch (field.field_type) {
+            u32 => ia.extra.items[i],
+            Index => @intToEnum(Index, ia.extra.items[i]),
+            i32 => @bitCast(i32, ia.extra.items[i]),
+            else => @compileError("bad field type"),
+        };
+        i += 1;
+    }
+    return result;
+}
+
+test "basic usage" {
+    const gpa = std.testing.allocator;
+
+    var ia: InternArena = .{};
+    defer ia.deinit(gpa);
+
+    const i32_type = try ia.get(gpa, .{ .int_type = .{
+        .signedness = .signed,
+        .bits = 32,
+    } });
+    const array_i32 = try ia.get(gpa, .{ .array_type = .{
+        .len = 10,
+        .child = i32_type,
+        .sentinel = .none,
+    } });
+
+    const another_i32_type = try ia.get(gpa, .{ .int_type = .{
+        .signedness = .signed,
+        .bits = 32,
+    } });
+    try std.testing.expect(another_i32_type == i32_type);
+
+    const another_array_i32 = try ia.get(gpa, .{ .array_type = .{
+        .len = 10,
+        .child = i32_type,
+        .sentinel = .none,
+    } });
+    try std.testing.expect(another_array_i32 == array_i32);
+}