Commit 818fbd9c56

Andrew Kelley <andrew@ziglang.org>
2022-05-24 09:10:56
stage2: string literal interning
This is a temporary addition to stage2 in order to match stage1 behavior, however the end-game once the lang spec is settled will be to use a global InternPool for comptime memoized objects, making this behavior consistent across all types, not only string literals. Or, we might decide to not guarantee string literals to have equal comptime pointers, in which case this commit can be reverted.
1 parent 8171972
src/codegen/llvm.zig
@@ -2936,9 +2936,39 @@ pub const DeclGen = struct {
                     return dg.context.constString(
                         bytes.ptr,
                         @intCast(c_uint, tv.ty.arrayLenIncludingSentinel()),
-                        .True, // don't null terminate. bytes has the sentinel, if any.
+                        .True, // Don't null terminate. Bytes has the sentinel, if any.
                     );
                 },
+                .str_lit => {
+                    const str_lit = tv.val.castTag(.str_lit).?.data;
+                    const bytes = dg.module.string_literal_bytes.items[str_lit.index..][0..str_lit.len];
+                    if (tv.ty.sentinel()) |sent_val| {
+                        const byte = @intCast(u8, sent_val.toUnsignedInt(target));
+                        if (byte == 0 and bytes.len > 0) {
+                            return dg.context.constString(
+                                bytes.ptr,
+                                @intCast(c_uint, bytes.len),
+                                .False, // Yes, null terminate.
+                            );
+                        }
+                        var array = std.ArrayList(u8).init(dg.gpa);
+                        defer array.deinit();
+                        try array.ensureUnusedCapacity(bytes.len + 1);
+                        array.appendSliceAssumeCapacity(bytes);
+                        array.appendAssumeCapacity(byte);
+                        return dg.context.constString(
+                            array.items.ptr,
+                            @intCast(c_uint, array.items.len),
+                            .True, // Don't null terminate.
+                        );
+                    } else {
+                        return dg.context.constString(
+                            bytes.ptr,
+                            @intCast(c_uint, bytes.len),
+                            .True, // Don't null terminate. `bytes` has the sentinel, if any.
+                        );
+                    }
+                },
                 .aggregate => {
                     const elem_vals = tv.val.castTag(.aggregate).?.data;
                     const elem_ty = tv.ty.elemType();
src/codegen.zig
@@ -203,11 +203,23 @@ pub fn generateSymbol(
         },
         .Array => switch (typed_value.val.tag()) {
             .bytes => {
-                const payload = typed_value.val.castTag(.bytes).?;
+                const bytes = typed_value.val.castTag(.bytes).?.data;
                 const len = @intCast(usize, typed_value.ty.arrayLenIncludingSentinel());
                 // The bytes payload already includes the sentinel, if any
                 try code.ensureUnusedCapacity(len);
-                code.appendSliceAssumeCapacity(payload.data[0..len]);
+                code.appendSliceAssumeCapacity(bytes[0..len]);
+                return Result{ .appended = {} };
+            },
+            .str_lit => {
+                const str_lit = typed_value.val.castTag(.str_lit).?.data;
+                const mod = bin_file.options.module.?;
+                const bytes = mod.string_literal_bytes.items[str_lit.index..][0..str_lit.len];
+                try code.ensureUnusedCapacity(bytes.len + 1);
+                code.appendSliceAssumeCapacity(bytes);
+                if (typed_value.ty.sentinel()) |sent_val| {
+                    const byte = @intCast(u8, sent_val.toUnsignedInt(target));
+                    code.appendAssumeCapacity(byte);
+                }
                 return Result{ .appended = {} };
             },
             .aggregate => {
src/Module.zig
@@ -70,6 +70,17 @@ import_table: std.StringArrayHashMapUnmanaged(*File) = .{},
 /// Keys are fully resolved file paths. This table owns the keys and values.
 embed_table: std.StringHashMapUnmanaged(*EmbedFile) = .{},
 
+/// This is a temporary addition to stage2 in order to match stage1 behavior,
+/// however the end-game once the lang spec is settled will be to use a global
+/// InternPool for comptime memoized objects, making this behavior consistent across all types,
+/// not only string literals. Or, we might decide to not guarantee string literals
+/// to have equal comptime pointers, in which case this field can be deleted (perhaps
+/// the commit that introduced it can simply be reverted).
+/// This table uses an optional index so that when a Decl is destroyed, the string literal
+/// is still reclaimable by a future Decl.
+string_literal_table: std.HashMapUnmanaged(StringLiteralContext.Key, Decl.OptionalIndex, StringLiteralContext, std.hash_map.default_max_load_percentage) = .{},
+string_literal_bytes: std.ArrayListUnmanaged(u8) = .{},
+
 /// The set of all the generic function instantiations. This is used so that when a generic
 /// function is called twice with the same comptime parameter arguments, both calls dispatch
 /// to the same function.
@@ -157,6 +168,39 @@ decls_free_list: std.ArrayListUnmanaged(Decl.Index) = .{},
 
 global_assembly: std.AutoHashMapUnmanaged(Decl.Index, []u8) = .{},
 
+pub const StringLiteralContext = struct {
+    bytes: *std.ArrayListUnmanaged(u8),
+
+    pub const Key = struct {
+        index: u32,
+        len: u32,
+    };
+
+    pub fn eql(self: @This(), a: Key, b: Key) bool {
+        _ = self;
+        return a.index == b.index and a.len == b.len;
+    }
+
+    pub fn hash(self: @This(), x: Key) u64 {
+        const x_slice = self.bytes.items[x.index..][0..x.len];
+        return std.hash_map.hashString(x_slice);
+    }
+};
+
+pub const StringLiteralAdapter = struct {
+    bytes: *std.ArrayListUnmanaged(u8),
+
+    pub fn eql(self: @This(), a_slice: []const u8, b: StringLiteralContext.Key) bool {
+        const b_slice = self.bytes.items[b.index..][0..b.len];
+        return mem.eql(u8, a_slice, b_slice);
+    }
+
+    pub fn hash(self: @This(), adapted_key: []const u8) u64 {
+        _ = self;
+        return std.hash_map.hashString(adapted_key);
+    }
+};
+
 const MonomorphedFuncsSet = std.HashMapUnmanaged(
     *Fn,
     void,
@@ -507,7 +551,8 @@ pub const Decl = struct {
         decl.name = undefined;
     }
 
-    pub fn clearValues(decl: *Decl, gpa: Allocator) void {
+    pub fn clearValues(decl: *Decl, mod: *Module) void {
+        const gpa = mod.gpa;
         if (decl.getExternFn()) |extern_fn| {
             extern_fn.deinit(gpa);
             gpa.destroy(extern_fn);
@@ -521,6 +566,13 @@ pub const Decl = struct {
             gpa.destroy(variable);
         }
         if (decl.value_arena) |arena_state| {
+            if (decl.owns_tv) {
+                if (decl.val.castTag(.str_lit)) |str_lit| {
+                    mod.string_literal_table.getPtrContext(str_lit.data, .{
+                        .bytes = &mod.string_literal_bytes,
+                    }).?.* = .none;
+                }
+            }
             arena_state.promote(gpa).deinit();
             decl.value_arena = null;
             decl.has_tv = false;
@@ -2839,6 +2891,9 @@ pub fn deinit(mod: *Module) void {
     mod.decls_free_list.deinit(gpa);
     mod.allocated_decls.deinit(gpa);
     mod.global_assembly.deinit(gpa);
+
+    mod.string_literal_table.deinit(gpa);
+    mod.string_literal_bytes.deinit(gpa);
 }
 
 pub fn destroyDecl(mod: *Module, decl_index: Decl.Index) void {
@@ -2857,7 +2912,7 @@ pub fn destroyDecl(mod: *Module, decl_index: Decl.Index) void {
             if (decl.getInnerNamespace()) |namespace| {
                 namespace.destroyDecls(mod);
             }
-            decl.clearValues(gpa);
+            decl.clearValues(mod);
         }
         decl.dependants.deinit(gpa);
         decl.dependencies.deinit(gpa);
@@ -4034,7 +4089,7 @@ fn semaDecl(mod: *Module, decl_index: Decl.Index) !bool {
                 if (decl.getFunction()) |prev_func| {
                     prev_is_inline = prev_func.state == .inline_only;
                 }
-                decl.clearValues(gpa);
+                decl.clearValues(mod);
             }
 
             decl.ty = try decl_tv.ty.copy(decl_arena_allocator);
@@ -4080,7 +4135,7 @@ fn semaDecl(mod: *Module, decl_index: Decl.Index) !bool {
     var type_changed = true;
     if (decl.has_tv) {
         type_changed = !decl.ty.eql(decl_tv.ty, mod);
-        decl.clearValues(gpa);
+        decl.clearValues(mod);
     }
 
     decl.owns_tv = false;
@@ -4694,7 +4749,7 @@ pub fn clearDecl(
         if (decl.getInnerNamespace()) |namespace| {
             try namespace.deleteAllDecls(mod, outdated_decls);
         }
-        decl.clearValues(gpa);
+        decl.clearValues(mod);
     }
 
     if (decl.deletion_flag) {
@@ -5623,7 +5678,7 @@ pub fn populateTestFunctions(mod: *Module) !void {
 
         // Since we are replacing the Decl's value we must perform cleanup on the
         // previous value.
-        decl.clearValues(gpa);
+        decl.clearValues(mod);
         decl.ty = new_ty;
         decl.val = new_val;
         decl.has_tv = true;
src/Sema.zig
@@ -3842,20 +3842,44 @@ fn zirStr(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Ins
 fn addStrLit(sema: *Sema, block: *Block, zir_bytes: []const u8) CompileError!Air.Inst.Ref {
     // `zir_bytes` references memory inside the ZIR module, which can get deallocated
     // after semantic analysis is complete, for example in the case of the initialization
-    // expression of a variable declaration. We need the memory to be in the new
-    // anonymous Decl's arena.
-    var anon_decl = try block.startAnonDecl(LazySrcLoc.unneeded);
-    defer anon_decl.deinit();
+    // expression of a variable declaration.
+    const mod = sema.mod;
+    const gpa = sema.gpa;
+    const string_bytes = &mod.string_literal_bytes;
+    const StringLiteralAdapter = Module.StringLiteralAdapter;
+    const StringLiteralContext = Module.StringLiteralContext;
+    try string_bytes.ensureUnusedCapacity(gpa, zir_bytes.len);
+    const gop = try mod.string_literal_table.getOrPutContextAdapted(gpa, zir_bytes, StringLiteralAdapter{
+        .bytes = string_bytes,
+    }, StringLiteralContext{
+        .bytes = string_bytes,
+    });
+    if (!gop.found_existing) {
+        gop.key_ptr.* = .{
+            .index = @intCast(u32, string_bytes.items.len),
+            .len = @intCast(u32, zir_bytes.len),
+        };
+        string_bytes.appendSliceAssumeCapacity(zir_bytes);
+        gop.value_ptr.* = .none;
+    }
+    const decl_index = gop.value_ptr.unwrap() orelse di: {
+        var anon_decl = try block.startAnonDecl(LazySrcLoc.unneeded);
+        defer anon_decl.deinit();
 
-    const bytes = try anon_decl.arena().dupeZ(u8, zir_bytes);
+        const decl_index = try anon_decl.finish(
+            try Type.Tag.array_u8_sentinel_0.create(anon_decl.arena(), gop.key_ptr.len),
+            try Value.Tag.str_lit.create(anon_decl.arena(), gop.key_ptr.*),
+            0, // default alignment
+        );
 
-    const new_decl = try anon_decl.finish(
-        try Type.Tag.array_u8_sentinel_0.create(anon_decl.arena(), bytes.len),
-        try Value.Tag.bytes.create(anon_decl.arena(), bytes[0 .. bytes.len + 1]),
-        0, // default alignment
-    );
+        // Needed so that `Decl.clearValues` will additionally set the corresponding
+        // string literal table value back to `Decl.OptionalIndex.none`.
+        mod.declPtr(decl_index).owns_tv = true;
 
-    return sema.analyzeDeclRef(new_decl);
+        gop.value_ptr.* = decl_index.toOptional();
+        break :di decl_index;
+    };
+    return sema.analyzeDeclRef(decl_index);
 }
 
 fn zirInt(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref {
@@ -19762,6 +19786,35 @@ fn beginComptimePtrMutation(
                                 .ty = elem_ty,
                             };
                         },
+                        .str_lit => {
+                            // An array is memory-optimized to store a slice of bytes, but we are about
+                            // to modify an individual field and the representation has to change.
+                            // If we wanted to avoid this, there would need to be special detection
+                            // elsewhere to identify when writing a value to an array element that is stored
+                            // using the `str_lit` tag, and handle it without making a call to this function.
+                            const arena = parent.beginArena(sema.mod);
+                            defer parent.finishArena(sema.mod);
+
+                            const str_lit = parent.val.castTag(.str_lit).?.data;
+                            const dest_len = parent.ty.arrayLenIncludingSentinel();
+                            const bytes = sema.mod.string_literal_bytes.items[str_lit.index..][0..str_lit.len];
+                            const elems = try arena.alloc(Value, @intCast(usize, dest_len));
+                            for (bytes) |byte, i| {
+                                elems[i] = try Value.Tag.int_u64.create(arena, byte);
+                            }
+                            if (parent.ty.sentinel()) |sent_val| {
+                                assert(elems.len == bytes.len + 1);
+                                elems[bytes.len] = sent_val;
+                            }
+
+                            parent.val.* = try Value.Tag.aggregate.create(arena, elems);
+
+                            return ComptimePtrMutationKit{
+                                .decl_ref_mut = parent.decl_ref_mut,
+                                .val = &elems[elem_ptr.index],
+                                .ty = elem_ty,
+                            };
+                        },
                         .repeated => {
                             // An array is memory-optimized to store only a single element value, and
                             // that value is understood to be the same for the entire length of the array.
@@ -20097,10 +20150,23 @@ fn beginComptimePtrLoad(
                 }
             }
 
-            deref.pointee = if (elem_ptr.index < check_len) TypedValue{
+            if (elem_ptr.index >= check_len) {
+                deref.pointee = null;
+                break :blk deref;
+            }
+            if (elem_ptr.index == check_len - 1) {
+                if (array_tv.ty.sentinel()) |sent| {
+                    deref.pointee = TypedValue{
+                        .ty = elem_ty,
+                        .val = sent,
+                    };
+                    break :blk deref;
+                }
+            }
+            deref.pointee = TypedValue{
                 .ty = elem_ty,
                 .val = try array_tv.val.elemValue(sema.mod, sema.arena, elem_ptr.index),
-            } else null;
+            };
             break :blk deref;
         },
 
src/TypedValue.zig
@@ -295,6 +295,11 @@ pub fn print(
             return writer.print(".{s}", .{ty.enumFieldName(val.castTag(.enum_field_index).?.data)});
         },
         .bytes => return writer.print("\"{}\"", .{std.zig.fmtEscapes(val.castTag(.bytes).?.data)}),
+        .str_lit => {
+            const str_lit = val.castTag(.str_lit).?.data;
+            const bytes = mod.string_literal_bytes.items[str_lit.index..][0..str_lit.len];
+            return writer.print("\"{}\"", .{std.zig.fmtEscapes(bytes)});
+        },
         .repeated => {
             if (level == 0) {
                 return writer.writeAll(".{ ... }");
src/value.zig
@@ -126,6 +126,8 @@ pub const Value = extern union {
         field_ptr,
         /// A slice of u8 whose memory is managed externally.
         bytes,
+        /// Similar to bytes however it stores an index relative to `Module.string_literal_bytes`.
+        str_lit,
         /// This value is repeated some number of times. The amount of times to repeat
         /// is stored externally.
         repeated,
@@ -285,6 +287,7 @@ pub const Value = extern union {
                 .enum_literal,
                 => Payload.Bytes,
 
+                .str_lit => Payload.StrLit,
                 .slice => Payload.Slice,
 
                 .enum_field_index => Payload.U32,
@@ -538,6 +541,7 @@ pub const Value = extern union {
                 };
                 return Value{ .ptr_otherwise = &new_payload.base };
             },
+            .str_lit => return self.copyPayloadShallow(arena, Payload.StrLit),
             .repeated,
             .eu_payload,
             .opt_payload,
@@ -764,6 +768,12 @@ pub const Value = extern union {
             .enum_literal => return out_stream.print(".{}", .{std.zig.fmtId(val.castTag(.enum_literal).?.data)}),
             .enum_field_index => return out_stream.print("(enum field {d})", .{val.castTag(.enum_field_index).?.data}),
             .bytes => return out_stream.print("\"{}\"", .{std.zig.fmtEscapes(val.castTag(.bytes).?.data)}),
+            .str_lit => {
+                const str_lit = val.castTag(.str_lit).?.data;
+                return out_stream.print("(.str_lit index={d} len={d})", .{
+                    str_lit.index, str_lit.len,
+                });
+            },
             .repeated => {
                 try out_stream.writeAll("(repeated) ");
                 val = val.castTag(.repeated).?.data;
@@ -824,6 +834,11 @@ pub const Value = extern union {
                 const adjusted_bytes = bytes[0..adjusted_len];
                 return allocator.dupe(u8, adjusted_bytes);
             },
+            .str_lit => {
+                const str_lit = val.castTag(.str_lit).?.data;
+                const bytes = mod.string_literal_bytes.items[str_lit.index..][0..str_lit.len];
+                return allocator.dupe(u8, bytes);
+            },
             .enum_literal => return allocator.dupe(u8, val.castTag(.enum_literal).?.data),
             .repeated => {
                 const byte = @intCast(u8, val.castTag(.repeated).?.data.toUnsignedInt(target));
@@ -2537,6 +2552,20 @@ pub const Value = extern union {
                     return initPayload(&buffer.base);
                 }
             },
+            .str_lit => {
+                const str_lit = val.castTag(.str_lit).?.data;
+                const bytes = mod.string_literal_bytes.items[str_lit.index..][0..str_lit.len];
+                const byte = bytes[index];
+                if (arena) |a| {
+                    return Tag.int_u64.create(a, byte);
+                } else {
+                    buffer.* = .{
+                        .base = .{ .tag = .int_u64 },
+                        .data = byte,
+                    };
+                    return initPayload(&buffer.base);
+                }
+            },
 
             // No matter the index; all the elements are the same!
             .repeated => return val.castTag(.repeated).?.data,
@@ -2570,6 +2599,13 @@ pub const Value = extern union {
         return switch (val.tag()) {
             .empty_array_sentinel => if (start == 0 and end == 1) val else Value.initTag(.empty_array),
             .bytes => Tag.bytes.create(arena, val.castTag(.bytes).?.data[start..end]),
+            .str_lit => {
+                const str_lit = val.castTag(.str_lit).?.data;
+                return Tag.str_lit.create(arena, .{
+                    .index = @intCast(u32, str_lit.index + start),
+                    .len = @intCast(u32, end - start),
+                });
+            },
             .aggregate => Tag.aggregate.create(arena, val.castTag(.aggregate).?.data[start..end]),
             .slice => sliceArray(val.castTag(.slice).?.data.ptr, mod, arena, start, end),
 
@@ -4721,6 +4757,11 @@ pub const Value = extern union {
             data: []const u8,
         };
 
+        pub const StrLit = struct {
+            base: Payload,
+            data: Module.StringLiteralContext.Key,
+        };
+
         pub const Aggregate = struct {
             base: Payload,
             /// Field values. The types are according to the struct or array type.
test/behavior/eval.zig
@@ -643,9 +643,18 @@ fn assertEqualPtrs(ptr1: *const u8, ptr2: *const u8) !void {
     try expect(ptr1 == ptr2);
 }
 
+// This one is still up for debate in the language specification.
+// Application code should not rely on this behavior until it is solidified.
+// Currently, stage1 has special case code to make this pass for string literals
+// but it does not work if the values are constructed with comptime code, or if
+// arrays of non-u8 elements are used instead.
+// The official language specification might not make this guarantee. However, if
+// it does make this guarantee, it will make it consistently for all types, not
+// only string literals. This is why stage2 currently has a string table for
+// string literals, to match stage1 and pass this test, however the end-game once
+// the lang spec issue is settled would be to use a global InternPool for comptime
+// memoized objects, making this behavior consistent across all types.
 test "string literal used as comptime slice is memoized" {
-    if (builtin.zig_backend != .stage1) return error.SkipZigTest; // TODO
-
     const a = "link";
     const b = "link";
     comptime try expect(TypeWithCompTimeSlice(a).Node == TypeWithCompTimeSlice(b).Node);