Commit cf059ee087

Will Lillis <will.lillis24@gmail.com>
2025-02-05 10:10:11
AstGen: improve error for invalid bytes in strings and comments
1 parent d72f3d3
lib/std/zig/Ast.zig
@@ -458,6 +458,19 @@ pub fn renderError(tree: Ast, parse_error: Error, stream: anytype) !void {
             return stream.writeAll("for input is not captured");
         },
 
+        .invalid_byte => {
+            const tok_slice = tree.source[tree.tokens.items(.start)[parse_error.token]..];
+            return stream.print("{s} contains invalid byte: '{'}'", .{
+                switch (tok_slice[0]) {
+                    '\'' => "character literal",
+                    '"', '\\' => "string literal",
+                    '/' => "comment",
+                    else => unreachable,
+                },
+                std.zig.fmtEscapes(tok_slice[parse_error.extra.offset..][0..1]),
+            });
+        },
+
         .expected_token => {
             const found_tag = token_tags[parse_error.token + @intFromBool(parse_error.token_is_prev)];
             const expected_symbol = parse_error.extra.expected_tag.symbol();
@@ -2926,6 +2939,7 @@ pub const Error = struct {
     extra: union {
         none: void,
         expected_tag: Token.Tag,
+        offset: usize,
     } = .{ .none = {} },
 
     pub const Tag = enum {
@@ -2996,6 +3010,9 @@ pub const Error = struct {
 
         /// `expected_tag` is populated.
         expected_token,
+
+        /// `offset` is populated with the byte offset of the invalid byte from the start of the token.
+        invalid_byte,
     };
 };
 
lib/std/zig/AstGen.zig
@@ -14017,6 +14017,39 @@ fn lowerAstErrors(astgen: *AstGen) !void {
     var notes: std.ArrayListUnmanaged(u32) = .empty;
     defer notes.deinit(gpa);
 
+    const token_starts = tree.tokens.items(.start);
+    const token_tags = tree.tokens.items(.tag);
+    const parse_err = tree.errors[0];
+    const tok = parse_err.token + @intFromBool(parse_err.token_is_prev);
+    const tok_start = token_starts[tok];
+    const start_char = tree.source[tok_start];
+
+    if (token_tags[tok] == .invalid and
+        (start_char == '\"' or start_char == '\'' or start_char == '/' or mem.startsWith(u8, tree.source[tok_start..], "\\\\")))
+    {
+        const tok_len: u32 = @intCast(tree.tokenSlice(tok).len);
+        const tok_end = tok_start + tok_len;
+        const bad_off = blk: {
+            var idx = tok_start;
+            while (idx < tok_end) : (idx += 1) {
+                switch (tree.source[idx]) {
+                    0x00...0x09, 0x0b...0x1f, 0x7f => break,
+                    else => {},
+                }
+            }
+            break :blk idx - tok_start;
+        };
+
+        const err: Ast.Error = .{
+            .tag = Ast.Error.Tag.invalid_byte,
+            .token = tok,
+            .extra = .{ .offset = bad_off },
+        };
+        msg.clearRetainingCapacity();
+        try tree.renderError(err, msg.writer(gpa));
+        return try astgen.appendErrorTokNotesOff(tok, bad_off, "{s}", .{msg.items}, notes.items);
+    }
+
     var cur_err = tree.errors[0];
     for (tree.errors[1..]) |err| {
         if (err.is_note) {
test/cases/compile_errors/normal_string_with_newline.zig
@@ -5,4 +5,4 @@ b";
 // backend=stage2
 // target=native
 //
-// :1:13: error: expected expression, found 'invalid token'
+// :1:15: error: string literal contains invalid byte: '\n'
test/cases/compile_errors/tab_inside_comment.zig
@@ -0,0 +1,8 @@
+// Some		comment
+export fn entry() void {}
+
+// error
+// backend=stage2
+// target=native
+//
+// :1:8: error: comment contains invalid byte: '\t'
test/cases/compile_errors/tab_inside_doc_comment.zig
@@ -0,0 +1,8 @@
+/// Some doc		comment
+export fn entry() void {}
+
+// error
+// backend=stage2
+// target=native
+//
+// :1:13: error: comment contains invalid byte: '\t'
test/cases/compile_errors/tab_inside_multiline_string.zig
@@ -0,0 +1,13 @@
+export fn entry() void {
+    const foo =
+        \\const S = struct {
+        \\	// hello
+        \\}
+    ;
+    _ = foo;
+}
+// error
+// backend=stage2
+// target=native
+//
+// :4:11: error: string literal contains invalid byte: '\t'
test/cases/compile_errors/tab_inside_string.zig
@@ -0,0 +1,10 @@
+export fn entry() void {
+    const foo = "	hello";
+    _ = foo;
+}
+
+// error
+// backend=stage2
+// target=native
+//
+// :2:18: error: string literal contains invalid byte: '\t'
test/compile_errors.zig
@@ -217,7 +217,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
         const case = ctx.obj("invalid byte in string", b.graph.host);
 
         case.addError("_ = \"\x01Q\";", &[_][]const u8{
-            ":1:5: error: expected expression, found 'invalid token'",
+            ":1:6: error: string literal contains invalid byte: '\\x01'",
         });
     }
 
@@ -225,7 +225,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
         const case = ctx.obj("invalid byte in comment", b.graph.host);
 
         case.addError("//\x01Q", &[_][]const u8{
-            ":1:1: error: expected type expression, found 'invalid token'",
+            ":1:3: error: comment contains invalid byte: '\\x01'",
         });
     }
 
@@ -233,7 +233,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
         const case = ctx.obj("control character in character literal", b.graph.host);
 
         case.addError("const c = '\x01';", &[_][]const u8{
-            ":1:11: error: expected expression, found 'invalid token'",
+            ":1:12: error: character literal contains invalid byte: '\\x01'",
         });
     }