Commit 377e8579f9
Changed files (12)
lib/std/zig/Ast.zig
@@ -69,7 +69,7 @@ pub fn parse(gpa: Allocator, source: [:0]const u8, mode: Mode) Allocator.Error!A
const token = tokenizer.next();
try tokens.append(gpa, .{
.tag = token.tag,
- .start = @as(u32, @intCast(token.loc.start)),
+ .start = @intCast(token.loc.start),
});
if (token.tag == .eof) break;
}
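
Dropping the @as wrapper works because @intCast infers its result type from the destination. A minimal standalone sketch of the same pattern (values are hypothetical):

    const std = @import("std");

    test "@intCast infers its result type from the destination" {
        const loc_start: usize = 42;
        // The u32 destination type drives the cast; no @as wrapper needed.
        const start: u32 = @intCast(loc_start);
        try std.testing.expectEqual(@as(u32, 42), start);
    }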
lib/std/zig/AstGen.zig
@@ -11351,6 +11351,9 @@ fn failWithStrLitError(astgen: *AstGen, err: std.zig.string_literal.Error, token
.{raw_string[bad_index]},
);
},
+ .empty_char_literal => {
+ return astgen.failOff(token, offset, "empty character literal", .{});
+ },
}
}
@@ -13820,21 +13823,9 @@ fn lowerAstErrors(astgen: *AstGen) !void {
var msg: std.ArrayListUnmanaged(u8) = .{};
defer msg.deinit(gpa);
- const token_starts = tree.tokens.items(.start);
- const token_tags = tree.tokens.items(.tag);
-
var notes: std.ArrayListUnmanaged(u32) = .{};
defer notes.deinit(gpa);
- const tok = parse_err.token + @intFromBool(parse_err.token_is_prev);
- if (token_tags[tok] == .invalid) {
- const bad_off: u32 = @intCast(tree.tokenSlice(tok).len);
- const byte_abs = token_starts[tok] + bad_off;
- try notes.append(gpa, try astgen.errNoteTokOff(tok, bad_off, "invalid byte: '{'}'", .{
- std.zig.fmtEscapes(tree.source[byte_abs..][0..1]),
- }));
- }
-
for (tree.errors[1..]) |note| {
if (!note.is_note) break;
lib/std/zig/parser_test.zig
@@ -6061,7 +6061,6 @@ test "recovery: invalid container members" {
, &[_]Error{
.expected_expr,
.expected_comma_after_field,
- .expected_type_expr,
.expected_semi_after_stmt,
});
}
lib/std/zig/string_literal.zig
@@ -1,6 +1,5 @@
const std = @import("../std.zig");
const assert = std.debug.assert;
-const utf8Decode = std.unicode.utf8Decode;
const utf8Encode = std.unicode.utf8Encode;
pub const ParseError = error{
@@ -37,12 +36,16 @@ pub const Error = union(enum) {
expected_single_quote: usize,
/// The character at this index cannot be represented without an escape sequence.
invalid_character: usize,
+ /// `''`. Not returned for string literals.
+ empty_char_literal,
};
-/// Only validates escape sequence characters.
-/// Slice must be valid utf8 starting and ending with "'" and exactly one codepoint in between.
+/// Asserts the slice starts and ends with single-quotes.
+/// Returns an error if there is not exactly one UTF-8 codepoint in between.
pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral {
- assert(slice.len >= 3 and slice[0] == '\'' and slice[slice.len - 1] == '\'');
+ if (slice.len < 3) return .{ .failure = .empty_char_literal };
+ assert(slice[0] == '\'');
+ assert(slice[slice.len - 1] == '\'');
switch (slice[1]) {
'\\' => {
@@ -55,7 +58,18 @@ pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral {
},
0 => return .{ .failure = .{ .invalid_character = 1 } },
else => {
- const codepoint = utf8Decode(slice[1 .. slice.len - 1]) catch unreachable;
+ const inner = slice[1 .. slice.len - 1];
+ const n = std.unicode.utf8ByteSequenceLength(inner[0]) catch return .{
+ .failure = .{ .invalid_unicode_codepoint = 1 },
+ };
+ if (inner.len > n) return .{ .failure = .{ .expected_single_quote = 1 + n } };
+ const codepoint = switch (n) {
+ 1 => inner[0],
+ 2 => std.unicode.utf8Decode2(inner[0..2].*),
+ 3 => std.unicode.utf8Decode3(inner[0..3].*),
+ 4 => std.unicode.utf8Decode4(inner[0..4].*),
+ else => unreachable,
+ } catch return .{ .failure = .{ .invalid_unicode_codepoint = 1 } };
return .{ .success = codepoint };
},
}
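
A sketch of the revised parseCharLiteral contract, assuming the std.zig.string_literal API shown above: an empty literal now yields a structured failure instead of tripping the old slice.len >= 3 assertion, and any byte past the first codepoint reports expected_single_quote:

    const std = @import("std");
    const string_literal = std.zig.string_literal;

    test "parseCharLiteral returns failures instead of asserting" {
        // Exactly one codepoint between the quotes: success.
        try std.testing.expectEqual(
            string_literal.ParsedCharLiteral{ .success = 'a' },
            string_literal.parseCharLiteral("'a'"),
        );
        // Empty literal: the new structured error rather than an assert.
        try std.testing.expectEqual(
            string_literal.ParsedCharLiteral{ .failure = .empty_char_literal },
            string_literal.parseCharLiteral("''"),
        );
    }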
lib/std/zig/tokenizer.zig
@@ -320,7 +320,7 @@ pub const Token = struct {
pub fn symbol(tag: Tag) []const u8 {
return tag.lexeme() orelse switch (tag) {
- .invalid => "invalid bytes",
+ .invalid => "invalid token",
.identifier => "an identifier",
.string_literal, .multiline_string_literal_line => "a string literal",
.char_literal => "a character literal",
@@ -338,22 +338,22 @@ pub const Tokenizer = struct {
buffer: [:0]const u8,
index: usize,
- /// For debugging purposes
+ /// For debugging purposes.
pub fn dump(self: *Tokenizer, token: *const Token) void {
std.debug.print("{s} \"{s}\"\n", .{ @tagName(token.tag), self.buffer[token.loc.start..token.loc.end] });
}
pub fn init(buffer: [:0]const u8) Tokenizer {
- // Skip the UTF-8 BOM if present
- const src_start: usize = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0;
- return Tokenizer{
+ // Skip the UTF-8 BOM if present.
+ return .{
.buffer = buffer,
- .index = src_start,
+ .index = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0,
};
}
const State = enum {
start,
+ expect_newline,
identifier,
builtin,
string_literal,
@@ -361,10 +361,6 @@ pub const Tokenizer = struct {
multiline_string_literal_line,
char_literal,
char_literal_backslash,
- char_literal_hex_escape,
- char_literal_unicode_escape_saw_u,
- char_literal_unicode_escape,
- char_literal_end,
backslash,
equal,
bang,
@@ -400,32 +396,38 @@ pub const Tokenizer = struct {
period_2,
period_asterisk,
saw_at_sign,
+ invalid,
};
+ /// After this returns invalid, it will reset on the next newline, returning tokens starting from there.
+ /// An eof token will always be returned at the end.
pub fn next(self: *Tokenizer) Token {
var state: State = .start;
- var result = Token{
- .tag = .eof,
+ var result: Token = .{
+ .tag = undefined,
.loc = .{
.start = self.index,
.end = undefined,
},
};
- var seen_escape_digits: usize = undefined;
while (true) : (self.index += 1) {
const c = self.buffer[self.index];
switch (state) {
.start => switch (c) {
0 => {
- if (self.index != self.buffer.len) {
- result.tag = .invalid;
- result.loc.end = self.index;
- self.index += 1;
- return result;
- }
- break;
- },
- ' ', '\n', '\t', '\r' => {
+ if (self.index == self.buffer.len) return .{
+ .tag = .eof,
+ .loc = .{
+ .start = self.index,
+ .end = self.index,
+ },
+ };
+ state = .invalid;
+ },
+ '\r' => {
+ state = .expect_newline;
+ },
+ ' ', '\n', '\t' => {
result.loc.start = self.index + 1;
},
'"' => {
@@ -434,6 +436,7 @@ pub const Tokenizer = struct {
},
'\'' => {
state = .char_literal;
+ result.tag = .char_literal;
},
'a'...'z', 'A'...'Z', '_' => {
state = .identifier;
@@ -545,14 +548,37 @@ pub const Tokenizer = struct {
result.tag = .number_literal;
},
else => {
+ state = .invalid;
+ },
+ },
+
+ .expect_newline => switch (c) {
+ '\n' => {
+ result.loc.start = self.index + 1;
+ state = .start;
+ },
+ else => {
+ state = .invalid;
+ },
+ },
+
+ .invalid => switch (c) {
+ 0 => if (self.index == self.buffer.len) {
+ result.tag = .invalid;
+ break;
+ },
+ '\n' => {
result.tag = .invalid;
- result.loc.end = self.index;
- self.index += std.unicode.utf8ByteSequenceLength(c) catch 1;
- return result;
+ break;
},
+ else => continue,
},
.saw_at_sign => switch (c) {
+ 0, '\n' => {
+ result.tag = .invalid;
+ break;
+ },
'"' => {
result.tag = .identifier;
state = .string_literal;
@@ -562,8 +588,7 @@ pub const Tokenizer = struct {
result.tag = .builtin;
},
else => {
- result.tag = .invalid;
- break;
+ state = .invalid;
},
},
@@ -698,7 +723,7 @@ pub const Tokenizer = struct {
},
.identifier => switch (c) {
- 'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
+ 'a'...'z', 'A'...'Z', '_', '0'...'9' => continue,
else => {
if (Token.getKeyword(self.buffer[result.loc.start..self.index])) |tag| {
result.tag = tag;
@@ -707,26 +732,37 @@ pub const Tokenizer = struct {
},
},
.builtin => switch (c) {
- 'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
+ 'a'...'z', 'A'...'Z', '_', '0'...'9' => continue,
else => break,
},
.backslash => switch (c) {
+ 0 => {
+ result.tag = .invalid;
+ break;
+ },
'\\' => {
state = .multiline_string_literal_line;
},
- else => {
+ '\n' => {
result.tag = .invalid;
break;
},
+ else => {
+ state = .invalid;
+ },
},
.string_literal => switch (c) {
- 0, '\n' => {
- result.tag = .invalid;
- result.loc.end = self.index;
+ 0 => {
if (self.index != self.buffer.len) {
- self.index += 1;
+ state = .invalid;
+ continue;
}
- return result;
+ result.tag = .invalid;
+ break;
+ },
+ '\n' => {
+ result.tag = .invalid;
+ break;
},
'\\' => {
state = .string_literal_backslash;
@@ -735,150 +771,74 @@ pub const Tokenizer = struct {
self.index += 1;
break;
},
- else => {
- if (self.invalidCharacterLength()) |len| {
- result.tag = .invalid;
- result.loc.end = self.index;
- self.index += len;
- return result;
- }
-
- self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
+ 0x01...0x09, 0x0b...0x1f, 0x7f => {
+ state = .invalid;
},
+ else => continue,
},
.string_literal_backslash => switch (c) {
0, '\n' => {
result.tag = .invalid;
- result.loc.end = self.index;
- if (self.index != self.buffer.len) {
- self.index += 1;
- }
- return result;
+ break;
},
else => {
state = .string_literal;
-
- if (self.invalidCharacterLength()) |len| {
- result.tag = .invalid;
- result.loc.end = self.index;
- self.index += len;
- return result;
- }
-
- self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.char_literal => switch (c) {
- 0, '\n', '\'' => {
- result.tag = .invalid;
- result.loc.end = self.index;
+ 0 => {
if (self.index != self.buffer.len) {
- self.index += 1;
+ state = .invalid;
+ continue;
}
- return result;
+ result.tag = .invalid;
+ break;
+ },
+ '\n' => {
+ result.tag = .invalid;
+ break;
},
'\\' => {
state = .char_literal_backslash;
},
- else => {
- state = .char_literal_end;
-
- if (self.invalidCharacterLength()) |len| {
- result.tag = .invalid;
- result.loc.end = self.index;
- self.index += len;
- return result;
- }
-
- self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
+ '\'' => {
+ self.index += 1;
+ break;
},
+ 0x01...0x09, 0x0b...0x1f, 0x7f => {
+ state = .invalid;
+ },
+ else => continue,
},
.char_literal_backslash => switch (c) {
- 0, '\n' => {
- result.tag = .invalid;
- result.loc.end = self.index;
+ 0 => {
if (self.index != self.buffer.len) {
- self.index += 1;
- }
- return result;
- },
- 'x' => {
- state = .char_literal_hex_escape;
- seen_escape_digits = 0;
- },
- 'u' => {
- state = .char_literal_unicode_escape_saw_u;
- },
- else => {
- state = .char_literal_end;
-
- if (self.invalidCharacterLength()) |len| {
- result.tag = .invalid;
- result.loc.end = self.index;
- self.index += len;
- return result;
- }
-
- self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
- },
- },
-
- .char_literal_hex_escape => switch (c) {
- '0'...'9', 'a'...'f', 'A'...'F' => {
- seen_escape_digits += 1;
- if (seen_escape_digits == 2) {
- state = .char_literal_end;
+ state = .invalid;
+ continue;
}
- },
- else => {
result.tag = .invalid;
break;
},
- },
-
- .char_literal_unicode_escape_saw_u => switch (c) {
- '{' => {
- state = .char_literal_unicode_escape;
- },
- else => {
- result.tag = .invalid;
- break;
- },
- },
-
- .char_literal_unicode_escape => switch (c) {
- '0'...'9', 'a'...'f', 'A'...'F' => {},
- '}' => {
- state = .char_literal_end; // too many/few digits handled later
- },
- else => {
+ '\n' => {
result.tag = .invalid;
break;
},
- },
-
- .char_literal_end => switch (c) {
- '\'' => {
- result.tag = .char_literal;
- self.index += 1;
- break;
+ 0x01...0x09, 0x0b...0x1f, 0x7f => {
+ state = .invalid;
},
else => {
- result.tag = .invalid;
- break;
+ state = .char_literal;
},
},
.multiline_string_literal_line => switch (c) {
0 => {
if (self.index != self.buffer.len) {
- result.tag = .invalid;
- result.loc.end = self.index;
- self.index += 1;
- return result;
+ state = .invalid;
+ continue;
}
break;
},
@@ -886,17 +846,10 @@ pub const Tokenizer = struct {
self.index += 1;
break;
},
- '\t' => {},
- else => {
- if (self.invalidCharacterLength()) |len| {
- result.tag = .invalid;
- result.loc.end = self.index;
- self.index += len;
- return result;
- }
-
- self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
+ 0x01...0x08, 0x0b...0x1f, 0x7f => {
+ state = .invalid;
},
+ else => continue,
},
.bang => switch (c) {
@@ -1113,12 +1066,16 @@ pub const Tokenizer = struct {
.line_comment_start => switch (c) {
0 => {
if (self.index != self.buffer.len) {
- result.tag = .invalid;
- result.loc.end = self.index;
- self.index += 1;
- return result;
+ state = .invalid;
+ continue;
}
- break;
+ return .{
+ .tag = .eof,
+ .loc = .{
+ .start = self.index,
+ .end = self.index,
+ },
+ };
},
'/' => {
state = .doc_comment_start;
@@ -1127,105 +1084,74 @@ pub const Tokenizer = struct {
result.tag = .container_doc_comment;
state = .doc_comment;
},
+ '\r' => {
+ state = .expect_newline;
+ },
'\n' => {
state = .start;
result.loc.start = self.index + 1;
},
- '\t' => {
- state = .line_comment;
+ 0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
+ state = .invalid;
},
else => {
state = .line_comment;
-
- if (self.invalidCharacterLength()) |len| {
- result.tag = .invalid;
- result.loc.end = self.index;
- self.index += len;
- return result;
- }
-
- self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.doc_comment_start => switch (c) {
- '/' => {
- state = .line_comment;
- },
- 0 => {
- if (self.index != self.buffer.len) {
- result.tag = .invalid;
- result.loc.end = self.index;
- self.index += 1;
- return result;
- }
+ 0, '\n', '\r' => {
result.tag = .doc_comment;
break;
},
- '\n' => {
- result.tag = .doc_comment;
- break;
+ '/' => {
+ state = .line_comment;
},
- '\t' => {
- state = .doc_comment;
- result.tag = .doc_comment;
+ 0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
+ state = .invalid;
},
else => {
state = .doc_comment;
result.tag = .doc_comment;
-
- if (self.invalidCharacterLength()) |len| {
- result.tag = .invalid;
- result.loc.end = self.index;
- self.index += len;
- return result;
- }
-
- self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.line_comment => switch (c) {
0 => {
if (self.index != self.buffer.len) {
- result.tag = .invalid;
- result.loc.end = self.index;
- self.index += 1;
- return result;
+ state = .invalid;
+ continue;
}
- break;
+ return .{
+ .tag = .eof,
+ .loc = .{
+ .start = self.index,
+ .end = self.index,
+ },
+ };
+ },
+ '\r' => {
+ state = .expect_newline;
},
'\n' => {
state = .start;
result.loc.start = self.index + 1;
},
- '\t' => {},
- else => {
- if (self.invalidCharacterLength()) |len| {
- result.tag = .invalid;
- result.loc.end = self.index;
- self.index += len;
- return result;
- }
-
- self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
+ 0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
+ state = .invalid;
},
+ else => continue,
},
.doc_comment => switch (c) {
- 0, '\n' => break,
- '\t' => {},
- else => {
- if (self.invalidCharacterLength()) |len| {
- result.tag = .invalid;
- result.loc.end = self.index;
- self.index += len;
- return result;
- }
-
- self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
+ 0, '\n', '\r' => {
+ break;
},
+ 0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
+ state = .invalid;
+ },
+ else => continue,
},
.int => switch (c) {
'.' => state = .int_period,
- '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => {},
+ '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => continue,
'e', 'E', 'p', 'P' => state = .int_exponent,
else => break,
},
@@ -1249,7 +1175,7 @@ pub const Tokenizer = struct {
},
},
.float => switch (c) {
- '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => {},
+ '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => continue,
'e', 'E', 'p', 'P' => state = .float_exponent,
else => break,
},
@@ -1263,57 +1189,9 @@ pub const Tokenizer = struct {
}
}
- if (result.tag == .eof) {
- result.loc.start = self.index;
- }
-
result.loc.end = self.index;
return result;
}
-
- fn invalidCharacterLength(self: *Tokenizer) ?u3 {
- const c0 = self.buffer[self.index];
- if (std.ascii.isAscii(c0)) {
- if (c0 == '\r') {
- if (self.index + 1 < self.buffer.len and self.buffer[self.index + 1] == '\n') {
- // Carriage returns are *only* allowed just before a linefeed as part of a CRLF pair, otherwise
- // they constitute an illegal byte!
- return null;
- } else {
- return 1;
- }
- } else if (std.ascii.isControl(c0)) {
- // ascii control codes are never allowed
- // (note that \n was checked before we got here)
- return 1;
- }
- // looks fine to me.
- return null;
- } else {
- // check utf8-encoded character.
- const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1;
- if (self.index + length > self.buffer.len) {
- return @as(u3, @intCast(self.buffer.len - self.index));
- }
- const bytes = self.buffer[self.index .. self.index + length];
- switch (length) {
- 2 => {
- const value = std.unicode.utf8Decode2(bytes) catch return length;
- if (value == 0x85) return length; // U+0085 (NEL)
- },
- 3 => {
- const value = std.unicode.utf8Decode3(bytes) catch return length;
- if (value == 0x2028) return length; // U+2028 (LS)
- if (value == 0x2029) return length; // U+2029 (PS)
- },
- 4 => {
- _ = std.unicode.utf8Decode4(bytes) catch return length;
- },
- else => unreachable,
- }
- return null;
- }
- }
};
test "keywords" {
@@ -1355,7 +1233,7 @@ test "code point literal with hex escape" {
, &.{.char_literal});
try testTokenize(
\\'\x1'
- , &.{ .invalid, .invalid });
+ , &.{.char_literal});
}
test "newline in char literal" {
@@ -1396,40 +1274,30 @@ test "code point literal with unicode escapes" {
// Invalid unicode escapes
try testTokenize(
\\'\u'
- , &.{ .invalid, .invalid });
+ , &.{.char_literal});
try testTokenize(
\\'\u{{'
- , &.{ .invalid, .l_brace, .invalid });
+ , &.{.char_literal});
try testTokenize(
\\'\u{}'
, &.{.char_literal});
try testTokenize(
\\'\u{s}'
- , &.{
- .invalid,
- .identifier,
- .r_brace,
- .invalid,
- });
+ , &.{.char_literal});
try testTokenize(
\\'\u{2z}'
- , &.{
- .invalid,
- .identifier,
- .r_brace,
- .invalid,
- });
+ , &.{.char_literal});
try testTokenize(
\\'\u{4a'
- , &.{ .invalid, .invalid }); // 4a is valid
+ , &.{.char_literal});
// Test old-style unicode literals
try testTokenize(
\\'\u0333'
- , &.{ .invalid, .number_literal, .invalid });
+ , &.{.char_literal});
try testTokenize(
\\'\U0333'
- , &.{ .invalid, .number_literal, .invalid });
+ , &.{.char_literal});
}
test "code point literal with unicode code point" {
@@ -1465,24 +1333,15 @@ test "invalid token characters" {
try testTokenize("`", &.{.invalid});
try testTokenize("'c", &.{.invalid});
try testTokenize("'", &.{.invalid});
- try testTokenize("''", &.{.invalid});
+ try testTokenize("''", &.{.char_literal});
try testTokenize("'\n'", &.{ .invalid, .invalid });
}
test "invalid literal/comment characters" {
- try testTokenize("\"\x00\"", &.{
- .invalid,
- .invalid, // Incomplete string literal starting after invalid
- });
- try testTokenize("//\x00", &.{
- .invalid,
- });
- try testTokenize("//\x1f", &.{
- .invalid,
- });
- try testTokenize("//\x7f", &.{
- .invalid,
- });
+ try testTokenize("\"\x00\"", &.{.invalid});
+ try testTokenize("//\x00", &.{.invalid});
+ try testTokenize("//\x1f", &.{.invalid});
+ try testTokenize("//\x7f", &.{.invalid});
}
test "utf8" {
@@ -1491,46 +1350,24 @@ test "utf8" {
}
test "invalid utf8" {
- try testTokenize("//\x80", &.{
- .invalid,
- });
- try testTokenize("//\xbf", &.{
- .invalid,
- });
- try testTokenize("//\xf8", &.{
- .invalid,
- });
- try testTokenize("//\xff", &.{
- .invalid,
- });
- try testTokenize("//\xc2\xc0", &.{
- .invalid,
- });
- try testTokenize("//\xe0", &.{
- .invalid,
- });
- try testTokenize("//\xf0", &.{
- .invalid,
- });
- try testTokenize("//\xf0\x90\x80\xc0", &.{
- .invalid,
- });
+ try testTokenize("//\x80", &.{});
+ try testTokenize("//\xbf", &.{});
+ try testTokenize("//\xf8", &.{});
+ try testTokenize("//\xff", &.{});
+ try testTokenize("//\xc2\xc0", &.{});
+ try testTokenize("//\xe0", &.{});
+ try testTokenize("//\xf0", &.{});
+ try testTokenize("//\xf0\x90\x80\xc0", &.{});
}
test "illegal unicode codepoints" {
// unicode newline characters: U+0085, U+2028, U+2029
try testTokenize("//\xc2\x84", &.{});
- try testTokenize("//\xc2\x85", &.{
- .invalid,
- });
+ try testTokenize("//\xc2\x85", &.{});
try testTokenize("//\xc2\x86", &.{});
try testTokenize("//\xe2\x80\xa7", &.{});
- try testTokenize("//\xe2\x80\xa8", &.{
- .invalid,
- });
- try testTokenize("//\xe2\x80\xa9", &.{
- .invalid,
- });
+ try testTokenize("//\xe2\x80\xa8", &.{});
+ try testTokenize("//\xe2\x80\xa9", &.{});
try testTokenize("//\xe2\x80\xaa", &.{});
}
@@ -1892,8 +1729,8 @@ test "multi line string literal with only 1 backslash" {
}
test "invalid builtin identifiers" {
- try testTokenize("@()", &.{ .invalid, .l_paren, .r_paren });
- try testTokenize("@0()", &.{ .invalid, .number_literal, .l_paren, .r_paren });
+ try testTokenize("@()", &.{.invalid});
+ try testTokenize("@0()", &.{.invalid});
}
test "invalid token with unfinished escape right before eof" {
@@ -1921,12 +1758,12 @@ test "saturating operators" {
}
test "null byte before eof" {
- try testTokenize("123 \x00 456", &.{ .number_literal, .invalid, .number_literal });
+ try testTokenize("123 \x00 456", &.{ .number_literal, .invalid });
try testTokenize("//\x00", &.{.invalid});
try testTokenize("\\\\\x00", &.{.invalid});
try testTokenize("\x00", &.{.invalid});
try testTokenize("// NUL\x00\n", &.{.invalid});
- try testTokenize("///\x00\n", &.{.invalid});
+ try testTokenize("///\x00\n", &.{ .doc_comment, .invalid });
try testTokenize("/// NUL\x00\n", &.{ .doc_comment, .invalid });
}
@@ -1936,6 +1773,9 @@ fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !v
const token = tokenizer.next();
try std.testing.expectEqual(expected_token_tag, token.tag);
}
+ // Last token should always be eof, even when the last token was invalid,
+ // in which case the tokenizer is in an invalid state, which can only be
+ // recovered by opinionated means outside the scope of this implementation.
const last_token = tokenizer.next();
try std.testing.expectEqual(Token.Tag.eof, last_token.tag);
try std.testing.expectEqual(source.len, last_token.loc.start);
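
A sketch of the recovery behavior documented on next(): once the tokenizer enters the new invalid state, the rest of the line collapses into a single invalid token and scanning resumes after the newline (input chosen to match the "@0()" test above; the exact token boundaries are my reading of the state machine):

    const std = @import("std");

    test "tokenizer resynchronizes on the next newline" {
        var tokenizer = std.zig.Tokenizer.init("@0()\nconst");
        // '@0' starts a builtin with an invalid name; the rest of the
        // line is swallowed into one invalid token.
        try std.testing.expectEqual(std.zig.Token.Tag.invalid, tokenizer.next().tag);
        // Scanning resumes at the start of the following line.
        try std.testing.expectEqual(std.zig.Token.Tag.keyword_const, tokenizer.next().tag);
        try std.testing.expectEqual(std.zig.Token.Tag.eof, tokenizer.next().tag);
    }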
lib/std/unicode.zig
@@ -95,16 +95,13 @@ pub inline fn utf8EncodeComptime(comptime c: u21) [
const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error;
-/// Decodes the UTF-8 codepoint encoded in the given slice of bytes.
-/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable.
-/// If you already know the length at comptime, you can call one of
-/// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function.
+/// Deprecated. This function has an awkward API that is too easy to use incorrectly.
pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u21 {
return switch (bytes.len) {
- 1 => @as(u21, bytes[0]),
- 2 => utf8Decode2(bytes),
- 3 => utf8Decode3(bytes),
- 4 => utf8Decode4(bytes),
+ 1 => bytes[0],
+ 2 => utf8Decode2(bytes[0..2].*),
+ 3 => utf8Decode3(bytes[0..3].*),
+ 4 => utf8Decode4(bytes[0..4].*),
else => unreachable,
};
}
@@ -113,8 +110,7 @@ const Utf8Decode2Error = error{
Utf8ExpectedContinuation,
Utf8OverlongEncoding,
};
-pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 {
- assert(bytes.len == 2);
+pub fn utf8Decode2(bytes: [2]u8) Utf8Decode2Error!u21 {
assert(bytes[0] & 0b11100000 == 0b11000000);
var value: u21 = bytes[0] & 0b00011111;
@@ -130,7 +126,7 @@ pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 {
const Utf8Decode3Error = Utf8Decode3AllowSurrogateHalfError || error{
Utf8EncodesSurrogateHalf,
};
-pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u21 {
+pub fn utf8Decode3(bytes: [3]u8) Utf8Decode3Error!u21 {
const value = try utf8Decode3AllowSurrogateHalf(bytes);
if (0xd800 <= value and value <= 0xdfff) return error.Utf8EncodesSurrogateHalf;
@@ -142,8 +138,7 @@ const Utf8Decode3AllowSurrogateHalfError = error{
Utf8ExpectedContinuation,
Utf8OverlongEncoding,
};
-pub fn utf8Decode3AllowSurrogateHalf(bytes: []const u8) Utf8Decode3AllowSurrogateHalfError!u21 {
- assert(bytes.len == 3);
+pub fn utf8Decode3AllowSurrogateHalf(bytes: [3]u8) Utf8Decode3AllowSurrogateHalfError!u21 {
assert(bytes[0] & 0b11110000 == 0b11100000);
var value: u21 = bytes[0] & 0b00001111;
@@ -165,8 +160,7 @@ const Utf8Decode4Error = error{
Utf8OverlongEncoding,
Utf8CodepointTooLarge,
};
-pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 {
- assert(bytes.len == 4);
+pub fn utf8Decode4(bytes: [4]u8) Utf8Decode4Error!u21 {
assert(bytes[0] & 0b11111000 == 0b11110000);
var value: u21 = bytes[0] & 0b00000111;
@@ -1637,12 +1631,13 @@ pub fn wtf8Encode(c: u21, out: []u8) error{CodepointTooLarge}!u3 {
const Wtf8DecodeError = Utf8Decode2Error || Utf8Decode3AllowSurrogateHalfError || Utf8Decode4Error;
+/// Deprecated. This function has an awkward API that is too easy to use incorrectly.
pub fn wtf8Decode(bytes: []const u8) Wtf8DecodeError!u21 {
return switch (bytes.len) {
- 1 => @as(u21, bytes[0]),
- 2 => utf8Decode2(bytes),
- 3 => utf8Decode3AllowSurrogateHalf(bytes),
- 4 => utf8Decode4(bytes),
+ 1 => bytes[0],
+ 2 => utf8Decode2(bytes[0..2].*),
+ 3 => utf8Decode3AllowSurrogateHalf(bytes[0..3].*),
+ 4 => utf8Decode4(bytes[0..4].*),
else => unreachable,
};
}
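
With the decode helpers now taking fixed-size arrays by value, a wrong-length argument becomes a compile error rather than a runtime assert. A minimal sketch of the new calling convention (the bytes are the UTF-8 encoding of U+00E9):

    const std = @import("std");

    test "utf8Decode2 takes a [2]u8 by value" {
        // An anonymous list literal coerces to [2]u8.
        try std.testing.expectEqual(@as(u21, 0xE9), try std.unicode.utf8Decode2(.{ 0xC3, 0xA9 }));
        // From a runtime slice, dereference a comptime-length sub-array.
        const bytes: []const u8 = &.{ 0xC3, 0xA9 };
        try std.testing.expectEqual(@as(u21, 0xE9), try std.unicode.utf8Decode2(bytes[0..2].*));
    }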
src/Package/Manifest.zig
@@ -549,6 +549,9 @@ const Parse = struct {
.{raw_string[bad_index]},
);
},
+ .empty_char_literal => {
+ try p.appendErrorOff(token, offset, "empty character literal", .{});
+ },
}
}
test/cases/compile_errors/empty_char_lit.zig
@@ -0,0 +1,9 @@
+export fn entry() u8 {
+ return '';
+}
+
+// error
+// backend=stage2
+// target=native
+//
+// :2:12: error: empty character literal
test/cases/compile_errors/invalid_legacy_unicode_escape.zig
@@ -6,5 +6,4 @@ export fn entry() void {
// backend=stage2
// target=native
//
-// :2:15: error: expected expression, found 'invalid bytes'
-// :2:18: note: invalid byte: '1'
+// :2:17: error: invalid escape character: 'U'
test/cases/compile_errors/invalid_unicode_escape.zig
@@ -6,6 +6,5 @@ export fn entry() void {
// backend=stage2
// target=native
//
-// :2:15: error: expected expression, found 'invalid bytes'
-// :2:21: note: invalid byte: 'z'
+// :2:21: error: expected hex digit or '}', found 'z'
test/cases/compile_errors/normal_string_with_newline.zig
@@ -5,5 +5,4 @@ b";
// backend=stage2
// target=native
//
-// :1:13: error: expected expression, found 'invalid bytes'
-// :1:15: note: invalid byte: '\n'
+// :1:13: error: expected expression, found 'invalid token'
test/compile_errors.zig
@@ -42,8 +42,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
const case = ctx.obj("isolated carriage return in multiline string literal", b.graph.host);
case.addError("const foo = \\\\\test\r\r rogue carriage return\n;", &[_][]const u8{
- ":1:13: error: expected expression, found 'invalid bytes'",
- ":1:19: note: invalid byte: '\\r'",
+ ":1:13: error: expected expression, found 'invalid token'",
});
}
@@ -179,8 +178,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
\\ return true;
\\}
, &[_][]const u8{
- ":1:1: error: expected type expression, found 'invalid bytes'",
- ":1:1: note: invalid byte: '\\xff'",
+ ":1:1: error: expected type expression, found 'invalid token'",
});
}
@@ -222,8 +220,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
const case = ctx.obj("invalid byte in string", b.graph.host);
case.addError("_ = \"\x01Q\";", &[_][]const u8{
- ":1:5: error: expected expression, found 'invalid bytes'",
- ":1:6: note: invalid byte: '\\x01'",
+ ":1:5: error: expected expression, found 'invalid token'",
});
}
@@ -231,8 +228,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
const case = ctx.obj("invalid byte in comment", b.graph.host);
case.addError("//\x01Q", &[_][]const u8{
- ":1:1: error: expected type expression, found 'invalid bytes'",
- ":1:3: note: invalid byte: '\\x01'",
+ ":1:1: error: expected type expression, found 'invalid token'",
});
}
@@ -240,8 +236,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
const case = ctx.obj("control character in character literal", b.graph.host);
case.addError("const c = '\x01';", &[_][]const u8{
- ":1:11: error: expected expression, found 'invalid bytes'",
- ":1:12: note: invalid byte: '\\x01'",
+ ":1:11: error: expected expression, found 'invalid token'",
});
}
@@ -249,8 +244,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
const case = ctx.obj("invalid byte at start of token", b.graph.host);
case.addError("x = \x00Q", &[_][]const u8{
- ":1:5: error: expected expression, found 'invalid bytes'",
- ":1:5: note: invalid byte: '\\x00'",
+ ":1:5: error: expected expression, found 'invalid token'",
});
}
}