Commit 346ec15c50
Changed files (6):
  doc/langref.html.in
  lib/std/zig/Ast.zig
  lib/std/zig/tokenizer.zig
  src/AstGen.zig
  test/compare_output.zig
  test/compile_errors.zig
doc/langref.html.in
@@ -11551,7 +11551,8 @@ fn readU32Be() u32 {}
</p>
<p>
Each LF may be immediately preceded by a single CR (byte value 0x0d, code point U+000d, {#syntax#}'\r'{#endsyntax#})
- to form a Windows style line ending, but this is discouraged.
+ to form a Windows style line ending, but this is discouraged. Note that in multiline strings, CRLF sequences will
+ be encoded as LF when compiled into a Zig program.
A CR in any other context is not allowed.
</p>
<p>
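
For illustration (not part of the commit), a minimal sketch of the documented behavior: the literal below compiles to LF-joined lines whether the source file is saved with LF or CRLF endings.

    const std = @import("std");

    // Illustrative test: each line of the multiline literal is joined
    // with a single '\n', even when the file itself uses CRLF endings.
    test "multiline string lines are joined with LF" {
        const s =
            \\A Multiline
            \\String
        ;
        try std.testing.expectEqualStrings("A Multiline\nString", s);
    }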
lib/std/zig/Ast.zig
@@ -171,7 +171,7 @@ pub fn tokenSlice(tree: Ast, token_index: TokenIndex) []const u8 {
.index = token_starts[token_index],
.pending_invalid_token = null,
};
- const token = tokenizer.next();
+ const token = tokenizer.findTagAtCurrentIndex(token_tag);
assert(token.tag == token_tag);
return tree.source[token.loc.start..token.loc.end];
}
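
For context, a hedged usage sketch of tokenSlice, assuming the std.zig.parse API of this era. The switch to findTagAtCurrentIndex keeps the assert above from tripping when the requested token is an .invalid token that was queued as a pending_invalid_token rather than returned directly by next().

    const std = @import("std");

    test "tokenSlice re-tokenizes at the token's start offset" {
        const gpa = std.testing.allocator;
        var tree = try std.zig.parse(gpa, "const answer = 42;");
        defer tree.deinit(gpa);
        // Token 0 is the `const` keyword; tokenSlice re-runs the tokenizer
        // from that token's recorded start offset and returns its bytes.
        try std.testing.expectEqualStrings("const", tree.tokenSlice(0));
    }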
lib/std/zig/tokenizer.zig
@@ -406,6 +406,38 @@ pub const Tokenizer = struct {
saw_at_sign,
};
+ /// This is a workaround for the fact that the tokenizer can queue up
+ /// 'pending_invalid_token's when parsing literals, which means that we need
+ /// to scan from the start of the current line to find a matching tag - just
+ /// in case it was an invalid character generated during literal
+ /// tokenization. Ideally this processing would be pushed to the AST parser
+ /// or another later stage, both to give more useful error messages with
+ /// that extra context and to allow this workaround to be removed.
+ pub fn findTagAtCurrentIndex(self: *Tokenizer, tag: Token.Tag) Token {
+ if (tag == .invalid) {
+ const target_index = self.index;
+ var starting_index = target_index;
+ while (starting_index > 0) {
+ if (self.buffer[starting_index] == '\n') {
+ break;
+ }
+ starting_index -= 1;
+ }
+
+ self.index = starting_index;
+ while (self.index <= target_index or self.pending_invalid_token != null) {
+ const result = self.next();
+ if (result.loc.start == target_index and result.tag == tag) {
+ return result;
+ }
+ }
+ unreachable;
+ } else {
+ return self.next();
+ }
+ }
+
pub fn next(self: *Tokenizer) Token {
if (self.pending_invalid_token) |token| {
self.pending_invalid_token = null;
@@ -1127,7 +1159,7 @@ pub const Tokenizer = struct {
state = .start;
result.loc.start = self.index + 1;
},
- '\t', '\r' => state = .line_comment,
+ '\t' => state = .line_comment,
else => {
state = .line_comment;
self.checkLiteralCharacter();
@@ -1141,7 +1173,7 @@ pub const Tokenizer = struct {
result.tag = .doc_comment;
break;
},
- '\t', '\r' => {
+ '\t' => {
state = .doc_comment;
result.tag = .doc_comment;
},
@@ -1163,12 +1195,12 @@ pub const Tokenizer = struct {
state = .start;
result.loc.start = self.index + 1;
},
- '\t', '\r' => {},
+ '\t' => {},
else => self.checkLiteralCharacter(),
},
.doc_comment => switch (c) {
0, '\n' => break,
- '\t', '\r' => {},
+ '\t' => {},
else => self.checkLiteralCharacter(),
},
.int => switch (c) {
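
With '\r' removed from these whitespace arms, a CR inside a comment now falls through to checkLiteralCharacter, where the rule added below rejects it unless an LF follows immediately. An illustrative test of the resulting behavior (relying on the tokenizer flushing its pending invalid token at end of input):

    const std = @import("std");

    test "a lone CR inside a line comment is rejected" {
        // The stray '\r' is queued as a pending invalid token while the
        // comment is scanned, and surfaces when the tokenizer reaches EOF.
        var tokenizer = std.zig.Tokenizer.init("// bad \r comment\n");
        try std.testing.expectEqual(std.zig.Token.Tag.invalid, tokenizer.next().tag);
    }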
@@ -1239,7 +1271,15 @@ pub const Tokenizer = struct {
fn getInvalidCharacterLength(self: *Tokenizer) u3 {
const c0 = self.buffer[self.index];
if (std.ascii.isASCII(c0)) {
- if (std.ascii.isControl(c0)) {
+ if (c0 == '\r') {
+ if (self.index + 1 < self.buffer.len and self.buffer[self.index + 1] == '\n') {
+ // Carriage returns are *only* allowed just before a linefeed as part of a CRLF pair;
+ // otherwise they constitute an illegal byte.
+ return 0;
+ } else {
+ return 1;
+ }
+ } else if (std.ascii.isControl(c0)) {
// ascii control codes are never allowed
// (note that \n was checked before we got here)
return 1;
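
An illustrative pairing of the two branches above: a CR directly followed by an LF reports length 0 and is skipped, while any other CR is flagged as a one-byte invalid character.

    const std = @import("std");

    test "CRLF terminating a comment is accepted" {
        // The CR is followed by an LF, so getInvalidCharacterLength
        // returns 0 and scanning proceeds to the next line's first token.
        var tokenizer = std.zig.Tokenizer.init("// fine\r\nconst x = 1;");
        try std.testing.expectEqual(std.zig.Token.Tag.keyword_const, tokenizer.next().tag);
    }

    test "a CR at the end of the buffer is rejected" {
        // No LF follows, so the CR is a one-byte invalid character.
        var tokenizer = std.zig.Tokenizer.init("// bad\r");
        try std.testing.expectEqual(std.zig.Token.Tag.invalid, tokenizer.next().tag);
    }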
src/AstGen.zig
@@ -10491,14 +10491,16 @@ fn strLitNodeAsString(astgen: *AstGen, node: Ast.Node.Index) !IndexSlice {
var tok_i = start;
{
const slice = tree.tokenSlice(tok_i);
- const line_bytes = slice[2 .. slice.len - 1];
+ const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
+ const line_bytes = slice[2 .. slice.len - carriage_return_ending];
try string_bytes.appendSlice(gpa, line_bytes);
tok_i += 1;
}
// Following lines: each line prepends a newline.
while (tok_i <= end) : (tok_i += 1) {
const slice = tree.tokenSlice(tok_i);
- const line_bytes = slice[2 .. slice.len - 1];
+ const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
+ const line_bytes = slice[2 .. slice.len - carriage_return_ending];
try string_bytes.ensureUnusedCapacity(gpa, line_bytes.len + 1);
string_bytes.appendAssumeCapacity('\n');
string_bytes.appendSliceAssumeCapacity(line_bytes);
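
Restated as a standalone sketch (lineBytes is a hypothetical helper, not part of the commit): each multiline-string-line token spans the leading backslashes through the terminating newline, and both CRLF and LF terminators now yield the same payload.

    const std = @import("std");

    // Hypothetical helper mirroring the hunk's logic: `slice` is the raw
    // source of one multiline-string-literal line token, i.e. "\\" + line
    // bytes + "\n" (or "\r\n" in a CRLF file). Strip the prefix and the
    // terminator so both endings normalize to the same bytes.
    fn lineBytes(slice: []const u8) []const u8 {
        const terminator_len: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
        return slice[2 .. slice.len - terminator_len];
    }

    test "CRLF- and LF-terminated lines yield identical bytes" {
        try std.testing.expectEqualStrings("hello", lineBytes("\\\\hello\n"));
        try std.testing.expectEqualStrings("hello", lineBytes("\\\\hello\r\n"));
    }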
test/compare_output.zig
@@ -535,4 +535,13 @@ pub fn addCases(cases: *tests.CompareOutputContext) void {
\\debug: free - len: 5
\\
);
+
+ cases.add("valid carriage return example", "const io = @import(\"std\").io;\r\n" ++ // Testing CRLF line endings are valid
+ "\r\n" ++
+ "pub \r fn main() void {\r\n" ++ // Testing isolated carriage return as whitespace is valid
+ " const stdout = io.getStdOut().writer();\r\n" ++
+ " stdout.print(\\\\A Multiline\r\n" ++ // testing CRLF at end of multiline string line is valid and normalises to \n in the output
+ " \\\\String\r\n" ++
+ " , .{}) catch unreachable;\r\n" ++
+ "}\r\n", "A Multiline\nString");
}
test/compile_errors.zig
@@ -174,6 +174,16 @@ pub fn addCases(ctx: *TestContext) !void {
});
}
+ {
+ const case = ctx.obj("isolated carriage return in multiline string literal", .{});
+ case.backend = .stage2;
+
+ case.addError("const foo = \\\\\test\r\r rogue carriage return\n;", &[_][]const u8{
+ ":1:19: error: expected ';' after declaration",
+ ":1:20: note: invalid byte: '\\r'",
+ });
+ }
+
{
const case = ctx.obj("missing semicolon at EOF", .{});
case.addError(