Commit 89953ec83d
Changed files (7):
doc/langref.html.in
src/all_types.hpp
src/tokenizer.cpp
src/tokenizer.hpp
std/zig/parser_test.zig
std/zig/tokenizer.zig
test/stage1/behavior/misc.zig

doc/langref.html.in
@@ -501,7 +501,16 @@ pub fn main() void {
</div>
{#see_also|Optionals|undefined#}
{#header_close#}
- {#header_open|String Literals#}
+ {#header_open|String Literals and Character Literals#}
+ <p>
+ String literals are UTF-8 encoded byte arrays.
+ </p>
+ <p>
+ Character literals have type {#syntax#}comptime_int{#endsyntax#}, the same as
+ {#link|Integer Literals#}. All {#link|Escape Sequences#} are valid in both string literals
+ and character literals. Once https://github.com/ziglang/zig/issues/2097 is implemented,
+ character literals will be allowed to contain a single UTF-8 encoded codepoint.
+ </p>
{#code_begin|test#}
const assert = @import("std").debug.assert;
const mem = @import("std").mem;
@@ -513,6 +522,7 @@ test "string literals" {
assert(normal_bytes.len == 5);
assert(normal_bytes[1] == 'e');
assert('e' == '\x65');
+ assert('\U01f4a9' == 128169);
assert(mem.eql(u8, "hello", "h\x65llo"));
// A C string literal is a null terminated pointer.
@@ -521,7 +531,7 @@ test "string literals" {
assert(null_terminated_bytes[5] == 0);
}
{#code_end#}
- {#see_also|Arrays|Zig Test#}
+ {#see_also|Arrays|Zig Test|Source Encoding#}
{#header_open|Escape Sequences#}
<div class="table-wrapper">
<table>
@@ -8530,7 +8540,7 @@ pub fn main() void {
);
}
{#code_end#}
- {#see_also|String Literals#}
+ {#see_also|String Literals and Character Literals#}
{#header_close#}
{#header_open|Import from C Header File#}
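A quick, hedged illustration of the comptime_int behavior the new paragraph describes (this test is not part of the commit): a character literal is just a comptime-known integer, so it coerces to any integer type wide enough for the codepoint.

const assert = @import("std").debug.assert;

test "character literals coerce like integer literals" {
    // '\U01f4a9' evaluates to the comptime_int 128169 (0x1f4a9).
    const as_u21: u21 = '\U01f4a9';
    const as_i64: i64 = '\U01f4a9';
    assert(as_u21 == 0x1f4a9);
    assert(as_i64 == 128169);
}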
src/all_types.hpp
@@ -845,7 +845,7 @@ struct AstNodeStringLiteral {
};
struct AstNodeCharLiteral {
- uint8_t value;
+ uint32_t value;
};
struct AstNodeFloatLiteral {
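The field widens from uint8_t to uint32_t because the largest Unicode codepoint, 0x10ffff, needs 21 bits. A tiny sketch of the bounds involved (not from the commit):

const assert = @import("std").debug.assert;

comptime {
    assert(0x1f4a9 > 0xff); // '\U01f4a9' no longer fits the old 8-bit field
    assert(0x10ffff <= 0xffffffff); // any valid codepoint fits in 32 bits
}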
src/tokenizer.cpp
@@ -1103,11 +1103,15 @@ void tokenize(Buf *buf, Tokenization *out) {
if (t.char_code_index >= t.char_code_end) {
if (t.unicode) {
- if (t.char_code <= 0x7f) {
+ if (t.char_code > 0x10ffff) {
+ tokenize_error(&t, "unicode value out of range: %x", t.char_code);
+ }
+ if (t.cur_tok->id == TokenIdCharLiteral) {
+ t.cur_tok->data.char_lit.c = t.char_code;
+ t.state = TokenizeStateCharLiteralEnd;
+ } else if (t.char_code <= 0x7f) {
// 00000000 00000000 00000000 0xxxxxxx
handle_string_escape(&t, (uint8_t)t.char_code);
- } else if (t.cur_tok->id == TokenIdCharLiteral) {
- tokenize_error(&t, "unicode value too large for character literal: %x", t.char_code);
} else if (t.char_code <= 0x7ff) {
// 00000000 00000000 00000xxx xx000000
handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6)));
@@ -1129,14 +1133,9 @@ void tokenize(Buf *buf, Tokenization *out) {
handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
// 00000000 00000000 00000000 00xxxxxx
handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
- } else {
- tokenize_error(&t, "unicode value out of range: %x", t.char_code);
}
} else {
- if (t.cur_tok->id == TokenIdCharLiteral && t.char_code > UINT8_MAX) {
- tokenize_error(&t, "value too large for character literal: '%x'",
- t.char_code);
- }
+ assert(t.char_code <= 255);
handle_string_escape(&t, (uint8_t)t.char_code);
}
}
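For string literals the escape's codepoint is still emitted byte by byte, and the bit patterns in the comments above are standard UTF-8 encoding. A standalone sketch of that byte layout (assumed helper names, not the compiler's handle_string_escape path):

const mem = @import("std").mem;
const expect = @import("std").testing.expect;

fn encodeUtf8(c: u32, out: []u8) usize {
    if (c <= 0x7f) {
        // 0xxxxxxx
        out[0] = @intCast(u8, c);
        return 1;
    } else if (c <= 0x7ff) {
        // 110xxxxx 10xxxxxx
        out[0] = @intCast(u8, 0xc0 | (c >> 6));
        out[1] = @intCast(u8, 0x80 | (c & 0x3f));
        return 2;
    } else if (c <= 0xffff) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        out[0] = @intCast(u8, 0xe0 | (c >> 12));
        out[1] = @intCast(u8, 0x80 | ((c >> 6) & 0x3f));
        out[2] = @intCast(u8, 0x80 | (c & 0x3f));
        return 3;
    } else {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        out[0] = @intCast(u8, 0xf0 | (c >> 18));
        out[1] = @intCast(u8, 0x80 | ((c >> 12) & 0x3f));
        out[2] = @intCast(u8, 0x80 | ((c >> 6) & 0x3f));
        out[3] = @intCast(u8, 0x80 | (c & 0x3f));
        return 4;
    }
}

test "U+1F4A9 encodes to four bytes" {
    var buf: [4]u8 = undefined;
    expect(encodeUtf8(0x1f4a9, buf[0..]) == 4);
    expect(mem.eql(u8, buf[0..], "\xf0\x9f\x92\xa9"));
}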
src/tokenizer.hpp
@@ -148,7 +148,7 @@ struct TokenStrLit {
};
struct TokenCharLit {
- uint8_t c;
+ uint32_t c;
};
struct Token {
std/zig/parser_test.zig
@@ -1,3 +1,10 @@
+test "zig fmt: character literal larger than u8" {
+ try testCanonical(
+ \\const x = '\U01f4a9';
+ \\
+ );
+}
+
test "zig fmt: infix operator and then multiline string literal" {
try testCanonical(
\\const x = "" ++
std/zig/tokenizer.zig
@@ -236,8 +236,7 @@ pub const Tokenizer = struct {
MultilineStringLiteralLine,
CharLiteral,
CharLiteralBackslash,
- CharLiteralEscape1,
- CharLiteralEscape2,
+ CharLiteralHexEscape,
CharLiteralEnd,
Backslash,
Equal,
@@ -293,6 +292,8 @@ pub const Tokenizer = struct {
.start = self.index,
.end = undefined,
};
+ var seen_escape_digits: usize = undefined;
+ var expected_escape_digits: usize = undefined;
while (self.index < self.buffer.len) : (self.index += 1) {
const c = self.buffer[self.index];
switch (state) {
@@ -658,26 +659,31 @@ pub const Tokenizer = struct {
break;
},
'x' => {
- state = State.CharLiteralEscape1;
+ state = State.CharLiteralHexEscape;
+ seen_escape_digits = 0;
+ expected_escape_digits = 2;
},
- else => {
- state = State.CharLiteralEnd;
+ 'u' => {
+ state = State.CharLiteralHexEscape;
+ seen_escape_digits = 0;
+ expected_escape_digits = 4;
},
- },
-
- State.CharLiteralEscape1 => switch (c) {
- '0'...'9', 'a'...'z', 'A'...'F' => {
- state = State.CharLiteralEscape2;
+ 'U' => {
+ state = State.CharLiteralHexEscape;
+ seen_escape_digits = 0;
+ expected_escape_digits = 6;
},
else => {
- result.id = Token.Id.Invalid;
- break;
+ state = State.CharLiteralEnd;
},
},
- State.CharLiteralEscape2 => switch (c) {
+ State.CharLiteralHexEscape => switch (c) {
'0'...'9', 'a'...'z', 'A'...'F' => {
- state = State.CharLiteralEnd;
+ seen_escape_digits += 1;
+ if (seen_escape_digits == expected_escape_digits) {
+ state = State.CharLiteralEnd;
+ }
},
else => {
result.id = Token.Id.Invalid;
@@ -1045,8 +1051,7 @@ pub const Tokenizer = struct {
State.Backslash,
State.CharLiteral,
State.CharLiteralBackslash,
- State.CharLiteralEscape1,
- State.CharLiteralEscape2,
+ State.CharLiteralHexEscape,
State.CharLiteralEnd,
State.StringLiteralBackslash,
State.LBracketStar,
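The self-hosted tokenizer now counts hex digits instead of hard-coding two escape states. A standalone sketch of the same rule (hypothetical helper, not part of std.zig.Tokenizer): '\x' expects 2 digits, '\u' expects 4, and '\U' expects 6.

const expect = @import("std").testing.expect;

fn parseHexEscape(bytes: []const u8) !u32 {
    if (bytes.len == 0) return error.InvalidEscape;
    // The escape kind determines how many hex digits must follow.
    const expected_digits: usize = switch (bytes[0]) {
        'x' => 2, // \xNN
        'u' => 4, // \uNNNN
        'U' => 6, // \UNNNNNN
        else => return error.InvalidEscape,
    };
    if (bytes.len != expected_digits + 1) return error.InvalidEscape;
    var value: u32 = 0;
    for (bytes[1..]) |c| {
        const digit: u32 = switch (c) {
            '0'...'9' => c - '0',
            'a'...'f' => c - 'a' + 10,
            'A'...'F' => c - 'A' + 10,
            else => return error.InvalidEscape,
        };
        value = value * 16 + digit;
    }
    return value;
}

test "escape digit counts" {
    expect((try parseHexEscape("x65")) == 'e');
    expect((try parseHexEscape("U01f4a9")) == 128169);
}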
test/stage1/behavior/misc.zig
@@ -699,3 +699,8 @@ test "thread local variable" {
S.t += 1;
expect(S.t == 1235);
}
+
+test "unicode escape in character literal" {
+ var a: u24 = '\U01f4a9';
+ expect(a == 128169);
+}