Commit ae7392e504
Changed files (4)
doc/langref.html.in
@@ -552,8 +552,7 @@ pub fn main() void {
<p>
Character literals have type {#syntax#}comptime_int{#endsyntax#}, the same as
{#link|Integer Literals#}. All {#link|Escape Sequences#} are valid in both string literals
- and character literals. Once https://github.com/ziglang/zig/issues/2097 is implemented,
- character literals will be allowed to have a single UTF-8 encoded codepoint.
+ and character literals.
</p>
{#code_begin|test#}
const assert = @import("std").debug.assert;
@@ -567,6 +566,7 @@ test "string literals" {
assert(normal_bytes[1] == 'e');
assert('e' == '\x65');
assert('\u{1f4a9}' == 128169);
+ assert('💯' == 128175);
assert(mem.eql(u8, "hello", "h\x65llo"));
// A C string literal is a null terminated pointer.
lib/std/zig/tokenizer.zig
@@ -371,6 +371,7 @@ pub const Tokenizer = struct {
CharLiteralUnicodeEscapeSawU,
CharLiteralUnicodeEscape,
CharLiteralUnicodeInvalid,
+ CharLiteralUnicode,
CharLiteralEnd,
Backslash,
Equal,
@@ -427,6 +428,7 @@ pub const Tokenizer = struct {
.end = undefined,
};
var seen_escape_digits: usize = undefined;
+ var remaining_code_units: usize = undefined;
while (self.index < self.buffer.len) : (self.index += 1) {
const c = self.buffer[self.index];
switch (state) {
@@ -774,16 +776,23 @@ pub const Tokenizer = struct {
'\\' => {
state = State.CharLiteralBackslash;
},
- '\'' => {
+ '\'', 0x80...0xbf, 0xf8...0xff => {
result.id = Token.Id.Invalid;
break;
},
+ 0xc0...0xdf => { // 110xxxxx
+ remaining_code_units = 1;
+ state = State.CharLiteralUnicode;
+ },
+ 0xe0...0xef => { // 1110xxxx
+ remaining_code_units = 2;
+ state = State.CharLiteralUnicode;
+ },
+ 0xf0...0xf7 => { // 11110xxx
+ remaining_code_units = 3;
+ state = State.CharLiteralUnicode;
+ },
else => {
- if (c < 0x20 or c == 0x7f) {
- result.id = Token.Id.Invalid;
- break;
- }
-
state = State.CharLiteralEnd;
},
},
@@ -867,6 +876,19 @@ pub const Tokenizer = struct {
},
},
+ State.CharLiteralUnicode => switch (c) {
+ 0x80...0xbf => {
+ remaining_code_units -= 1;
+ if (remaining_code_units == 0) {
+ state = State.CharLiteralEnd;
+ }
+ },
+ else => {
+ result.id = Token.Id.Invalid;
+ break;
+ },
+ },
+
State.MultilineStringLiteralLine => switch (c) {
'\n' => {
self.index += 1;
@@ -1220,6 +1242,7 @@ pub const Tokenizer = struct {
State.CharLiteralUnicodeEscape,
State.CharLiteralUnicodeInvalid,
State.CharLiteralEnd,
+ State.CharLiteralUnicode,
State.StringLiteralBackslash,
State.LBracketStar,
State.LBracketStarC,
@@ -1428,6 +1451,12 @@ test "tokenizer - char literal with unicode escapes" {
, [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid });
}
+test "tokenizer - char literal with unicode code point" {
+ testTokenize(
+ \\'๐ฉ'
+ , [_]Token.Id{.CharLiteral});
+}
+
test "tokenizer - float literal e exponent" {
testTokenize("a = 4.94065645841246544177e-324;\n", [_]Token.Id{
Token.Id.Identifier,
src/tokenizer.cpp
@@ -193,6 +193,7 @@ enum TokenizeState {
TokenizeStateStringEscapeUnicodeStart,
TokenizeStateCharLiteral,
TokenizeStateCharLiteralEnd,
+ TokenizeStateCharLiteralUnicode,
TokenizeStateSawStar,
TokenizeStateSawStarPercent,
TokenizeStateSawSlash,
@@ -247,6 +248,7 @@ struct Tokenize {
int exponent_in_bin_or_dec;
BigInt specified_exponent;
BigInt significand;
+ size_t remaining_code_units;
};
ATTRIBUTE_PRINTF(2, 3)
@@ -1176,17 +1178,32 @@ void tokenize(Buf *buf, Tokenization *out) {
}
break;
case TokenizeStateCharLiteral:
- switch (c) {
- case '\'':
- tokenize_error(&t, "expected character");
- break;
- case '\\':
- t.state = TokenizeStateStringEscape;
- break;
- default:
- t.cur_tok->data.char_lit.c = c;
- t.state = TokenizeStateCharLiteralEnd;
- break;
+ if (c == '\'') {
+ tokenize_error(&t, "expected character");
+ } else if (c == '\\') {
+ t.state = TokenizeStateStringEscape;
+ } else if ((c >= 0x80 && c <= 0xbf) || c >= 0xf8) {
+ // 10xxxxxx
+ // 11111xxx
+ invalid_char_error(&t, c);
+ } else if (c >= 0xc0 && c <= 0xdf) {
+ // 110xxxxx
+ t.cur_tok->data.char_lit.c = c & 0x1f;
+ t.remaining_code_units = 1;
+ t.state = TokenizeStateCharLiteralUnicode;
+ } else if (c >= 0xe0 && c <= 0xef) {
+ // 1110xxxx
+ t.cur_tok->data.char_lit.c = c & 0x0f;
+ t.remaining_code_units = 2;
+ t.state = TokenizeStateCharLiteralUnicode;
+ } else if (c >= 0xf0 && c <= 0xf7) {
+ // 11110xxx
+ t.cur_tok->data.char_lit.c = c & 0x07;
+ t.remaining_code_units = 3;
+ t.state = TokenizeStateCharLiteralUnicode;
+ } else {
+ t.cur_tok->data.char_lit.c = c;
+ t.state = TokenizeStateCharLiteralEnd;
}
break;
case TokenizeStateCharLiteralEnd:
@@ -1199,6 +1216,17 @@ void tokenize(Buf *buf, Tokenization *out) {
invalid_char_error(&t, c);
}
break;
+ case TokenizeStateCharLiteralUnicode:
+ if (c <= 0x7f || c >= 0xc0) {
+ invalid_char_error(&t, c);
+ }
+ t.cur_tok->data.char_lit.c <<= 6;
+ t.cur_tok->data.char_lit.c += c & 0x3f;
+ t.remaining_code_units--;
+ if (t.remaining_code_units == 0) {
+ t.state = TokenizeStateCharLiteralEnd;
+ }
+ break;
case TokenizeStateZero:
switch (c) {
case 'b':
@@ -1434,6 +1462,7 @@ void tokenize(Buf *buf, Tokenization *out) {
break;
case TokenizeStateCharLiteral:
case TokenizeStateCharLiteralEnd:
+ case TokenizeStateCharLiteralUnicode:
tokenize_error(&t, "unterminated character literal");
break;
case TokenizeStateSymbol:
test/stage1/behavior/misc.zig
@@ -699,6 +699,10 @@ test "unicode escape in character literal" {
expect(a == 128169);
}
+test "unicode character in character literal" {
+ expect('💩' == 128169);
+}
+
test "result location zero sized array inside struct field implicit cast to slice" {
const E = struct {
entries: []u32,