Commit ae7392e504

Nick Erdmann <n@nirf.de>
2019-10-06 19:52:35
unicode character literals
1 parent 5711234
Changed files (4)
doc
lib
src
test
stage1
behavior
doc/langref.html.in
@@ -552,8 +552,7 @@ pub fn main() void {
       <p>
       Character literals have type {#syntax#}comptime_int{#endsyntax#}, the same as
       {#link|Integer Literals#}. All {#link|Escape Sequences#} are valid in both string literals
-      and character literals. Once https://github.com/ziglang/zig/issues/2097 is implemented,
-      character literals will be allowed to have a single UTF-8 encoded codepoint.
+      and character literals.
       </p>
       {#code_begin|test#}
 const assert = @import("std").debug.assert;
@@ -567,6 +566,7 @@ test "string literals" {
     assert(normal_bytes[1] == 'e');
     assert('e' == '\x65');
     assert('\u{1f4a9}' == 128169);
+    assert('💯' == 128175);
     assert(mem.eql(u8, "hello", "h\x65llo"));
 
     // A C string literal is a null terminated pointer.
lib/std/zig/tokenizer.zig
@@ -371,6 +371,7 @@ pub const Tokenizer = struct {
         CharLiteralUnicodeEscapeSawU,
         CharLiteralUnicodeEscape,
         CharLiteralUnicodeInvalid,
+        CharLiteralUnicode,
         CharLiteralEnd,
         Backslash,
         Equal,
@@ -427,6 +428,7 @@ pub const Tokenizer = struct {
             .end = undefined,
         };
         var seen_escape_digits: usize = undefined;
+        var remaining_code_units: usize = undefined;
         while (self.index < self.buffer.len) : (self.index += 1) {
             const c = self.buffer[self.index];
             switch (state) {
@@ -774,16 +776,23 @@ pub const Tokenizer = struct {
                     '\\' => {
                         state = State.CharLiteralBackslash;
                     },
-                    '\'' => {
+                    '\'', 0x80...0xbf, 0xf8...0xff => {
                         result.id = Token.Id.Invalid;
                         break;
                     },
+                    0xc0...0xdf => { // 110xxxxx
+                        remaining_code_units = 1;
+                        state = State.CharLiteralUnicode;
+                    },
+                    0xe0...0xef => { // 1110xxxx
+                        remaining_code_units = 2;
+                        state = State.CharLiteralUnicode;
+                    },
+                    0xf0...0xf7 => { // 11110xxx
+                        remaining_code_units = 3;
+                        state = State.CharLiteralUnicode;
+                    },
                     else => {
-                        if (c < 0x20 or c == 0x7f) {
-                            result.id = Token.Id.Invalid;
-                            break;
-                        }
-
                         state = State.CharLiteralEnd;
                     },
                 },
@@ -867,6 +876,19 @@ pub const Tokenizer = struct {
                     },
                 },
 
+                State.CharLiteralUnicode => switch (c) {
+                    0x80...0xbf => {
+                        remaining_code_units -= 1;
+                        if (remaining_code_units == 0) {
+                            state = State.CharLiteralEnd;
+                        }
+                    },
+                    else => {
+                        result.id = Token.Id.Invalid;
+                        break;
+                    },
+                },
+
                 State.MultilineStringLiteralLine => switch (c) {
                     '\n' => {
                         self.index += 1;
@@ -1220,6 +1242,7 @@ pub const Tokenizer = struct {
                 State.CharLiteralUnicodeEscape,
                 State.CharLiteralUnicodeInvalid,
                 State.CharLiteralEnd,
+                State.CharLiteralUnicode,
                 State.StringLiteralBackslash,
                 State.LBracketStar,
                 State.LBracketStarC,
@@ -1428,6 +1451,12 @@ test "tokenizer - char literal with unicode escapes" {
     , [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid });
 }
 
+test "tokenizer - char literal with unicode code point" {
+    testTokenize(
+        \\'๐Ÿ’ฉ'
+    , [_]Token.Id{.CharLiteral});
+}
+
 test "tokenizer - float literal e exponent" {
     testTokenize("a = 4.94065645841246544177e-324;\n", [_]Token.Id{
         Token.Id.Identifier,
src/tokenizer.cpp
@@ -193,6 +193,7 @@ enum TokenizeState {
     TokenizeStateStringEscapeUnicodeStart,
     TokenizeStateCharLiteral,
     TokenizeStateCharLiteralEnd,
+    TokenizeStateCharLiteralUnicode,
     TokenizeStateSawStar,
     TokenizeStateSawStarPercent,
     TokenizeStateSawSlash,
@@ -247,6 +248,7 @@ struct Tokenize {
     int exponent_in_bin_or_dec;
     BigInt specified_exponent;
     BigInt significand;
+    size_t remaining_code_units;
 };
 
 ATTRIBUTE_PRINTF(2, 3)
@@ -1176,17 +1178,32 @@ void tokenize(Buf *buf, Tokenization *out) {
                 }
                 break;
             case TokenizeStateCharLiteral:
-                switch (c) {
-                    case '\'':
-                        tokenize_error(&t, "expected character");
-                        break;
-                    case '\\':
-                        t.state = TokenizeStateStringEscape;
-                        break;
-                    default:
-                        t.cur_tok->data.char_lit.c = c;
-                        t.state = TokenizeStateCharLiteralEnd;
-                        break;
+                if (c == '\'') {
+                    tokenize_error(&t, "expected character");
+                } else if (c == '\\') {
+                    t.state = TokenizeStateStringEscape;
+                } else if ((c >= 0x80 && c <= 0xbf) || c >= 0xf8) {
+                    // 10xxxxxx
+                    // 11111xxx
+                    invalid_char_error(&t, c);
+                } else if (c >= 0xc0 && c <= 0xdf) {
+                    // 110xxxxx
+                    t.cur_tok->data.char_lit.c = c & 0x1f;
+                    t.remaining_code_units = 1;
+                    t.state = TokenizeStateCharLiteralUnicode;
+                } else if (c >= 0xe0 && c <= 0xef) {
+                    // 1110xxxx
+                    t.cur_tok->data.char_lit.c = c & 0x0f;
+                    t.remaining_code_units = 2;
+                    t.state = TokenizeStateCharLiteralUnicode;
+                } else if (c >= 0xf0 && c <= 0xf7) {
+                    // 11110xxx
+                    t.cur_tok->data.char_lit.c = c & 0x07;
+                    t.remaining_code_units = 3;
+                    t.state = TokenizeStateCharLiteralUnicode;
+                } else {
+                    t.cur_tok->data.char_lit.c = c;
+                    t.state = TokenizeStateCharLiteralEnd;
                 }
                 break;
             case TokenizeStateCharLiteralEnd:
@@ -1199,6 +1216,17 @@ void tokenize(Buf *buf, Tokenization *out) {
                         invalid_char_error(&t, c);
                 }
                 break;
+            case TokenizeStateCharLiteralUnicode:
+                if (c <= 0x7f || c >= 0xc0) {
+                    invalid_char_error(&t, c);
+                }
+                t.cur_tok->data.char_lit.c <<= 6;
+                t.cur_tok->data.char_lit.c += c & 0x3f;
+                t.remaining_code_units--;
+                if (t.remaining_code_units == 0) {
+                    t.state = TokenizeStateCharLiteralEnd;
+                }
+                break;
             case TokenizeStateZero:
                 switch (c) {
                     case 'b':
@@ -1434,6 +1462,7 @@ void tokenize(Buf *buf, Tokenization *out) {
             break;
         case TokenizeStateCharLiteral:
         case TokenizeStateCharLiteralEnd:
+        case TokenizeStateCharLiteralUnicode:
             tokenize_error(&t, "unterminated character literal");
             break;
         case TokenizeStateSymbol:
test/stage1/behavior/misc.zig
@@ -699,6 +699,10 @@ test "unicode escape in character literal" {
     expect(a == 128169);
 }
 
+test "unicode character in character literal" {
+    expect('💩' == 128169);
+}
+
 test "result location zero sized array inside struct field implicit cast to slice" {
     const E = struct {
         entries: []u32,