Commit 89953ec83d

Andrew Kelley <andrew@ziglang.org>
2019-03-23 22:35:21
character literals: allow unicode escapes
Also make the documentation for character literals clearer. Closes #2089. See #2097.
1 parent 55cb9ef
doc/langref.html.in
@@ -501,7 +501,16 @@ pub fn main() void {
       </div>
       {#see_also|Optionals|undefined#}
       {#header_close#}
-      {#header_open|String Literals#}
+      {#header_open|String Literals and Character Literals#}
+      <p>
+      String literals are UTF-8 encoded byte arrays.
+      </p>
+      <p>
+      Character literals have type {#syntax#}comptime_int{#endsyntax#}, the same as
+      {#link|Integer Literals#}. All {#link|Escape Sequences#} are valid in both string literals
+      and character literals. Once <a href="https://github.com/ziglang/zig/issues/2097">#2097</a> is implemented,
+      character literals will be allowed to contain a single UTF-8 encoded code point.
+      </p>
       {#code_begin|test#}
 const assert = @import("std").debug.assert;
 const mem = @import("std").mem;
@@ -513,6 +522,7 @@ test "string literals" {
     assert(normal_bytes.len == 5);
     assert(normal_bytes[1] == 'e');
     assert('e' == '\x65');
+    assert('\U01f4a9' == 128169);
     assert(mem.eql(u8, "hello", "h\x65llo"));
 
     // A C string literal is a null terminated pointer.
@@ -521,7 +531,7 @@ test "string literals" {
     assert(null_terminated_bytes[5] == 0);
 }
       {#code_end#}
-      {#see_also|Arrays|Zig Test#}
+      {#see_also|Arrays|Zig Test|Source Encoding#}
       {#header_open|Escape Sequences#}
       <div class="table-wrapper">
       <table>
@@ -8530,7 +8540,7 @@ pub fn main() void {
     );
 }
       {#code_end#}
-      {#see_also|String Literals#}
+      {#see_also|String Literals and Character Literals#}
       {#header_close#}
 
       {#header_open|Import from C Header File#}
src/all_types.hpp
@@ -845,7 +845,7 @@ struct AstNodeStringLiteral {
 };
 
 struct AstNodeCharLiteral {
-    uint8_t value;
+    uint32_t value;
 };
 
 struct AstNodeFloatLiteral {
src/tokenizer.cpp
@@ -1103,11 +1103,15 @@ void tokenize(Buf *buf, Tokenization *out) {
 
                     if (t.char_code_index >= t.char_code_end) {
                         if (t.unicode) {
-                            if (t.char_code <= 0x7f) {
+                            if (t.char_code > 0x10ffff) {
+                                tokenize_error(&t, "unicode value out of range: %x", t.char_code);
+                            }
+                            if (t.cur_tok->id == TokenIdCharLiteral) {
+                                t.cur_tok->data.char_lit.c = t.char_code;
+                                t.state = TokenizeStateCharLiteralEnd;
+                            } else if (t.char_code <= 0x7f) {
                                 // 00000000 00000000 00000000 0xxxxxxx
                                 handle_string_escape(&t, (uint8_t)t.char_code);
-                            } else if (t.cur_tok->id == TokenIdCharLiteral) {
-                                tokenize_error(&t, "unicode value too large for character literal: %x", t.char_code);
                             } else if (t.char_code <= 0x7ff) {
                                 // 00000000 00000000 00000xxx xx000000
                                 handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6)));
@@ -1129,14 +1133,9 @@ void tokenize(Buf *buf, Tokenization *out) {
                                 handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
                                 // 00000000 00000000 00000000 00xxxxxx
                                 handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
-                            } else {
-                                tokenize_error(&t, "unicode value out of range: %x", t.char_code);
                             }
                         } else {
-                            if (t.cur_tok->id == TokenIdCharLiteral && t.char_code > UINT8_MAX) {
-                                tokenize_error(&t, "value too large for character literal: '%x'",
-                                        t.char_code);
-                            }
+                            assert(t.char_code <= 255);
                             handle_string_escape(&t, (uint8_t)t.char_code);
                         }
                     }
src/tokenizer.hpp
@@ -148,7 +148,7 @@ struct TokenStrLit {
 };
 
 struct TokenCharLit {
-    uint8_t c;
+    uint32_t c;
 };
 
 struct Token {
std/zig/parser_test.zig
@@ -1,3 +1,10 @@
+test "zig fmt: character literal larger than u8" {
+    try testCanonical(
+        \\const x = '\U01f4a9';
+        \\
+    );
+}
+
 test "zig fmt: infix operator and then multiline string literal" {
     try testCanonical(
         \\const x = "" ++
std/zig/tokenizer.zig
@@ -236,8 +236,7 @@ pub const Tokenizer = struct {
         MultilineStringLiteralLine,
         CharLiteral,
         CharLiteralBackslash,
-        CharLiteralEscape1,
-        CharLiteralEscape2,
+        CharLiteralHexEscape,
         CharLiteralEnd,
         Backslash,
         Equal,
@@ -293,6 +292,8 @@ pub const Tokenizer = struct {
             .start = self.index,
             .end = undefined,
         };
+        var seen_escape_digits: usize = undefined;
+        var expected_escape_digits: usize = undefined;
         while (self.index < self.buffer.len) : (self.index += 1) {
             const c = self.buffer[self.index];
             switch (state) {
@@ -658,26 +659,31 @@ pub const Tokenizer = struct {
                         break;
                     },
                     'x' => {
-                        state = State.CharLiteralEscape1;
+                        state = State.CharLiteralHexEscape;
+                        seen_escape_digits = 0;
+                        expected_escape_digits = 2;
                     },
-                    else => {
-                        state = State.CharLiteralEnd;
+                    'u' => {
+                        state = State.CharLiteralHexEscape;
+                        seen_escape_digits = 0;
+                        expected_escape_digits = 4;
                     },
-                },
-
-                State.CharLiteralEscape1 => switch (c) {
-                    '0'...'9', 'a'...'z', 'A'...'F' => {
-                        state = State.CharLiteralEscape2;
+                    'U' => {
+                        state = State.CharLiteralHexEscape;
+                        seen_escape_digits = 0;
+                        expected_escape_digits = 6;
                     },
                     else => {
-                        result.id = Token.Id.Invalid;
-                        break;
+                        state = State.CharLiteralEnd;
                     },
                 },
 
-                State.CharLiteralEscape2 => switch (c) {
+                State.CharLiteralHexEscape => switch (c) {
                     '0'...'9', 'a'...'z', 'A'...'F' => {
-                        state = State.CharLiteralEnd;
+                        seen_escape_digits += 1;
+                        if (seen_escape_digits == expected_escape_digits) {
+                            state = State.CharLiteralEnd;
+                        }
                     },
                     else => {
                         result.id = Token.Id.Invalid;
@@ -1045,8 +1051,7 @@ pub const Tokenizer = struct {
                 State.Backslash,
                 State.CharLiteral,
                 State.CharLiteralBackslash,
-                State.CharLiteralEscape1,
-                State.CharLiteralEscape2,
+                State.CharLiteralHexEscape,
                 State.CharLiteralEnd,
                 State.StringLiteralBackslash,
                 State.LBracketStar,
test/stage1/behavior/misc.zig
@@ -699,3 +699,8 @@ test "thread local variable" {
     S.t += 1;
     expect(S.t == 1235);
 }
+
+test "unicode escape in character literal" {
+    var a: u24 = '\U01f4a9';
+    expect(a == 128169);
+}