Commit 9ccd0ba961

Andrew Kelley <superjoe30@gmail.com>
2016-05-01 23:53:48
implement string escapes
1 parent 037283c
doc/langref.md
@@ -272,10 +272,26 @@ Literal            Example       Characters   Escapes         Null Term  Type
 Byte               'H'           All ASCII    Byte            No         u8
 UTF-8 Bytes        "hello"       All Unicode  Byte & Unicode  No         [5]u8
 UTF-8 C string     c"hello"      All Unicode  Byte & Unicode  Yes        &const u8
-UTF-8 Raw String   r"A(hello)A"  All Unicode  None            No         [5]u8
-UTF-8 Raw C String rc"A(hello)A" All Unicode  None            Yes        &const u8
+UTF-8 Raw String   r"X(hello)X"  All Unicode  None            No         [5]u8
+UTF-8 Raw C String rc"X(hello)X" All Unicode  None            Yes        &const u8
 ```
 
+### Escapes
+
+ Escape   | Name
+----------|-------------------------------------------------------------------
+ \n       | Newline
+ \r       | Carriage Return
+ \t       | Tab
+ \\       | Backslash
+ \'       | Single Quote
+ \"       | Double Quote
+ \xNN     | hexadecimal 8-bit character code (2 digits)
+ \uNNNN   | hexadecimal 16-bit Unicode code point, encoded as UTF-8 (4 digits)
+ \UNNNNNN | hexadecimal 24-bit Unicode code point, encoded as UTF-8 (6 digits)
+
+Note that the maximum valid Unicode code point is 0x10ffff.
+
 ##### Raw Strings
 
 Raw string literals have no escapes and can span across multiple lines. To
@@ -283,25 +299,6 @@ start a raw string, use 'r"' or 'rc"' followed by unique bytes followed by '('.
 To end a raw string, use ')' followed by the same unique bytes, followed by '"'.
 
 
-```
-Escape  Name
-
-\xNN    hexadecimal 8-bit character code (exactly 2 digits)
-\n      Newline
-\r      Carriage return
-\t      Tab
-\\      Backslash
-\0      Null
-\'      Single quote
-\"      Double quote
-```
-
-### Unicode Escapes
-
- Escape     | Name
-------------|-----------------------------------------------
- \u{NNNNNN} | hexadecimal 24-bit Unicode character code (up to 6 digits)
-
 #### Numeric Literals
 
 ```
src/parser.cpp
@@ -219,7 +219,7 @@ static uint8_t parse_char_literal(ParseContext *pc, Token *token) {
     return return_value;
 }
 
-static int get_hex_digit(uint8_t c) {
+static uint32_t get_hex_digit(uint8_t c) {
     switch (c) {
         case '0': return 0;
         case '1': return 1;
@@ -251,7 +251,7 @@ static int get_hex_digit(uint8_t c) {
         case 'F':
             return 15;
         default:
-            return -1;
+            return UINT32_MAX;
     }
 }
 
@@ -279,13 +279,17 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool
         StateEscape,
         StateHex1,
         StateHex2,
+        StateUnicode,
     };
 
     buf_resize(buf, 0);
 
+    int unicode_index;
+    int unicode_end;
+
     State state = StatePre;
     SrcPos pos = {token->start_line, token->start_column};
-    int hex_value = 0;
+    uint32_t hex_value = 0;
     for (int i = token->start_pos; i < token->end_pos - 1; i += 1) {
         uint8_t c = *((uint8_t*)buf_ptr(pc->buf) + i);
 
@@ -348,17 +352,34 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool
                         if (offset_map) offset_map->append(pos);
                         state = StateStart;
                         break;
+                    case '\'':
+                        buf_append_char(buf, '\'');
+                        if (offset_map) offset_map->append(pos);
+                        state = StateStart;
+                        break;
                     case 'x':
                         state = StateHex1;
                         break;
+                    case 'u':
+                        state = StateUnicode;
+                        unicode_index = 0;
+                        unicode_end = 4;
+                        hex_value = 0;
+                        break;
+                    case 'U':
+                        state = StateUnicode;
+                        unicode_index = 0;
+                        unicode_end = 6;
+                        hex_value = 0;
+                        break;
                     default:
                         ast_error(pc, token, "invalid escape character");
                 }
                 break;
             case StateHex1:
                 {
-                    int hex_digit = get_hex_digit(c);
-                    if (hex_digit == -1) {
+                    uint32_t hex_digit = get_hex_digit(c);
+                    if (hex_digit == UINT32_MAX) {
                         ast_error(pc, token, "invalid hex digit: '%c'", c);
                     }
                     hex_value = hex_digit * 16;
@@ -367,8 +388,8 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool
                 }
             case StateHex2:
                 {
-                    int hex_digit = get_hex_digit(c);
-                    if (hex_digit == -1) {
+                    uint32_t hex_digit = get_hex_digit(c);
+                    if (hex_digit == UINT32_MAX) {
                         ast_error(pc, token, "invalid hex digit: '%c'", c);
                     }
                     hex_value += hex_digit;
@@ -377,6 +398,47 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool
                     state = StateStart;
                     break;
                 }
+            case StateUnicode:
+                {
+                    uint32_t hex_digit = get_hex_digit(c);
+                    if (hex_digit == UINT32_MAX) {
+                        ast_error(pc, token, "invalid hex digit: '%c'", c);
+                    }
+                    hex_value *= 16;
+                    hex_value += hex_digit;
+                    unicode_index += 1;
+                    if (unicode_index >= unicode_end) {
+                        if (hex_value <= 0x7f) {
+                            // 00000000 00000000 00000000 0xxxxxxx
+                            buf_append_char(buf, hex_value);
+                        } else if (hex_value <= 0x7ff) {
+                            // 00000000 00000000 00000xxx xx000000
+                            buf_append_char(buf, (unsigned char)(0xc0 | (hex_value >> 6)));
+                            // 00000000 00000000 00000000 00xxxxxx
+                            buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f)));
+                        } else if (hex_value <= 0xffff) {
+                            // 00000000 00000000 xxxx0000 00000000
+                            buf_append_char(buf, (unsigned char)(0xe0 | (hex_value >> 12)));
+                            // 00000000 00000000 0000xxxx xx000000
+                            buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 6) & 0x3f)));
+                            // 00000000 00000000 00000000 00xxxxxx
+                            buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f)));
+                        } else if (hex_value <= 0x10ffff) {
+                            // 00000000 000xxx00 00000000 00000000
+                            buf_append_char(buf, (unsigned char)(0xf0 | (hex_value >> 18)));
+                            // 00000000 000000xx xxxx0000 00000000
+                            buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 12) & 0x3f)));
+                            // 00000000 00000000 0000xxxx xx000000
+                            buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 6) & 0x3f)));
+                            // 00000000 00000000 00000000 00xxxxxx
+                            buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f)));
+                        } else {
+                            ast_error(pc, token, "unicode value out of range: %x", hex_value);
+                        }
+                        state = StateStart;
+                    }
+                    break;
+                }
         }
         if (c == '\n') {
             pos.line += 1;
src/tokenizer.cpp
@@ -103,6 +103,21 @@
     ALPHA: \
     case '_'
 
+#define HEX_DIGIT \
+         'a': \
+    case 'b': \
+    case 'c': \
+    case 'd': \
+    case 'e': \
+    case 'f': \
+    case 'A': \
+    case 'B': \
+    case 'C': \
+    case 'D': \
+    case 'E': \
+    case 'F': \
+    case DIGIT
+
 const char * zig_keywords[] = {
     "true", "false", "null", "fn", "return", "var", "const", "extern",
     "pub", "export", "use", "if", "else", "goto", "asm",
@@ -132,11 +147,11 @@ enum TokenizeState {
     TokenizeStateFloatExponentUnsigned, // "123.456e", "123e", "0x123p"
     TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5"
     TokenizeStateString,
+    TokenizeStateStringEscape,
     TokenizeStateRawString,
     TokenizeStateRawStringContents,
     TokenizeStateRawStringMaybeEnd,
     TokenizeStateCharLiteral,
-    TokenizeStateCharLiteralEscape,
     TokenizeStateCharLiteralEnd,
     TokenizeStateSawStar,
     TokenizeStateSawSlash,
@@ -162,6 +177,7 @@ enum TokenizeState {
     TokenizeStateSawDotDot,
     TokenizeStateSawQuestionMark,
     TokenizeStateSawAtSign,
+    TokenizeStateHex,
     TokenizeStateError,
 };
 
@@ -179,6 +195,7 @@ struct Tokenize {
     int raw_string_id_start;
     int raw_string_id_end;
     int raw_string_id_cmp_pos;
+    int hex_chars_left;
 };
 
 __attribute__ ((format (printf, 2, 3)))
@@ -921,10 +938,63 @@ void tokenize(Buf *buf, Tokenization *out) {
                     case '\n':
                         tokenize_error(&t, "use raw string for multiline string literal");
                         break;
+                    case '\\':
+                        t.state = TokenizeStateStringEscape;
+                        break;
                     default:
                         break;
                 }
                 break;
+            case TokenizeStateStringEscape:
+                switch (c) {
+                    case 'x':
+                        t.state = TokenizeStateHex;
+                        t.hex_chars_left = 2;
+                        break;
+                    case 'u':
+                        t.state = TokenizeStateHex;
+                        t.hex_chars_left = 4;
+                        break;
+                    case 'U':
+                        t.state = TokenizeStateHex;
+                        t.hex_chars_left = 6;
+                        break;
+                    case 'n':
+                    case 'r':
+                    case '\\':
+                    case 't':
+                    case '\'':
+                    case '"':
+                        if (t.cur_tok->id == TokenIdCharLiteral) {
+                            t.state = TokenizeStateCharLiteralEnd;
+                        } else if (t.cur_tok->id == TokenIdStringLiteral) {
+                            t.state = TokenizeStateString;
+                        } else {
+                            zig_unreachable();
+                        }
+                        break;
+                    default:
+                        tokenize_error(&t, "invalid character: '%c'", c);
+                }
+                break;
+            case TokenizeStateHex:
+                switch (c) {
+                    case HEX_DIGIT:
+                        t.hex_chars_left -= 1;
+                        if (t.hex_chars_left == 0) {
+                            if (t.cur_tok->id == TokenIdCharLiteral) {
+                                t.state = TokenizeStateCharLiteralEnd;
+                            } else if (t.cur_tok->id == TokenIdStringLiteral) {
+                                t.state = TokenizeStateString;
+                            } else {
+                                zig_unreachable();
+                            }
+                        }
+                        break;
+                    default:
+                        tokenize_error(&t, "invalid character: '%c'", c);
+                }
+                break;
             case TokenizeStateRawString:
                 if (c == '(') {
                     t.raw_string_id_end = t.pos;
@@ -963,16 +1033,13 @@ void tokenize(Buf *buf, Tokenization *out) {
                         t.state = TokenizeStateStart;
                         break;
                     case '\\':
-                        t.state = TokenizeStateCharLiteralEscape;
+                        t.state = TokenizeStateStringEscape;
                         break;
                     default:
                         t.state = TokenizeStateCharLiteralEnd;
                         break;
                 }
                 break;
-            case TokenizeStateCharLiteralEscape:
-                t.state = TokenizeStateCharLiteralEnd;
-                break;
             case TokenizeStateCharLiteralEnd:
                 switch (c) {
                     case '\'':
@@ -1136,13 +1203,22 @@ void tokenize(Buf *buf, Tokenization *out) {
         case TokenizeStateString:
             tokenize_error(&t, "unterminated string");
             break;
+        case TokenizeStateStringEscape:
+        case TokenizeStateHex:
+            if (t.cur_tok->id == TokenIdStringLiteral) {
+                tokenize_error(&t, "unterminated string");
+            } else if (t.cur_tok->id == TokenIdCharLiteral) {
+                tokenize_error(&t, "unterminated character literal");
+            } else {
+                zig_unreachable();
+            }
+            break;
         case TokenizeStateRawString:
         case TokenizeStateRawStringContents:
         case TokenizeStateRawStringMaybeEnd:
             tokenize_error(&t, "unterminated raw string");
             break;
         case TokenizeStateCharLiteral:
-        case TokenizeStateCharLiteralEscape:
         case TokenizeStateCharLiteralEnd:
             tokenize_error(&t, "unterminated character literal");
             break;
test/self_hosted.zig
@@ -1398,3 +1398,14 @@ fn test_take_address_of_parameter_noeval(f: f32) {
 fn array_mult_operator() {
     assert(str.eql("ab" ** 5, "ababababab"));
 }
+
+#attribute("test")
+fn string_escapes() {
+    assert(str.eql("\"", "\x22"));
+    assert(str.eql("\'", "\x27"));
+    assert(str.eql("\n", "\x0a"));
+    assert(str.eql("\r", "\x0d"));
+    assert(str.eql("\t", "\x09"));
+    assert(str.eql("\\", "\x5c"));
+    assert(str.eql("\u1234\u0069", "\xe1\x88\xb4\x69"));
+}