Commit e144ddab24

Andrew Kelley <superjoe30@gmail.com>
2016-04-04 03:44:17
add multiline string literal
and make multiple lines in normal string literals an error
1 parent 5bae9ba
doc/vim/syntax/zig.vim
@@ -51,7 +51,7 @@ syn match     zigEscape        display contained /\\\([nrt0\\'"]\|x\x\{2}\)/
 syn match     zigEscapeUnicode display contained /\\\(u\x\{4}\|U\x\{8}\)/
 syn match     zigEscapeUnicode display contained /\\u{\x\{1,6}}/
 syn match     zigStringContinuation display contained /\\\n\s*/
-syn region    zigString      start=+c\?"+ skip=+\\\\\|\\"+ end=+"+ contains=zigEscape,zigEscapeUnicode,zigEscapeError,zigStringContinuation,@Spell
+syn region    zigString      start=+r\?c\?"+ skip=+\\\\\|\\"+ end=+"+ contains=zigEscape,zigEscapeUnicode,zigEscapeError,zigStringContinuation,@Spell
 syn region    zigString      start='b\?r\z(#*\)"' end='"\z1' contains=@Spell
 
 let b:current_syntax = "zig"
doc/langref.md
@@ -267,13 +267,22 @@ from codegen.
 
 #### Character and String Literals
 ```
-Literal         Example   Characters   Escapes         Null Term  Type
+Literal            Example       Characters   Escapes         Null Term  Type
 
-Byte            'H'       All ASCII    Byte            No         u8
-UTF-8 Bytes     "hello"   All Unicode  Byte & Unicode  No         [5]u8
-UTF-8 C string  c"hello"  All Unicode  Byte & Unicode  Yes        &const u8
+Byte               'H'           All ASCII    Byte            No         u8
+UTF-8 Bytes        "hello"       All Unicode  Byte & Unicode  No         [5]u8
+UTF-8 C string     c"hello"      All Unicode  Byte & Unicode  Yes        &const u8
+UTF-8 Raw String   r"A(hello)A"  All Unicode  None            No         [5]u8
+UTF-8 Raw C String rc"A(hello)A" All Unicode  None            Yes        &const u8
 ```
 
+##### Raw Strings
+
+Raw string literals have no escapes and can span multiple lines. To start a
+raw string, use 'r"' or 'rc"', then a delimiter of your choice (possibly empty),
+then '('. To end it, use ')' followed by the same delimiter, followed by '"'.
+
+
 ```
 Escape  Name
 
src/parser.cpp
@@ -226,6 +226,16 @@ static uint8_t parse_char_literal(ParseContext *pc, Token *token) {
 static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool *out_c_str,
         ZigList<SrcPos> *offset_map)
 {
+    if (token->raw_string_start > 0) {
+        uint8_t c1 = *((uint8_t*)buf_ptr(pc->buf) + token->start_pos);
+        uint8_t c2 = *((uint8_t*)buf_ptr(pc->buf) + token->start_pos + 1);
+        assert(c1 == 'r');
+        *out_c_str = (c2 == 'c');
+        const char *str = buf_ptr(pc->buf) + token->raw_string_start;
+        buf_init_from_mem(buf, str, token->raw_string_end - token->raw_string_start);
+        return;
+    }
+
     // skip the double quotes at beginning and end
     // convert escape sequences
     // detect c string literal
src/tokenizer.cpp
@@ -30,7 +30,7 @@
          '0': \
     case DIGIT_NON_ZERO
 
-#define ALPHA_EXCEPT_C \
+#define ALPHA_EXCEPT_CR \
          'a': \
     case 'b': \
   /*case 'c':*/ \
@@ -48,7 +48,7 @@
     case 'o': \
     case 'p': \
     case 'q': \
-    case 'r': \
+  /*case 'r':*/ \
     case 's': \
     case 't': \
     case 'u': \
@@ -85,11 +85,17 @@
     case 'Z'
 
 #define ALPHA \
-    ALPHA_EXCEPT_C: \
-    case 'c'
+    ALPHA_EXCEPT_CR: \
+    case 'c': \
+    case 'r'
 
 #define SYMBOL_CHAR \
-    ALPHA: \
+    SYMBOL_CHAR_EXCEPT_C: \
+    case 'c'
+
+#define SYMBOL_CHAR_EXCEPT_C \
+    ALPHA_EXCEPT_CR: \
+    case 'r': \
     case DIGIT: \
     case '_'
 
@@ -118,12 +124,17 @@ enum TokenizeState {
     TokenizeStateStart,
     TokenizeStateSymbol,
     TokenizeStateSymbolFirst,
+    TokenizeStateSymbolFirstRaw,
+    TokenizeStateFirstR,
     TokenizeStateZero, // "0", which might lead to "0x"
     TokenizeStateNumber, // "123", "0x123"
     TokenizeStateFloatFraction, // "123.456", "0x123.456"
     TokenizeStateFloatExponentUnsigned, // "123.456e", "123e", "0x123p"
     TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5"
     TokenizeStateString,
+    TokenizeStateRawString,
+    TokenizeStateRawStringContents,
+    TokenizeStateRawStringMaybeEnd,
     TokenizeStateCharLiteral,
     TokenizeStateSawStar,
     TokenizeStateSawSlash,
@@ -162,6 +173,9 @@ struct Tokenize {
     Token *cur_tok;
     int multi_line_comment_count;
     Tokenization *out;
+    int raw_string_id_start;
+    int raw_string_id_end;
+    int raw_string_id_cmp_pos;
 };
 
 __attribute__ ((format (printf, 2, 3)))
@@ -193,6 +207,8 @@ static void begin_token(Tokenize *t, TokenId id) {
     token->radix = 0;
     token->decimal_point_pos = 0;
     token->exponent_marker_pos = 0;
+    token->raw_string_start = 0;
+    token->raw_string_end = 0;
     t->cur_tok = token;
 }
 
@@ -324,7 +340,11 @@ void tokenize(Buf *buf, Tokenization *out) {
                         t.state = TokenizeStateSymbolFirst;
                         begin_token(&t, TokenIdSymbol);
                         break;
-                    case ALPHA_EXCEPT_C:
+                    case 'r':
+                        t.state = TokenizeStateFirstR;
+                        begin_token(&t, TokenIdSymbol);
+                        break;
+                    case ALPHA_EXCEPT_CR:
                     case '_':
                         t.state = TokenizeStateSymbol;
                         begin_token(&t, TokenIdSymbol);
@@ -821,6 +841,43 @@ void tokenize(Buf *buf, Tokenization *out) {
                         continue;
                 }
                 break;
+            case TokenizeStateSymbolFirstRaw: // entered from TokenizeStateFirstR on 'c', i.e. after "rc"
+                switch (c) {
+                    case '"': // rc" opens a raw string literal
+                        t.cur_tok->id = TokenIdStringLiteral;
+                        t.state = TokenizeStateRawString;
+                        t.raw_string_id_start = t.pos + 1; // delimiter begins right after the quote
+                        break;
+                    case SYMBOL_CHAR: // otherwise "rc..." is an ordinary symbol
+                        t.state = TokenizeStateSymbol;
+                        break;
+                    default: // any other character: step back one position and finish the token
+                        t.pos -= 1;
+                        end_token(&t);
+                        t.state = TokenizeStateStart;
+                        continue;
+                }
+                break;
+            case TokenizeStateFirstR: // entered from the start state on a leading 'r'
+                switch (c) {
+                    case '"': // r" opens a raw string literal
+                        t.cur_tok->id = TokenIdStringLiteral;
+                        t.state = TokenizeStateRawString;
+                        t.raw_string_id_start = t.pos + 1; // delimiter begins right after the quote
+                        break;
+                    case 'c': // "rc" may still become rc"..." or a plain symbol
+                        t.state = TokenizeStateSymbolFirstRaw;
+                        break;
+                    case SYMBOL_CHAR_EXCEPT_C: // "r" followed by another symbol character: ordinary symbol
+                        t.state = TokenizeStateSymbol;
+                        break;
+                    default: // any other character: step back one position and finish the token
+                        t.pos -= 1;
+                        end_token(&t);
+                        t.state = TokenizeStateStart;
+                        continue;
+                }
+                break;
             case TokenizeStateSymbol:
                 switch (c) {
                     case SYMBOL_CHAR:
@@ -838,10 +895,44 @@ void tokenize(Buf *buf, Tokenization *out) {
                         end_token(&t);
                         t.state = TokenizeStateStart;
                         break;
+                    case '\n':
+                        tokenize_error(&t, "use raw string for multiline string literal");
+                        break;
                     default:
                         break;
                 }
                 break;
+            case TokenizeStateRawString: // consuming delimiter bytes until the opening '('
+                if (c == '(') {
+                    t.raw_string_id_end = t.pos; // delimiter is [raw_string_id_start, raw_string_id_end)
+                    t.cur_tok->raw_string_start = t.pos + 1; // contents begin right after '('
+                    t.state = TokenizeStateRawStringContents;
+                }
+                break;
+            case TokenizeStateRawStringContents: // inside the contents; only ')' can start a terminator
+                if (c == ')') {
+                    t.state = TokenizeStateRawStringMaybeEnd;
+                    t.raw_string_id_cmp_pos = t.raw_string_id_start; // compare from the first delimiter byte
+                    t.cur_tok->raw_string_end = t.pos; // tentative end of contents (the ')' is excluded)
+                }
+                break;
+            case TokenizeStateRawStringMaybeEnd: // after ')': expect the delimiter bytes, then '"'
+                if (t.raw_string_id_cmp_pos >= t.raw_string_id_end && c == '"') { // full terminator seen
+                    end_token(&t);
+                    t.state = TokenizeStateStart;
+                } else if (t.raw_string_id_cmp_pos >= t.raw_string_id_end || // delimiter matched but no '"':
+                    c != buf_ptr(t.buf)[t.raw_string_id_cmp_pos]) // never compare past the delimiter
+                {
+                    if (c == ')') { // this ')' may itself start the real terminator
+                        t.raw_string_id_cmp_pos = t.raw_string_id_start;
+                        t.cur_tok->raw_string_end = t.pos;
+                    } else { // false alarm: the ')' was part of the contents
+                        t.state = TokenizeStateRawStringContents;
+                    }
+                } else { // one more delimiter byte matched
+                    t.raw_string_id_cmp_pos += 1;
+                }
+                break;
             case TokenizeStateCharLiteral:
                 switch (c) {
                     case '\'':
@@ -1002,11 +1093,18 @@ void tokenize(Buf *buf, Tokenization *out) {
         case TokenizeStateString:
             tokenize_error(&t, "unterminated string");
             break;
+        case TokenizeStateRawString:
+        case TokenizeStateRawStringContents:
+        case TokenizeStateRawStringMaybeEnd:
+            tokenize_error(&t, "unterminated raw string");
+            break;
         case TokenizeStateCharLiteral:
             tokenize_error(&t, "unterminated character literal");
             break;
         case TokenizeStateSymbol:
         case TokenizeStateSymbolFirst:
+        case TokenizeStateSymbolFirstRaw:
+        case TokenizeStateFirstR:
         case TokenizeStateZero:
         case TokenizeStateNumber:
         case TokenizeStateFloatFraction:
src/tokenizer.hpp
@@ -112,6 +112,10 @@ struct Token {
     int radix; // if != 10, then skip the first 2 characters
     int decimal_point_pos; // either exponent_marker_pos or the position of the '.'
     int exponent_marker_pos; // either end_pos or the position of the 'e'/'p'
+
+    // for id == TokenIdStringLiteral
+    int raw_string_start;
+    int raw_string_end;
 };
 
 struct Tokenization {
test/run_tests.cpp
@@ -1770,6 +1770,12 @@ fn f() {
     const std = @import("std");
 }
     )SOURCE", 1, ".tmp_source.zig:3:17: error: @import invalid inside function bodies");
+
+
+    add_compile_fail_case("normal string with newline", R"SOURCE(
+const foo = "a
+b";
+    )SOURCE", 1, ".tmp_source.zig:2:13: error: use raw string for multiline string literal");
 }
 
 //////////////////////////////////////////////////////////////////////////////
test/self_hosted.zig
@@ -495,7 +495,31 @@ fn count_trailing_zeroes() {
 }
 
 
+#attribute("test")
+fn multiline_string() { // raw literals with and without a delimiter must equal the escaped form
+    const s1 = r"AOEU(
+one
+two)
+three)AOEU";
+    const s2 = "\none\ntwo)\nthree"; // expected bytes; the ')' after "two" is content, not a terminator
+    const s3 = r"(
+one
+two)
+three)";
+    assert(str_eql(s1, s2)); // delimiter "AOEU"
+    assert(str_eql(s3, s2)); // empty delimiter
+}
+
+
 
 fn assert(b: bool) {
     if (!b) unreachable{}
 }
+
+fn str_eql(s1: []u8, s2: []u8) -> bool { // true when both slices have the same length and bytes
+    if (s1.len != s2.len) return false; // different lengths can never match
+    for (s1) |c, i| { // c is the byte of s1 at index i
+        if (s2[i] != c) return false; // first differing byte decides
+    }
+    return true;
+}