Commit e144ddab24
Changed files (7)
doc
vim
syntax
doc/vim/syntax/zig.vim
@@ -51,7 +51,7 @@ syn match zigEscape display contained /\\\([nrt0\\'"]\|x\x\{2}\)/
syn match zigEscapeUnicode display contained /\\\(u\x\{4}\|U\x\{8}\)/
syn match zigEscapeUnicode display contained /\\u{\x\{1,6}}/
syn match zigStringContinuation display contained /\\\n\s*/
-syn region zigString start=+c\?"+ skip=+\\\\\|\\"+ end=+"+ contains=zigEscape,zigEscapeUnicode,zigEscapeError,zigStringContinuation,@Spell
+syn region zigString start=+r\?c\?"+ skip=+\\\\\|\\"+ end=+"+ contains=zigEscape,zigEscapeUnicode,zigEscapeError,zigStringContinuation,@Spell
syn region zigString start='b\?r\z(#*\)"' end='"\z1' contains=@Spell
let b:current_syntax = "zig"
doc/langref.md
@@ -267,13 +267,22 @@ from codegen.
#### Character and String Literals
```
-Literal Example Characters Escapes Null Term Type
+Literal Example Characters Escapes Null Term Type
-Byte 'H' All ASCII Byte No u8
-UTF-8 Bytes "hello" All Unicode Byte & Unicode No [5]u8
-UTF-8 C string c"hello" All Unicode Byte & Unicode Yes &const u8
+Byte 'H' All ASCII Byte No u8
+UTF-8 Bytes "hello" All Unicode Byte & Unicode No [5]u8
+UTF-8 C string c"hello" All Unicode Byte & Unicode Yes &const u8
+UTF-8 Raw String r"A(hello)A" All Unicode None No [5]u8
+UTF-8 Raw C String rc"A(hello)A" All Unicode None Yes &const u8
```
+##### Raw Strings
+
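+Raw string literals have no escapes and can span multiple lines. To start a
+raw string, use 'r"' or 'rc"' followed by a delimiter of your choice (any
+bytes other than '(', possibly none) followed by '('. To end a raw string, use
+')' followed by the same delimiter, followed by '"'. Choose a delimiter such
+that this closing sequence does not occur inside the string contents.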
+
+
```
Escape Name
src/parser.cpp
@@ -226,6 +226,16 @@ static uint8_t parse_char_literal(ParseContext *pc, Token *token) {
static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool *out_c_str,
ZigList<SrcPos> *offset_map)
{
+ if (token->raw_string_start > 0) { // raw string: begin_token() zeroes this field, the tokenizer sets it to the byte after '('
+ uint8_t c1 = *((uint8_t*)buf_ptr(pc->buf) + token->start_pos); // first byte of the token, expected to be 'r'
+ uint8_t c2 = *((uint8_t*)buf_ptr(pc->buf) + token->start_pos + 1); // second byte: 'c' for the rc" prefix, otherwise the opening quote
+ assert(c1 == 'r');
+ *out_c_str = (c2 == 'c'); // rc"..." denotes a raw C string
+ const char *str = buf_ptr(pc->buf) + token->raw_string_start;
+ buf_init_from_mem(buf, str, token->raw_string_end - token->raw_string_start); // bytes between '(' and the closing ')' are taken as-is
+ return; // skip the escape-sequence handling that follows for ordinary strings
+ }
+
// skip the double quotes at beginning and end
// convert escape sequences
// detect c string literal
src/tokenizer.cpp
@@ -30,7 +30,7 @@
'0': \
case DIGIT_NON_ZERO
-#define ALPHA_EXCEPT_C \
+#define ALPHA_EXCEPT_CR \
'a': \
case 'b': \
/*case 'c':*/ \
@@ -48,7 +48,7 @@
case 'o': \
case 'p': \
case 'q': \
- case 'r': \
+ /*case 'r':*/ \
case 's': \
case 't': \
case 'u': \
@@ -85,11 +85,17 @@
case 'Z'
#define ALPHA \
- ALPHA_EXCEPT_C: \
- case 'c'
+ ALPHA_EXCEPT_CR: \
+ case 'c': \
+ case 'r'
#define SYMBOL_CHAR \
- ALPHA: \
+ SYMBOL_CHAR_EXCEPT_C: \
+ case 'c'
+
+#define SYMBOL_CHAR_EXCEPT_C \
+ ALPHA_EXCEPT_CR: \
+ case 'r': \
case DIGIT: \
case '_'
@@ -118,12 +124,17 @@ enum TokenizeState {
TokenizeStateStart,
TokenizeStateSymbol,
TokenizeStateSymbolFirst,
+ TokenizeStateSymbolFirstRaw,
+ TokenizeStateFirstR,
TokenizeStateZero, // "0", which might lead to "0x"
TokenizeStateNumber, // "123", "0x123"
TokenizeStateFloatFraction, // "123.456", "0x123.456"
TokenizeStateFloatExponentUnsigned, // "123.456e", "123e", "0x123p"
TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5"
TokenizeStateString,
+ TokenizeStateRawString,
+ TokenizeStateRawStringContents,
+ TokenizeStateRawStringMaybeEnd,
TokenizeStateCharLiteral,
TokenizeStateSawStar,
TokenizeStateSawSlash,
@@ -162,6 +173,9 @@ struct Tokenize {
Token *cur_tok;
int multi_line_comment_count;
Tokenization *out;
+ int raw_string_id_start;
+ int raw_string_id_end;
+ int raw_string_id_cmp_pos;
};
__attribute__ ((format (printf, 2, 3)))
@@ -193,6 +207,8 @@ static void begin_token(Tokenize *t, TokenId id) {
token->radix = 0;
token->decimal_point_pos = 0;
token->exponent_marker_pos = 0;
+ token->raw_string_start = 0;
+ token->raw_string_end = 0;
t->cur_tok = token;
}
@@ -324,7 +340,11 @@ void tokenize(Buf *buf, Tokenization *out) {
t.state = TokenizeStateSymbolFirst;
begin_token(&t, TokenIdSymbol);
break;
- case ALPHA_EXCEPT_C:
+ case 'r':
+ t.state = TokenizeStateFirstR;
+ begin_token(&t, TokenIdSymbol);
+ break;
+ case ALPHA_EXCEPT_CR:
case '_':
t.state = TokenizeStateSymbol;
begin_token(&t, TokenIdSymbol);
@@ -821,6 +841,43 @@ void tokenize(Buf *buf, Tokenization *out) {
continue;
}
break;
+ case TokenizeStateSymbolFirstRaw: // entered from TokenizeStateFirstR after a 'c', i.e. the token so far is "rc"
+ switch (c) {
+ case '"': // rc" opens a raw C string; the token was begun as a symbol, so retag it
+ t.cur_tok->id = TokenIdStringLiteral;
+ t.state = TokenizeStateRawString;
+ t.raw_string_id_start = t.pos + 1; // the delimiter starts right after the quote
+ break;
+ case SYMBOL_CHAR: // "rc" followed by more symbol characters is an ordinary symbol
+ t.state = TokenizeStateSymbol;
+ break;
+ default: // anything else ends the symbol "rc"
+ t.pos -= 1; // NOTE(review): presumably rewinds so the Start state re-reads this character -- confirm against the enclosing loop
+ end_token(&t);
+ t.state = TokenizeStateStart;
+ continue;
+ }
+ break;
+ case TokenizeStateFirstR: // entered from the Start state on 'r'; the token so far is just "r"
+ switch (c) {
+ case '"': // r" opens a raw string; the token was begun as a symbol, so retag it
+ t.cur_tok->id = TokenIdStringLiteral;
+ t.state = TokenizeStateRawString;
+ t.raw_string_id_start = t.pos + 1; // the delimiter starts right after the quote
+ break;
+ case 'c': // "rc" may still turn into a raw C string if a quote follows
+ t.state = TokenizeStateSymbolFirstRaw;
+ break;
+ case SYMBOL_CHAR_EXCEPT_C: // any other symbol character makes this an ordinary symbol
+ t.state = TokenizeStateSymbol;
+ break;
+ default: // anything else ends the one-character symbol "r"
+ t.pos -= 1; // NOTE(review): presumably rewinds so the Start state re-reads this character -- confirm against the enclosing loop
+ end_token(&t);
+ t.state = TokenizeStateStart;
+ continue;
+ }
+ break;
case TokenizeStateSymbol:
switch (c) {
case SYMBOL_CHAR:
@@ -838,10 +895,44 @@ void tokenize(Buf *buf, Tokenization *out) {
end_token(&t);
t.state = TokenizeStateStart;
break;
+ case '\n':
+ tokenize_error(&t, "use raw string for multiline string literal");
+ break;
default:
break;
}
break;
+ case TokenizeStateRawString: // scanning the delimiter: every byte up to the first '(' belongs to it
+ if (c == '(') {
+ t.raw_string_id_end = t.pos; // one past the last delimiter byte (equals raw_string_id_start for an empty delimiter)
+ t.cur_tok->raw_string_start = t.pos + 1; // contents begin right after '('
+ t.state = TokenizeStateRawStringContents;
+ }
+ break;
+ case TokenizeStateRawStringContents: // inside the raw string body; only ')' is of interest
+ if (c == ')') { // possible start of the terminator
+ t.state = TokenizeStateRawStringMaybeEnd;
+ t.raw_string_id_cmp_pos = t.raw_string_id_start; // begin matching the delimiter from its first byte
+ t.cur_tok->raw_string_end = t.pos; // tentative end of contents (exclusive), kept if the terminator matches
+ }
+ break;
+ case TokenizeStateRawStringMaybeEnd:
+     // A ')' has been seen inside a raw string. The bytes that follow are
+     // compared against the delimiter stored at
+     // [raw_string_id_start, raw_string_id_end) in the source buffer; the
+     // string ends only when the whole delimiter matched and a '"' follows.
+     //
+     // The comparison is bounded by raw_string_id_end on purpose: the byte
+     // at raw_string_id_end is the opening '(' and the bytes after it are
+     // the string contents, so comparing past the delimiter would let
+     // input such as r"A(x)A(x" be accepted as a terminated string.
+     if (t.raw_string_id_cmp_pos >= t.raw_string_id_end && c == '"') {
+         // full terminator: ')' + delimiter + '"'
+         end_token(&t);
+         t.state = TokenizeStateStart;
+     } else if (t.raw_string_id_cmp_pos < t.raw_string_id_end &&
+                c == buf_ptr(t.buf)[t.raw_string_id_cmp_pos])
+     {
+         // next delimiter byte matched; keep going
+         t.raw_string_id_cmp_pos += 1;
+     } else if (c == ')') {
+         // mismatch, but this ')' may itself begin the real terminator
+         t.raw_string_id_cmp_pos = t.raw_string_id_start;
+         t.cur_tok->raw_string_end = t.pos;
+     } else {
+         // mismatch: the ')' was ordinary content
+         t.state = TokenizeStateRawStringContents;
+     }
+     break;
case TokenizeStateCharLiteral:
switch (c) {
case '\'':
@@ -1002,11 +1093,18 @@ void tokenize(Buf *buf, Tokenization *out) {
case TokenizeStateString:
tokenize_error(&t, "unterminated string");
break;
+ case TokenizeStateRawString:
+ case TokenizeStateRawStringContents:
+ case TokenizeStateRawStringMaybeEnd:
+ tokenize_error(&t, "unterminated raw string");
+ break;
case TokenizeStateCharLiteral:
tokenize_error(&t, "unterminated character literal");
break;
case TokenizeStateSymbol:
case TokenizeStateSymbolFirst:
+ case TokenizeStateSymbolFirstRaw:
+ case TokenizeStateFirstR:
case TokenizeStateZero:
case TokenizeStateNumber:
case TokenizeStateFloatFraction:
src/tokenizer.hpp
@@ -112,6 +112,10 @@ struct Token {
int radix; // if != 10, then skip the first 2 characters
int decimal_point_pos; // either exponent_marker_pos or the position of the '.'
int exponent_marker_pos; // either end_pos or the position of the 'e'/'p'
+
+ // for id == TokenIdStringLiteral
+ int raw_string_start;
+ int raw_string_end;
};
struct Tokenization {
test/run_tests.cpp
@@ -1770,6 +1770,12 @@ fn f() {
const std = @import("std");
}
)SOURCE", 1, ".tmp_source.zig:3:17: error: @import invalid inside function bodies");
+
+
+ add_compile_fail_case("normal string with newline", R"SOURCE(
+const foo = "a
+b";
+ )SOURCE", 1, ".tmp_source.zig:2:13: error: use raw string for multiline string literal");
}
//////////////////////////////////////////////////////////////////////////////
test/self_hosted.zig
@@ -495,7 +495,31 @@ fn count_trailing_zeroes() {
}
+#attribute("test")
+fn multiline_string() {
+ const s1 = r"AOEU(
+one
+two)
+three)AOEU";
+ const s2 = "\none\ntwo)\nthree"; // expected contents of both raw strings, written with escapes
+ const s3 = r"(
+one
+two)
+three)";
+ assert(str_eql(s1, s2)); // raw string with the delimiter "AOEU"; the inner ')' does not terminate it
+ assert(str_eql(s3, s2)); // raw string with an empty delimiter
+}
+
+
fn assert(b: bool) {
if (!b) unreachable{}
}
+
+fn str_eql(s1: []u8, s2: []u8) -> bool { // returns true when both slices have the same length and the same bytes
+ if (s1.len != s2.len) return false;
+ for (s1) |c, i| { // c is the element of s1 at index i
+ if (s2[i] != c) return false;
+ }
+ return true;
+}