Commit 9ccd0ba961
Changed files (4)
doc/langref.md
@@ -272,10 +272,26 @@ Literal Example Characters Escapes Null Term Type
Byte 'H' All ASCII Byte No u8
UTF-8 Bytes "hello" All Unicode Byte & Unicode No [5]u8
UTF-8 C string c"hello" All Unicode Byte & Unicode Yes &const u8
-UTF-8 Raw String r"A(hello)A" All Unicode None No [5]u8
-UTF-8 Raw C String rc"A(hello)A" All Unicode None Yes &const u8
+UTF-8 Raw String r"X(hello)X" All Unicode None No [5]u8
+UTF-8 Raw C String rc"X(hello)X" All Unicode None Yes &const u8
```
+### Escapes
+
+ Escape | Name
+----------|-------------------------------------------------------------------
+ \n | Newline
+ \r | Carriage Return
+ \t | Tab
+ \\ | Backslash
+ \' | Single Quote
+ \" | Double Quote
+ \xNN | hexadecimal 8-bit character code (2 digits)
+ \uNNNN | hexadecimal 16-bit Unicode code point, UTF-8 encoded (4 digits)
+ \UNNNNNN | hexadecimal 24-bit Unicode code point, UTF-8 encoded (6 digits)
+
+Note that the maximum valid Unicode code point is 0x10ffff.
+
##### Raw Strings
Raw string literals have no escapes and can span across multiple lines. To
@@ -283,25 +299,6 @@ start a raw string, use 'r"' or 'rc"' followed by unique bytes followed by '('.
To end a raw string, use ')' followed by the same unique bytes, followed by '"'.
-```
-Escape Name
-
-\xNN hexadecimal 8-bit character code (exactly 2 digits)
-\n Newline
-\r Carriage return
-\t Tab
-\\ Backslash
-\0 Null
-\' Single quote
-\" Double quote
-```
-
-### Unicode Escapes
-
- Escape | Name
-------------|-----------------------------------------------
- \u{NNNNNN} | hexadecimal 24-bit Unicode character code (up to 6 digits)
-
#### Numeric Literals
```
src/parser.cpp
@@ -219,7 +219,7 @@ static uint8_t parse_char_literal(ParseContext *pc, Token *token) {
return return_value;
}
-static int get_hex_digit(uint8_t c) {
+static uint32_t get_hex_digit(uint8_t c) {
switch (c) {
case '0': return 0;
case '1': return 1;
@@ -251,7 +251,7 @@ static int get_hex_digit(uint8_t c) {
case 'F':
return 15;
default:
- return -1;
+ return UINT32_MAX;
}
}
@@ -279,13 +279,17 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool
StateEscape,
StateHex1,
StateHex2,
+ StateUnicode,
};
buf_resize(buf, 0);
+ int unicode_index;
+ int unicode_end;
+
State state = StatePre;
SrcPos pos = {token->start_line, token->start_column};
- int hex_value = 0;
+ uint32_t hex_value = 0;
for (int i = token->start_pos; i < token->end_pos - 1; i += 1) {
uint8_t c = *((uint8_t*)buf_ptr(pc->buf) + i);
@@ -348,17 +352,34 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool
if (offset_map) offset_map->append(pos);
state = StateStart;
break;
+ case '\'':
+ buf_append_char(buf, '\'');
+ if (offset_map) offset_map->append(pos);
+ state = StateStart;
+ break;
case 'x':
state = StateHex1;
break;
+ case 'u':
+ state = StateUnicode;
+ unicode_index = 0;
+ unicode_end = 4;
+ hex_value = 0;
+ break;
+ case 'U':
+ state = StateUnicode;
+ unicode_index = 0;
+ unicode_end = 6;
+ hex_value = 0;
+ break;
default:
ast_error(pc, token, "invalid escape character");
}
break;
case StateHex1:
{
- int hex_digit = get_hex_digit(c);
- if (hex_digit == -1) {
+ uint32_t hex_digit = get_hex_digit(c);
+ if (hex_digit == UINT32_MAX) {
ast_error(pc, token, "invalid hex digit: '%c'", c);
}
hex_value = hex_digit * 16;
@@ -367,8 +388,8 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool
}
case StateHex2:
{
- int hex_digit = get_hex_digit(c);
- if (hex_digit == -1) {
+ uint32_t hex_digit = get_hex_digit(c);
+ if (hex_digit == UINT32_MAX) {
ast_error(pc, token, "invalid hex digit: '%c'", c);
}
hex_value += hex_digit;
@@ -377,6 +398,47 @@ static void parse_string_literal(ParseContext *pc, Token *token, Buf *buf, bool
state = StateStart;
break;
}
+ case StateUnicode:
+ {
+ uint32_t hex_digit = get_hex_digit(c);
+ if (hex_digit == UINT32_MAX) {
+ ast_error(pc, token, "invalid hex digit: '%c'", c);
+ }
+ hex_value *= 16;
+ hex_value += hex_digit;
+ unicode_index += 1;
+ if (unicode_index >= unicode_end) {
+ if (hex_value <= 0x7f) {
+ // 00000000 00000000 00000000 0xxxxxxx
+ buf_append_char(buf, hex_value);
+ } else if (hex_value <= 0x7ff) {
+ // 00000000 00000000 00000xxx xx000000
+ buf_append_char(buf, (unsigned char)(0xc0 | (hex_value >> 6)));
+ // 00000000 00000000 00000000 00xxxxxx
+ buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f)));
+ } else if (hex_value <= 0xffff) {
+ // 00000000 00000000 xxxx0000 00000000
+ buf_append_char(buf, (unsigned char)(0xe0 | (hex_value >> 12)));
+ // 00000000 00000000 0000xxxx xx000000
+ buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 6) & 0x3f)));
+ // 00000000 00000000 00000000 00xxxxxx
+ buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f)));
+ } else if (hex_value <= 0x10ffff) {
+ // 00000000 000xxx00 00000000 00000000
+ buf_append_char(buf, (unsigned char)(0xf0 | (hex_value >> 18)));
+ // 00000000 000000xx xxxx0000 00000000
+ buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 12) & 0x3f)));
+ // 00000000 00000000 0000xxxx xx000000
+ buf_append_char(buf, (unsigned char)(0x80 | ((hex_value >> 6) & 0x3f)));
+ // 00000000 00000000 00000000 00xxxxxx
+ buf_append_char(buf, (unsigned char)(0x80 | (hex_value & 0x3f)));
+ } else {
+ ast_error(pc, token, "unicode value out of range: %x", hex_value);
+ }
+ state = StateStart;
+ }
+ break;
+ }
}
if (c == '\n') {
pos.line += 1;
src/tokenizer.cpp
@@ -103,6 +103,21 @@
ALPHA: \
case '_'
+#define HEX_DIGIT \
+ 'a': \
+ case 'b': \
+ case 'c': \
+ case 'd': \
+ case 'e': \
+ case 'f': \
+ case 'A': \
+ case 'B': \
+ case 'C': \
+ case 'D': \
+ case 'E': \
+ case 'F': \
+ case DIGIT // label list for hex digits, written to follow a `case` keyword (used as `case HEX_DIGIT:`); DIGIT is defined outside this hunk, presumably the 0-9 labels
+
const char * zig_keywords[] = {
"true", "false", "null", "fn", "return", "var", "const", "extern",
"pub", "export", "use", "if", "else", "goto", "asm",
@@ -132,11 +147,11 @@ enum TokenizeState {
TokenizeStateFloatExponentUnsigned, // "123.456e", "123e", "0x123p"
TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5"
TokenizeStateString,
+ TokenizeStateStringEscape,
TokenizeStateRawString,
TokenizeStateRawStringContents,
TokenizeStateRawStringMaybeEnd,
TokenizeStateCharLiteral,
- TokenizeStateCharLiteralEscape,
TokenizeStateCharLiteralEnd,
TokenizeStateSawStar,
TokenizeStateSawSlash,
@@ -162,6 +177,7 @@ enum TokenizeState {
TokenizeStateSawDotDot,
TokenizeStateSawQuestionMark,
TokenizeStateSawAtSign,
+ TokenizeStateHex,
TokenizeStateError,
};
@@ -179,6 +195,7 @@ struct Tokenize {
int raw_string_id_start;
int raw_string_id_end;
int raw_string_id_cmp_pos;
+ int hex_chars_left;
};
__attribute__ ((format (printf, 2, 3)))
@@ -921,10 +938,63 @@ void tokenize(Buf *buf, Tokenization *out) {
case '\n':
tokenize_error(&t, "use raw string for multiline string literal");
break;
+ case '\\':
+ t.state = TokenizeStateStringEscape;
+ break;
default:
break;
}
break;
+ case TokenizeStateStringEscape:
+ switch (c) {
+ case 'x':
+ t.state = TokenizeStateHex;
+ t.hex_chars_left = 2;
+ break;
+ case 'u':
+ t.state = TokenizeStateHex;
+ t.hex_chars_left = 4;
+ break;
+ case 'U':
+ t.state = TokenizeStateHex;
+ t.hex_chars_left = 6;
+ break;
+ case 'n':
+ case 'r':
+ case '\\':
+ case 't':
+ case '\'':
+ case '"':
+ if (t.cur_tok->id == TokenIdCharLiteral) {
+ t.state = TokenizeStateCharLiteralEnd;
+ } else if (t.cur_tok->id == TokenIdStringLiteral) {
+ t.state = TokenizeStateString;
+ } else {
+ zig_unreachable();
+ }
+ break;
+ default:
+ tokenize_error(&t, "invalid character: '%c'", c);
+ }
+ break;
+ case TokenizeStateHex:
+ switch (c) {
+ case HEX_DIGIT:
+ t.hex_chars_left -= 1;
+ if (t.hex_chars_left == 0) {
+ if (t.cur_tok->id == TokenIdCharLiteral) {
+ t.state = TokenizeStateCharLiteralEnd;
+ } else if (t.cur_tok->id == TokenIdStringLiteral) {
+ t.state = TokenizeStateString;
+ } else {
+ zig_unreachable();
+ }
+ }
+ break;
+ default:
+ tokenize_error(&t, "invalid character: '%c'", c);
+ }
+ break;
case TokenizeStateRawString:
if (c == '(') {
t.raw_string_id_end = t.pos;
@@ -963,16 +1033,13 @@ void tokenize(Buf *buf, Tokenization *out) {
t.state = TokenizeStateStart;
break;
case '\\':
- t.state = TokenizeStateCharLiteralEscape;
+ t.state = TokenizeStateStringEscape;
break;
default:
t.state = TokenizeStateCharLiteralEnd;
break;
}
break;
- case TokenizeStateCharLiteralEscape:
- t.state = TokenizeStateCharLiteralEnd;
- break;
case TokenizeStateCharLiteralEnd:
switch (c) {
case '\'':
@@ -1136,13 +1203,22 @@ void tokenize(Buf *buf, Tokenization *out) {
case TokenizeStateString:
tokenize_error(&t, "unterminated string");
break;
+ case TokenizeStateStringEscape:
+ case TokenizeStateHex:
+ if (t.cur_tok->id == TokenIdStringLiteral) {
+ tokenize_error(&t, "unterminated string");
+ } else if (t.cur_tok->id == TokenIdCharLiteral) {
+ tokenize_error(&t, "unterminated character literal");
+ } else {
+ zig_unreachable();
+ }
+ break;
case TokenizeStateRawString:
case TokenizeStateRawStringContents:
case TokenizeStateRawStringMaybeEnd:
tokenize_error(&t, "unterminated raw string");
break;
case TokenizeStateCharLiteral:
- case TokenizeStateCharLiteralEscape:
case TokenizeStateCharLiteralEnd:
tokenize_error(&t, "unterminated character literal");
break;
test/self_hosted.zig
@@ -1398,3 +1398,14 @@ fn test_take_address_of_parameter_noeval(f: f32) {
fn array_mult_operator() {
assert(str.eql("ab" ** 5, "ababababab"));
}
+
+#attribute("test")
+fn string_escapes() { // checks each string escape against the same bytes spelled with \x hex escapes
+ assert(str.eql("\"", "\x22"));
+ assert(str.eql("\'", "\x27"));
+ assert(str.eql("\n", "\x0a"));
+ assert(str.eql("\r", "\x0d"));
+ assert(str.eql("\t", "\x09"));
+ assert(str.eql("\\", "\x5c"));
+ assert(str.eql("\u1234\u0069", "\xe1\x88\xb4\x69")); // U+1234 is three UTF-8 bytes (e1 88 b4); U+0069 is the single byte 0x69
+}