Commit e0046b737e
Changed files (2)
src-self-hosted
test
src-self-hosted/c_tokenizer.zig
@@ -74,69 +74,191 @@ fn zigifyEscapeSequences(allocator: *std.mem.Allocator, tok: CToken) !CToken {
}
} else return tok;
var bytes = try allocator.alloc(u8, tok.bytes.len * 2);
- var escape = false;
+ var state: enum {
+ Start,
+ Escape,
+ Hex,
+ Octal,
+ HexZero,
+ OctalZero,
+ } = .Start;
var i: usize = 0;
+ var count: u8 = 0;
+ var num: u8 = 0;
for (tok.bytes) |c| {
- if (escape) {
- switch (c) {
- 'n', 'r', 't', '\\', '\'', '\"', 'x' => {
- bytes[i] = c;
- },
- 'a' => {
- bytes[i] = 'x';
- i += 1;
- bytes[i] = '0';
- i += 1;
- bytes[i] = '7';
- },
- 'b' => {
- bytes[i] = 'x';
- i += 1;
- bytes[i] = '0';
- i += 1;
- bytes[i] = '8';
- },
- 'f' => {
- bytes[i] = 'x';
- i += 1;
- bytes[i] = '0';
- i += 1;
- bytes[i] = 'C';
- },
- 'v' => {
- bytes[i] = 'x';
- i += 1;
- bytes[i] = '0';
- i += 1;
- bytes[i] = 'B';
- },
- '?' => {
- i -= 1;
- bytes[i] = '?';
- },
- 'u', 'U' => {
- // TODO unicode escape sequences
- return error.TokenizingFailed;
- },
- '0'...'7' => {
- // TODO octal escape sequences
- return error.TokenizingFailed;
- },
- else => {
- // unknown escape sequence
- return error.TokenizingFailed;
- },
- }
- i += 1;
- escape = false;
- } else {
- if (c == '\\') {
- escape = true;
- }
- bytes[i] = c;
- i += 1;
+ switch (state) {
+ .Escape => {
+ switch (c) {
+ 'n', 'r', 't', '\\', '\'', '\"' => {
+ bytes[i] = c;
+ },
+ '0' => {
+ state = .OctalZero;
+ bytes[i] = 'x';
+ },
+ '1'...'7' => {
+ count += 1;
+ num *= 8;
+ num += c - '0';
+ state = .Octal;
+ bytes[i] = 'x';
+ },
+ 'x' => {
+ state = .HexZero;
+ bytes[i] = c;
+ },
+ 'a' => {
+ bytes[i] = 'x';
+ i += 1;
+ bytes[i] = '0';
+ i += 1;
+ bytes[i] = '7';
+ },
+ 'b' => {
+ bytes[i] = 'x';
+ i += 1;
+ bytes[i] = '0';
+ i += 1;
+ bytes[i] = '8';
+ },
+ 'f' => {
+ bytes[i] = 'x';
+ i += 1;
+ bytes[i] = '0';
+ i += 1;
+ bytes[i] = 'C';
+ },
+ 'v' => {
+ bytes[i] = 'x';
+ i += 1;
+ bytes[i] = '0';
+ i += 1;
+ bytes[i] = 'B';
+ },
+ '?' => {
+ i -= 1;
+ bytes[i] = '?';
+ },
+ 'u', 'U' => {
+ // TODO unicode escape sequences
+ return error.TokenizingFailed;
+ },
+ else => {
+ // unknown escape sequence
+ return error.TokenizingFailed;
+ },
+ }
+ i += 1;
+ if (state == .Escape)
+ state = .Start;
+ },
+ .Start => {
+ if (c == '\\') {
+ state = .Escape;
+ }
+ bytes[i] = c;
+ i += 1;
+ },
+ .HexZero => {
+ switch (c) {
+ '0' => { continue; },
+ '1'...'9' => {
+ count += 1;
+ num *= 16;
+ num += c - '0';
+ },
+ 'a'...'f' => {
+ count += 1;
+ num *= 16;
+ num += c - 'a' + 10;
+ },
+ 'A'...'F' => {
+ count += 1;
+ num *= 16;
+ num += c - 'A' + 10;
+ },
+ else => {},
+ }
+ state = .Hex;
+ },
+ .Hex => {
+ switch (c) {
+ '0'...'9' => {
+ count += 1;
+ num *= 16;
+ num += c - '0';
+ if (count < 2)
+ continue;
+ },
+ 'a'...'f' => {
+ count += 1;
+ num *= 16;
+ num += c - 'a' + 10;
+ if (count < 2)
+ continue;
+ },
+ 'A'...'F' => {
+ count += 1;
+ num *= 16;
+ num += c - 'A' + 10;
+ if (count < 2)
+ continue;
+ },
+ else => {},
+ }
+ i += std.fmt.formatIntBuf(bytes[i..], num, 16, false, std.fmt.FormatOptions{.fill = '0', .width = 2});
+ switch (c) {
+ '\\' => state = .Escape,
+ '0'...'9', 'a'...'f','A'...'F' => state = .Start,
+ else => {
+ state = .Start;
+ bytes[i] = c;
+ i += 1;
+ },
+ }
+ count = 0;
+ num = 0;
+ },
+ .OctalZero => {
+ switch (c) {
+ '0' => { continue; },
+ '1'...'7' => {
+ count += 1;
+ num *= 8;
+ num += c - '0';
+ },
+ else => {},
+ }
+ state = .Octal;
+ },
+ .Octal => {
+ switch (c) {
+ '0'...'7' => {
+ count += 1;
+ num *= 8;
+ num += c - '0';
+ if (count < 3)
+ continue;
+ },
+ else => {},
+ }
+ i += std.fmt.formatIntBuf(bytes[i..], num, 16, false, std.fmt.FormatOptions{.fill = '0', .width = 2});
+ switch (c) {
+ '\\' => state = .Escape,
+ '0'...'7' => state = .Start,
+ else => {
+ state = .Start;
+ bytes[i] = c;
+ i += 1;
+ },
+ }
+ count = 0;
+ num = 0;
+ },
}
}
+ if (state == .Hex or state == .Octal)
+ i += std.fmt.formatIntBuf(bytes[i..], num, 16, false, std.fmt.FormatOptions{.fill = '0', .width = 2});
return CToken{
.id = tok.id,
.bytes = bytes[0..i],
@@ -666,3 +788,25 @@ test "tokenize macro" {
expect(it.next() == null);
tl.shrink(0);
}
+
+test "escape sequences" { // Checks how zigifyEscapeSequences rewrites C hex and octal escapes in .StrLit tokens.
+ var buf: [1024]u8 = undefined; // backing storage for the fixed-buffer allocator below
+ var alloc = std.heap.FixedBufferAllocator.init(buf[0..]);
+ const a = &alloc.allocator; // the same allocator is passed to every call in this test
+ expect(std.mem.eql(u8, (try zigifyEscapeSequences(a, .{ // hex escape with leading zeros: expected to shrink to two hex digits
+ .id = .StrLit,
+ .bytes = "\\x0077",
+ })).bytes, "\\x77"));
+ expect(std.mem.eql(u8, (try zigifyEscapeSequences(a, .{ // octal escape with leading zeros: octal 245 is expected as hex a5
+ .id = .StrLit,
+ .bytes = "\\00245",
+ })).bytes, "\\xa5"));
+ expect(std.mem.eql(u8, (try zigifyEscapeSequences(a, .{ // hex escape followed by text: the text after the escape is expected to pass through unchanged
+ .id = .StrLit,
+ .bytes = "\\x0077abc",
+ })).bytes, "\\x77abc"));
+ expect(std.mem.eql(u8, (try zigifyEscapeSequences(a, .{ // octal escape followed by text: octal 045 is expected as hex 25, text kept
+ .id = .StrLit,
+ .bytes = "\\045abc",
+ })).bytes, "\\x25abc"));
+}
test/translate_c.zig
@@ -1089,13 +1089,16 @@ pub fn addCases(cases: *tests.TranslateCContext) void {
\\}
});
- cases.add_2("macro escape sequences",
+ cases.add_2("macro defines string literal with hex",
\\#define FOO "aoeu\xab derp"
- \\#define FOO2 "aoeu\a derp"
+ \\#define FOO2 "aoeu\x0007a derp"
+ \\#define FOO_CHAR '\xfF'
, &[_][]const u8{
\\pub const FOO = "aoeu\xab derp";
,
- \\pub const FOO2 = "aoeu\x07 derp";
+ \\pub const FOO2 = "aoeu\x7a derp";
+ ,
+ \\pub const FOO_CHAR = '\xff';
});
cases.add_2("variable aliasing",
@@ -2157,30 +2160,16 @@ pub fn addCases(cases: *tests.TranslateCContext) void {
\\}
});
- /////////////// Cases for only stage1 which are TODO items for stage2 ////////////////
-
- cases.add("macro defines string literal with hex",
- \\#define FOO "aoeu\xab derp"
- \\#define FOO2 "aoeu\x0007a derp"
- \\#define FOO_CHAR '\xfF'
- , &[_][]const u8{
- \\pub const FOO = "aoeu\xab derp";
- ,
- \\pub const FOO2 = "aoeuz derp";
- ,
- \\pub const FOO_CHAR = 255;
- });
-
- cases.add("macro defines string literal with octal",
+ cases.add_2("macro defines string literal with octal",
\\#define FOO "aoeu\023 derp"
\\#define FOO2 "aoeu\0234 derp"
\\#define FOO_CHAR '\077'
, &[_][]const u8{
\\pub const FOO = "aoeu\x13 derp";
,
- \\pub const FOO2 = "aoeu\x134 derp";
+ \\pub const FOO2 = "aoeu\x9c derp";
,
- \\pub const FOO_CHAR = 63;
+ \\pub const FOO_CHAR = '\x3f';
});
/////////////// Cases for only stage1 because stage2 behavior is better ////////////////
@@ -3111,4 +3100,28 @@ pub fn addCases(cases: *tests.TranslateCContext) void {
\\ _ = baz.?();
\\}
});
+
+ cases.add("macro defines string literal with hex",
+ \\#define FOO "aoeu\xab derp"
+ \\#define FOO2 "aoeu\x0007a derp"
+ \\#define FOO_CHAR '\xfF'
+ , &[_][]const u8{
+ \\pub const FOO = "aoeu\xab derp";
+ ,
+ \\pub const FOO2 = "aoeuz derp";
+ ,
+ \\pub const FOO_CHAR = 255;
+ });
+
+ cases.add("macro defines string literal with octal",
+ \\#define FOO "aoeu\023 derp"
+ \\#define FOO2 "aoeu\0234 derp"
+ \\#define FOO_CHAR '\077'
+ , &[_][]const u8{
+ \\pub const FOO = "aoeu\x13 derp";
+ ,
+ \\pub const FOO2 = "aoeu\x134 derp";
+ ,
+ \\pub const FOO_CHAR = 63;
+ });
}