Commit c3724a6e72

Vexu <git@vexu.eu>
2019-12-15 09:30:38
translate-c-2 c tokenizer
1 parent e57e360
Changed files (1)
src-self-hosted
src-self-hosted/c_tokenizer.zig
@@ -0,0 +1,458 @@
+const std = @import("std");
+
+/// Container type that `tokenizeCMacro` appends the scanned `CToken`s to.
+pub const TokenList = std.SegmentedList(CToken, 32);
+
+/// A single token produced while scanning the body of a C macro.
+pub const CToken = struct {
+    /// The kind of token.
+    id: Id,
+    /// Source text belonging to the token. The tokenizer leaves this empty
+    /// for punctuation and `Eof`; for string and character literals the
+    /// surrounding quotes are not included.
+    bytes: []const u8,
+    /// Integer-literal suffix. Defaults to `.None` so that tokens for which
+    /// the tokenizer never records a suffix (unsuffixed numbers and every
+    /// non-numeric token) never expose an undefined value.
+    num_lit_suffix: NumLitSuffix = .None,
+
+    /// Token kinds recognised by the macro tokenizer.
+    pub const Id = enum {
+        /// Character literal, e.g. `'a'`.
+        CharLit,
+        /// String literal, e.g. `"abc"`.
+        StrLit,
+        /// Integer literal (decimal, octal or hexadecimal).
+        NumLitInt,
+        /// Floating point literal.
+        NumLitFloat,
+        Identifier,
+        /// `-`
+        Minus,
+        /// `/`
+        Slash,
+        /// `(`
+        LParen,
+        /// `)`
+        RParen,
+        /// End of the macro text.
+        Eof,
+        /// `.`
+        Dot,
+        /// `*`
+        Asterisk,
+        /// `!`
+        Bang,
+        /// `~`
+        Tilde,
+        /// `<<`
+        Shl,
+        /// `<`
+        Lt,
+    };
+
+    /// Suffix attached to an integer literal.
+    pub const NumLitSuffix = enum {
+        /// No suffix.
+        None,
+        /// `l` / `L`
+        L,
+        /// `u` / `U`
+        U,
+        /// `l` and `u` combined, in either order.
+        LU,
+        /// `ll` / `LL`
+        LL,
+        /// `ll` and `u` combined, in either order.
+        LLU,
+    };
+};
+
+/// Tokenizes the NUL-terminated macro text at `chars`, appending every token
+/// to `tl`. The final token appended is always an `Eof` token.
+///
+/// Returns any error reported by `next` for malformed input, or the error
+/// from `tl.push` if a token cannot be stored.
+pub fn tokenizeCMacro(tl: *TokenList, chars: [*]const u8) !void {
+    // Offset into `chars` of the first byte that has not been consumed yet;
+    // `next` advances it past each token it returns.
+    var index: usize = 0;
+    while (true) {
+        const tok = try next(chars[index..], &index);
+        // `push` can fail; the failure must be propagated, not dropped.
+        try tl.push(tok);
+        if (tok.id == .Eof)
+            return;
+    }
+}
+
+/// Scans one token from the start of the NUL-terminated text at `chars`.
+///
+/// Leading blanks, `/* ... */` comments and backslash line continuations are
+/// skipped. An unescaped line ending, a `//` comment or the terminating NUL
+/// yields an `Eof` token.
+///
+/// `index.*` is increased by the number of bytes consumed: the skipped
+/// prefix plus the bytes that belong to the returned token. The character
+/// that merely terminates a token (for example the `(` in `foo(`) is not
+/// consumed, so it is seen again by the following call.
+///
+/// `bytes` of the returned token is a slice into `chars`:
+///  * identifiers and numbers: the token text, without any literal suffix,
+///  * string and character literals: the text between the quotes, with
+///    escape sequences left undecoded,
+///  * punctuation and `Eof`: empty.
+///
+/// Returns `error.TokenizingFailed` for input that cannot be tokenized, such
+/// as an unterminated literal or comment, or an unsupported character. In a
+/// character literal only single-character escapes such as `'\n'` are
+/// accepted.
+fn next(chars: [*]const u8, index: *usize) !CToken {
+    var state: enum {
+        Start,
+        GotLt,
+        ExpectChar,
+        CharEscape,
+        ExpectEndQuot,
+        OpenComment,
+        Comment,
+        CommentStar,
+        Backslash,
+        String,
+        StringEscape,
+        Identifier,
+        Decimal,
+        Octal,
+        GotZero,
+        Hex,
+        Float,
+        ExpSign,
+        FloatExp,
+        FloatExpFirst,
+        NumLitIntSuffixU,
+        NumLitIntSuffixL,
+        NumLitIntSuffixLL,
+        NumLitIntSuffixUL,
+    } = .Start;
+
+    var result = CToken{
+        .bytes = "",
+        .id = .Eof,
+        .num_lit_suffix = .None,
+    };
+    // Offset of the first byte of the token being scanned.
+    var begin_index: usize = 0;
+    // Offset one past the last digit of an integer literal; recorded when
+    // the first suffix character is seen so the suffix can be left out of
+    // `bytes` no matter how long it turns out to be.
+    var digits_end: usize = 0;
+    // Offset of the byte currently being examined. A branch that returns
+    // without bumping `i` leaves the current byte for the next call.
+    var i: usize = 0;
+    // Report the number of consumed bytes on every exit path.
+    defer index.* += i;
+
+    while (true) : (i += 1) {
+        const c = chars[i];
+        if (c == 0) {
+            // End of input: finish the token in progress or fail if it is
+            // incomplete. The NUL itself is never consumed.
+            switch (state) {
+                .Start, .GotLt => return result,
+                .Identifier,
+                .Decimal,
+                .Hex,
+                .Octal,
+                .GotZero,
+                .Float,
+                .FloatExp,
+                => {
+                    result.bytes = chars[begin_index..i];
+                    return result;
+                },
+                .NumLitIntSuffixU,
+                .NumLitIntSuffixL,
+                .NumLitIntSuffixUL,
+                .NumLitIntSuffixLL,
+                => {
+                    result.bytes = chars[begin_index..digits_end];
+                    return result;
+                },
+                .ExpectChar,
+                .CharEscape,
+                .ExpectEndQuot,
+                .OpenComment,
+                .Comment,
+                .CommentStar,
+                .Backslash,
+                .String,
+                .StringEscape,
+                .ExpSign,
+                .FloatExpFirst,
+                => return error.TokenizingFailed,
+            }
+        }
+        switch (state) {
+            .Start => switch (c) {
+                ' ', '\t', '\x0B', '\x0C' => {},
+                '\'' => {
+                    state = .ExpectChar;
+                    result.id = .CharLit;
+                    begin_index = i;
+                },
+                '\"' => {
+                    state = .String;
+                    result.id = .StrLit;
+                    begin_index = i;
+                },
+                '/' => {
+                    state = .OpenComment;
+                },
+                '\\' => {
+                    state = .Backslash;
+                },
+                // An unescaped line ending terminates the macro.
+                '\n', '\r' => return result,
+                'a'...'z', 'A'...'Z', '_' => {
+                    state = .Identifier;
+                    result.id = .Identifier;
+                    begin_index = i;
+                },
+                '1'...'9' => {
+                    state = .Decimal;
+                    result.id = .NumLitInt;
+                    begin_index = i;
+                },
+                '0' => {
+                    state = .GotZero;
+                    result.id = .NumLitInt;
+                    begin_index = i;
+                },
+                '.' => {
+                    result.id = .Dot;
+                    i += 1;
+                    return result;
+                },
+                '<' => {
+                    result.id = .Lt;
+                    state = .GotLt;
+                },
+                '(' => {
+                    result.id = .LParen;
+                    i += 1;
+                    return result;
+                },
+                ')' => {
+                    result.id = .RParen;
+                    i += 1;
+                    return result;
+                },
+                '*' => {
+                    result.id = .Asterisk;
+                    i += 1;
+                    return result;
+                },
+                '-' => {
+                    result.id = .Minus;
+                    i += 1;
+                    return result;
+                },
+                '!' => {
+                    result.id = .Bang;
+                    i += 1;
+                    return result;
+                },
+                '~' => {
+                    result.id = .Tilde;
+                    i += 1;
+                    return result;
+                },
+                else => return error.TokenizingFailed,
+            },
+            .GotLt => switch (c) {
+                '<' => {
+                    result.id = .Shl;
+                    i += 1;
+                    return result;
+                },
+                // A lone `<`; `c` belongs to the next token.
+                else => return result,
+            },
+            .Float => switch (c) {
+                '.', '0'...'9' => {},
+                'e', 'E' => {
+                    state = .ExpSign;
+                },
+                'f', 'F', 'l', 'L' => {
+                    // The suffix is consumed but kept out of `bytes`.
+                    result.bytes = chars[begin_index..i];
+                    i += 1;
+                    return result;
+                },
+                else => {
+                    result.bytes = chars[begin_index..i];
+                    return result;
+                },
+            },
+            .ExpSign => switch (c) {
+                '+', '-' => {
+                    state = .FloatExpFirst;
+                },
+                '0'...'9' => {
+                    state = .FloatExp;
+                },
+                else => return error.TokenizingFailed,
+            },
+            .FloatExpFirst => switch (c) {
+                '0'...'9' => {
+                    state = .FloatExp;
+                },
+                else => return error.TokenizingFailed,
+            },
+            .FloatExp => switch (c) {
+                '0'...'9' => {},
+                'f', 'F', 'l', 'L' => {
+                    result.bytes = chars[begin_index..i];
+                    i += 1;
+                    return result;
+                },
+                else => {
+                    result.bytes = chars[begin_index..i];
+                    return result;
+                },
+            },
+            .Decimal => switch (c) {
+                '0'...'9' => {},
+                '\'' => {},
+                'u', 'U' => {
+                    digits_end = i;
+                    state = .NumLitIntSuffixU;
+                    result.num_lit_suffix = .U;
+                },
+                'l', 'L' => {
+                    digits_end = i;
+                    state = .NumLitIntSuffixL;
+                    result.num_lit_suffix = .L;
+                },
+                '.' => {
+                    result.id = .NumLitFloat;
+                    state = .Float;
+                },
+                'e', 'E' => {
+                    result.id = .NumLitFloat;
+                    state = .ExpSign;
+                },
+                else => {
+                    result.bytes = chars[begin_index..i];
+                    return result;
+                },
+            },
+            .GotZero => switch (c) {
+                'x', 'X' => {
+                    state = .Hex;
+                },
+                '.' => {
+                    state = .Float;
+                    result.id = .NumLitFloat;
+                },
+                'e', 'E' => {
+                    result.id = .NumLitFloat;
+                    state = .ExpSign;
+                },
+                'u', 'U' => {
+                    digits_end = i;
+                    state = .NumLitIntSuffixU;
+                    result.num_lit_suffix = .U;
+                },
+                'l', 'L' => {
+                    digits_end = i;
+                    state = .NumLitIntSuffixL;
+                    result.num_lit_suffix = .L;
+                },
+                '0'...'7' => {
+                    state = .Octal;
+                },
+                '8', '9' => return error.TokenizingFailed,
+                else => {
+                    // A plain `0`; `c` belongs to the next token.
+                    result.bytes = chars[begin_index..i];
+                    return result;
+                },
+            },
+            .Octal => switch (c) {
+                '0'...'7' => {},
+                '8', '9' => return error.TokenizingFailed,
+                'u', 'U' => {
+                    digits_end = i;
+                    state = .NumLitIntSuffixU;
+                    result.num_lit_suffix = .U;
+                },
+                'l', 'L' => {
+                    digits_end = i;
+                    state = .NumLitIntSuffixL;
+                    result.num_lit_suffix = .L;
+                },
+                else => {
+                    result.bytes = chars[begin_index..i];
+                    return result;
+                },
+            },
+            .Hex => switch (c) {
+                '0'...'9', 'a'...'f', 'A'...'F' => {},
+                'p', 'P' => {
+                    // Binary exponent of a hexadecimal float.
+                    result.id = .NumLitFloat;
+                    state = .ExpSign;
+                },
+                'u', 'U' => {
+                    // marks the number literal as unsigned
+                    digits_end = i;
+                    state = .NumLitIntSuffixU;
+                    result.num_lit_suffix = .U;
+                },
+                'l', 'L' => {
+                    // marks the number literal as long
+                    digits_end = i;
+                    state = .NumLitIntSuffixL;
+                    result.num_lit_suffix = .L;
+                },
+                else => {
+                    result.bytes = chars[begin_index..i];
+                    return result;
+                },
+            },
+            .NumLitIntSuffixU => switch (c) {
+                'l', 'L' => {
+                    result.num_lit_suffix = .LU;
+                    state = .NumLitIntSuffixUL;
+                },
+                else => {
+                    result.bytes = chars[begin_index..digits_end];
+                    return result;
+                },
+            },
+            .NumLitIntSuffixL => switch (c) {
+                'l', 'L' => {
+                    result.num_lit_suffix = .LL;
+                    state = .NumLitIntSuffixLL;
+                },
+                'u', 'U' => {
+                    result.num_lit_suffix = .LU;
+                    result.bytes = chars[begin_index..digits_end];
+                    i += 1;
+                    return result;
+                },
+                else => {
+                    result.bytes = chars[begin_index..digits_end];
+                    return result;
+                },
+            },
+            .NumLitIntSuffixLL => switch (c) {
+                'u', 'U' => {
+                    result.num_lit_suffix = .LLU;
+                    result.bytes = chars[begin_index..digits_end];
+                    i += 1;
+                    return result;
+                },
+                else => {
+                    result.bytes = chars[begin_index..digits_end];
+                    return result;
+                },
+            },
+            .NumLitIntSuffixUL => switch (c) {
+                'l', 'L' => {
+                    result.num_lit_suffix = .LLU;
+                    result.bytes = chars[begin_index..digits_end];
+                    i += 1;
+                    return result;
+                },
+                else => {
+                    result.bytes = chars[begin_index..digits_end];
+                    return result;
+                },
+            },
+            .Identifier => switch (c) {
+                '_', 'a'...'z', 'A'...'Z', '0'...'9' => {},
+                else => {
+                    result.bytes = chars[begin_index..i];
+                    return result;
+                },
+            },
+            .String => switch (c) {
+                '\"' => {
+                    result.bytes = chars[begin_index + 1 .. i];
+                    i += 1;
+                    return result;
+                },
+                '\\' => {
+                    state = .StringEscape;
+                },
+                else => {},
+            },
+            // The escaped character (e.g. the quote in `\"`) never ends the
+            // string; the escape is kept undecoded in `bytes`.
+            .StringEscape => {
+                state = .String;
+            },
+            .ExpectChar => switch (c) {
+                '\'' => return error.TokenizingFailed,
+                '\\' => {
+                    state = .CharEscape;
+                },
+                else => {
+                    state = .ExpectEndQuot;
+                },
+            },
+            // Accept exactly one escaped character, then require the
+            // closing quote.
+            .CharEscape => {
+                state = .ExpectEndQuot;
+            },
+            .ExpectEndQuot => switch (c) {
+                '\'' => {
+                    result.bytes = chars[begin_index + 1 .. i];
+                    i += 1;
+                    return result;
+                },
+                else => return error.TokenizingFailed,
+            },
+            .OpenComment => switch (c) {
+                // A line comment runs to the end of the macro, so nothing
+                // but `Eof` can follow.
+                '/' => return result,
+                '*' => {
+                    state = .Comment;
+                },
+                else => {
+                    // A lone `/`; `c` belongs to the next token.
+                    result.id = .Slash;
+                    return result;
+                },
+            },
+            .Comment => switch (c) {
+                '*' => {
+                    state = .CommentStar;
+                },
+                else => {},
+            },
+            .CommentStar => switch (c) {
+                '/' => {
+                    state = .Start;
+                },
+                // Stay here on `**` so that `**/` still closes the comment.
+                '*' => {},
+                else => {
+                    state = .Comment;
+                },
+            },
+            .Backslash => switch (c) {
+                ' ', '\t', '\x0B', '\x0C' => {},
+                '\n' => {
+                    state = .Start;
+                },
+                '\r' => {
+                    // Swallow the `\n` of a CRLF pair so it is not taken
+                    // for the end of the macro. Reading `chars[i + 1]` is
+                    // in bounds: `chars[i]` is not the terminating NUL.
+                    if (chars[i + 1] == '\n') i += 1;
+                    state = .Start;
+                },
+                else => return error.TokenizingFailed,
+            },
+        }
+    }
+}