Commit c3724a6e72
Changed files (1)
src-self-hosted
src-self-hosted/c_tokenizer.zig
@@ -0,0 +1,458 @@
+const std = @import("std");
+
+pub const TokenList = std.SegmentedList(CToken, 32);
+
+pub const CToken = struct {
+ id: Id,
+ bytes: []const u8,
+ num_lit_suffix: NumLitSuffix = undefined,
+
+ pub const Id = enum {
+ CharLit,
+ StrLit,
+ NumLitInt,
+ NumLitFloat,
+ Identifier,
+ Minus,
+ Slash,
+ LParen,
+ RParen,
+ Eof,
+ Dot,
+ Asterisk,
+ Bang,
+ Tilde,
+ Shl,
+ Lt,
+ };
+
+ pub const NumLitSuffix = enum {
+ None,
+ L,
+ U,
+ LU,
+ LL,
+ LLU,
+ };
+};
+
+pub fn tokenizeCMacro(tl: *TokenList, chars: [*]const u8) !void {
+ var index: usize = 0;
+ while (true) {
+ const tok = try next(chars[index..], &index);
+ tl.push(tok);
+ if (tok.id == .Eof)
+ return;
+ }
+}
+
+fn next(chars: [*]const u8, index: *usize) !CToken {
+ var state: enum {
+ Start,
+ GotLt,
+ ExpectChar,
+ ExpectEndQuot,
+ OpenComment,
+ Comment,
+ CommentStar,
+ Backslash,
+ String,
+ Identifier,
+ Decimal,
+ Octal,
+ GotZero,
+ Hex,
+ Float,
+ ExpSign,
+ FloatExp,
+ FloatExpFirst,
+ NumLitIntSuffixU,
+ NumLitIntSuffixL,
+ NumLitIntSuffixLL,
+ NumLitIntSuffixUL,
+ GotLt,
+ } = .Start;
+
+ var result = CToken{
+ .bytes = "",
+ .id = .Eof,
+ };
+ var begin_index: usize = 0;
+ var digits: u8 = 0;
+ var pre_escape = .Start;
+
+ for (chars[begin_index..]) |c, i| {
+ if (c == 0) {
+ switch (state) {
+ .Start => {
+ return result;
+ },
+ .Identifier,
+ .Decimal,
+ .Hex,
+ .Octal,
+ .GotZero,
+ .NumLitIntSuffixU,
+ .NumLitIntSuffixL,
+ .NumLitIntSuffixUL,
+ .NumLitIntSuffixLL,
+ .Float,
+ .FloatExp,
+ .GotLt,
+ => {
+ return result;
+ },
+ .ExpectChar,
+ .ExpectEndQuot,
+ .OpenComment,
+ .LineComment,
+ .Comment,
+ .CommentStar,
+ .Backslash,
+ .String,
+ .ExpSign,
+ .FloatExpFirst,
+ => return error.TokenizingFailed,
+ }
+ }
+ index.* += 1;
+ switch (state) {
+ .Start => {
+ switch (c) {
+ ' ', '\t', '\x0B', '\x0C' => {},
+ '\'' => {
+ state = .ExpectChar;
+ result.id = .CharLit;
+ begin_index = i;
+ },
+ '\"' => {
+ state = .String;
+ result.id = .StrLit;
+ begin_index = i;
+ },
+ '/' => {
+ state = .OpenComment;
+ },
+ '\\' => {
+ state = .Backslash;
+ },
+ '\n', '\r' => {
+ return result;
+ },
+ 'a'...'z', 'A'...'Z', '_' => {
+ state = .Identifier;
+ result.id = .Identifier;
+ begin_index = i;
+ },
+ '1'...'9' => {
+ state = .Decimal;
+ result.id = .NumLitInt;
+ begin_index = i;
+ },
+ '0' => {
+ state = .GotZero;
+ result.id = .NumLitInt;
+ begin_index = i;
+ },
+ '.' => {
+ result.id = .Dot;
+ return result;
+ },
+ '<' => {
+ result.id = .Lt;
+ state = .GotLt;
+ },
+ '(' => {
+ result.id = .LParen;
+ return result;
+ },
+ ')' => {
+ result.id = .RParen;
+ return result;
+ },
+ '*' => {
+ result.id = .Asterisk;
+ return result;
+ },
+ '-' => {
+ result.id = .Minus;
+ return result;
+ },
+ '!' => {
+ result.id = .Bang;
+ return result;
+ },
+ '~' => {
+ result.id = .Tilde;
+ return result;
+ },
+ else => return error.TokenizingFailed,
+ }
+ },
+ .GotLt => {
+ switch (c) {
+ '<' => {
+ result.id = .Shl;
+ return result;
+ },
+ else => {
+ return result;
+ },
+ }
+ },
+ .Float => {
+ switch (c) {
+ '.', '0'...'9' => {},
+ 'e', 'E' => {
+ state = .ExpSign;
+ },
+ 'f', 'F', 'l', 'L' => {
+ result.bytes = chars[begin_index..i];
+ return result;
+ },
+ else => {
+ result.bytes = chars[begin_index..i];
+ return result;
+ },
+ }
+ },
+ .ExpSign => {
+ switch (c) {
+ '+', '-' => {
+ state = .FloatExpFirst;
+ },
+ '0'...'9' => {
+ state = .FloatExp;
+ },
+ else => return error.TokenizingFailed,
+ }
+ },
+ .FloatExpFirst => {
+ switch (c) {
+ '0'...'9' => {
+ state = .FloatExp;
+ },
+ else => return error.TokenizingFailed,
+ }
+ },
+ .FloatExp => {
+ switch (c) {
+ '0'...'9' => {},
+ 'f', 'F', 'l', 'L' => {
+ result.bytes = chars[begin_index..i];
+ return result;
+ },
+ else => {
+ result.bytes = chars[begin_index..i];
+ return result;
+ },
+ }
+ },
+ .Decimal => {
+ switch (c) {
+ '0'...'9' => {},
+ '\'' => {},
+ 'u', 'U' => {
+ state = .NumLitIntSuffixU;
+ result.num_lit_suffix = .U;
+ },
+ 'l', 'L' => {
+ state = .NumLitIntSuffixL;
+ result.num_lit_suffix = .L;
+ },
+ '.' => {
+ result.id = .NumLitFloat;
+ state = .Float;
+ },
+ else => {
+ result.bytes = chars[begin_index..i];
+ return result;
+ },
+ }
+ },
+ .GotZero => {
+ switch (c) {
+ 'x', 'X' => {
+ state = .Hex;
+ },
+ '.' => {
+ state = .Float;
+ result.id = .NumLitFloat;
+ },
+ 'l', 'L', 'u', 'U' => {
+ c -= 1;
+ state = .Decimal;
+ },
+ else => {
+ state = .Octal;
+ },
+ }
+ },
+ .Octal => {
+ switch (c) {
+ '0'...'7' => {},
+ '8', '9' => return error.TokenizingFailed,
+ else => {
+ result.bytes = chars[begin_index..i];
+ return result;
+ },
+ }
+ },
+ .Hex => {
+ switch (c) {
+ '0'...'9', 'a'...'f', 'A'...'F' => {},
+
+ 'p', 'P' => {
+ result.id = .NumLitFloat;
+ state = .ExpSign;
+ },
+ 'u', 'U' => {
+ // marks the number literal as unsigned
+ state = .NumLitIntSuffixU;
+ result.num_lit_suffix = .U;
+ },
+ 'l', 'L' => {
+ // marks the number literal as long
+ state = .NumLitIntSuffixL;
+ result.num_lit_suffix = .L;
+ },
+ else => {
+ result.bytes = chars[begin_index..i];
+ return result;
+ },
+ }
+ },
+ .NumLitIntSuffixU => {
+ switch (c) {
+ 'l', 'L' => {
+ result.num_lit_suffix = .LU;
+ state = .NumLitIntSuffixUL;
+ },
+ else => {
+ result.bytes = chars[begin_index..i - 1];
+ return result;
+ },
+ }
+ },
+ .NumLitIntSuffixL => {
+ switch (c) {
+ 'l', 'L' => {
+ result.num_lit_suffix = .LL;
+ state = .NumLitIntSuffixLL;
+ },
+ 'u', 'U' => {
+ result.num_lit_suffix = .LU;
+ result.bytes = chars[begin_index..i - 2];
+ return result;
+ },
+ else => {
+ result.bytes = chars[begin_index..i - 1];
+ return result;
+ },
+ }
+ },
+ .NumLitIntSuffixLL => {
+ switch (c) {
+ 'u', 'U' => {
+ result.num_lit_suffix = .LLU;
+ result.bytes = chars[begin_index..i - 3];
+ return result;
+ },
+ else => {
+ result.bytes = chars[begin_index..i - 2];
+ return result;
+ },
+ }
+ },
+ .NumLitIntSuffixUL => {
+ switch (c) {
+ 'l', 'L' => {
+ result.num_lit_suffix = .LLU;
+ result.bytes = chars[begin_index..i - 3];
+ return result;
+ },
+ else => {
+ result.bytes = chars[begin_index..i - 2];
+ return result;
+ },
+ }
+ },
+ .Identifier => {
+ switch (c) {
+ '_', 'a'...'z', 'A'...'Z', '0'...'9' => {},
+ else => {
+ result.bytes = chars[begin_index..i];
+ return result;
+ },
+ }
+ },
+ .String => {
+ switch (c) {
+ '\"' => {
+ result.bytes = chars[begin_index + 1 .. i];
+ return result;
+ },
+ else => {},
+ }
+ },
+ .ExpectChar => {
+ switch (c) {
+ '\'' => return error.TokenizingFailed,
+ else => {
+ state = .ExpectEndQuot;
+ },
+ }
+ },
+ .ExpectEndQuot => {
+ switch (c) {
+ '\'' => {
+ result.bytes = chars[begin_index + 1 .. i];
+ return result;
+ },
+ else => return error.TokenizingFailed,
+ }
+ },
+ .OpenComment => {
+ switch (c) {
+ '/' => {
+ return result;
+ },
+ '*' => {
+ state = .Comment;
+ },
+ else => {
+ result.id = .Slash;
+ return result;
+ },
+ }
+ },
+ .Comment => {
+ switch (c) {
+ '*' => {
+ state = .CommentStar;
+ },
+ else => {},
+ }
+ },
+ .CommentStar => {
+ switch (c) {
+ '/' => {
+ state = .Start;
+ },
+ else => {
+ state = .Comment;
+ },
+ }
+ },
+ .Backslash => {
+ switch (c) {
+ ' ', '\t', '\x0B', '\x0C' => {},
+ '\n', '\r' => {
+ state = .Start;
+ },
+ else => return error.TokenizingFailed,
+ }
+ },
+ }
+ }
+}