Commit d75697a6a3

Vexu <git@vexu.eu>
2020-01-04 01:44:23
std-c tokenizer keywords
1 parent 26bf410
Changed files (1)
lib
lib/std/c/tokenizer.zig
@@ -1,5 +1,5 @@
 const std = @import("std");
-const expect = std.testing.expect;
+const mem = std.mem;
 
 pub const Source = struct {
     buffer: []const u8,
@@ -7,11 +7,19 @@ pub const Source = struct {
 };
 
 pub const Token = struct {
-    id: union(enum) {
+    id: Id,
+    start: usize,
+    end: usize,
+    source: *Source,
+
+    pub const Id = union(enum) {
         Invalid,
         Eof,
         Nl,
         Identifier,
+
+        /// special case for #include <...>
+        MacroString,
         StringLiteral: StrKind,
         CharLiteral: StrKind,
         IntegerLiteral: NumSuffix,
@@ -68,10 +76,160 @@ pub const Token = struct {
         MultiLineComment,
         Hash,
         HashHash,
-    },
-    start: usize,
-    end: usize,
-    source: *Source,
+
+        Keyword_auto,
+        Keyword_break,
+        Keyword_case,
+        Keyword_char,
+        Keyword_const,
+        Keyword_continue,
+        Keyword_default,
+        Keyword_do,
+        Keyword_double,
+        Keyword_else,
+        Keyword_enum,
+        Keyword_extern,
+        Keyword_float,
+        Keyword_for,
+        Keyword_goto,
+        Keyword_if,
+        Keyword_int,
+        Keyword_long,
+        Keyword_register,
+        Keyword_return,
+        Keyword_short,
+        Keyword_signed,
+        Keyword_sizeof,
+        Keyword_static,
+        Keyword_struct,
+        Keyword_switch,
+        Keyword_typedef,
+        Keyword_union,
+        Keyword_unsigned,
+        Keyword_void,
+        Keyword_volatile,
+        Keyword_while,
+
+        // ISO C99
+        Keyword_bool,
+        Keyword_complex,
+        Keyword_imaginary,
+        Keyword_inline,
+        Keyword_restrict,
+
+        // ISO C11
+        Keyword_alignas,
+        Keyword_alignof,
+        Keyword_atomic,
+        Keyword_generic,
+        Keyword_noreturn,
+        Keyword_static_assert,
+        Keyword_thread_local,
+
+        // Preprocessor
+        Keyword_include,
+        Keyword_define,
+        Keyword_ifdef,
+        Keyword_ifndef,
+        Keyword_error,
+        Keyword_pragma,
+    };
+
+    pub const Keyword = struct {
+        bytes: []const u8,
+        id: Id,
+        hash: u32,
+
+        fn init(bytes: []const u8, id: Id) Keyword {
+            @setEvalBranchQuota(2000);
+            return .{
+                .bytes = bytes,
+                .id = id,
+                .hash = std.hash_map.hashString(bytes),
+            };
+        }
+    };
+
+    // TODO extensions
+    pub const keywords = [_]Keyword{
+        Keyword.init("auto", .Keyword_auto),
+        Keyword.init("break", .Keyword_break),
+        Keyword.init("case", .Keyword_case),
+        Keyword.init("char", .Keyword_char),
+        Keyword.init("const", .Keyword_const),
+        Keyword.init("continue", .Keyword_continue),
+        Keyword.init("default", .Keyword_default),
+        Keyword.init("do", .Keyword_do),
+        Keyword.init("double", .Keyword_double),
+        Keyword.init("else", .Keyword_else),
+        Keyword.init("enum", .Keyword_enum),
+        Keyword.init("extern", .Keyword_extern),
+        Keyword.init("float", .Keyword_float),
+        Keyword.init("for", .Keyword_for),
+        Keyword.init("goto", .Keyword_goto),
+        Keyword.init("if", .Keyword_if),
+        Keyword.init("int", .Keyword_int),
+        Keyword.init("long", .Keyword_long),
+        Keyword.init("register", .Keyword_register),
+        Keyword.init("return", .Keyword_return),
+        Keyword.init("short", .Keyword_short),
+        Keyword.init("signed", .Keyword_signed),
+        Keyword.init("sizeof", .Keyword_sizeof),
+        Keyword.init("static", .Keyword_static),
+        Keyword.init("struct", .Keyword_struct),
+        Keyword.init("switch", .Keyword_switch),
+        Keyword.init("typedef", .Keyword_typedef),
+        Keyword.init("union", .Keyword_union),
+        Keyword.init("unsigned", .Keyword_unsigned),
+        Keyword.init("void", .Keyword_void),
+        Keyword.init("volatile", .Keyword_volatile),
+        Keyword.init("while", .Keyword_while),
+
+        // ISO C99
+        Keyword.init("_Bool", .Keyword_bool),
+        Keyword.init("_Complex", .Keyword_complex),
+        Keyword.init("_Imaginary", .Keyword_imaginary),
+        Keyword.init("inline", .Keyword_inline),
+        Keyword.init("restrict", .Keyword_restrict),
+
+        // ISO C11
+        Keyword.init("_Alignas", .Keyword_alignas),
+        Keyword.init("_Alignof", .Keyword_alignof),
+        Keyword.init("_Atomic", .Keyword_atomic),
+        Keyword.init("_Generic", .Keyword_generic),
+        Keyword.init("_Noreturn", .Keyword_noreturn),
+        Keyword.init("_Static_assert", .Keyword_static_assert),
+        Keyword.init("_Thread_local", .Keyword_thread_local),
+
+        // Preprocessor
+        Keyword.init("include", .Keyword_include),
+        Keyword.init("define", .Keyword_define),
+        Keyword.init("ifdef", .Keyword_ifdef),
+        Keyword.init("ifndef", .Keyword_ifndef),
+        Keyword.init("error", .Keyword_error),
+        Keyword.init("pragma", .Keyword_pragma),
+    };
+
+    // TODO perfect hash at comptime
+    pub fn getKeyword(bytes: []const u8, macro: bool) ?Id {
+        var hash = std.hash_map.hashString(bytes);
+        for (keywords) |kw| {
+            if (kw.hash == hash and mem.eql(u8, kw.bytes, bytes)) {
+                switch (kw.id) {
+                    .Keyword_include,
+                    .Keyword_define,
+                    .Keyword_ifdef,
+                    .Keyword_ifndef,
+                    .Keyword_error,
+                    .Keyword_pragma,
+                    => if (!macro) return null,
+                    else => {},
+                }
+                return kw.id;
+            }
+        }
+        return null;
+    }
 
     pub const NumSuffix = enum {
         None,
@@ -95,6 +253,7 @@ pub const Token = struct {
 pub const Tokenizer = struct {
     source: *Source,
     index: usize = 0,
+    prev_tok_id: @TagType(Token.Id),
 
     pub fn next(self: *Tokenizer) Token {
         const start_index = self.index;
@@ -124,6 +283,9 @@ pub const Tokenizer = struct {
             Percent,
             Asterisk,
             Plus,
+
+            /// special case for #include <...>
+            MacroString,
             AngleBracketLeft,
             AngleBracketAngleBracketLeft,
             AngleBracketRight,
@@ -189,7 +351,6 @@ pub const Tokenizer = struct {
                     },
                     'a'...'t', 'v'...'z', 'A'...'K', 'M'...'T', 'V'...'Z', '_' => {
                         state = .Identifier;
-                        result.id = .Identifier;
                     },
                     '=' => {
                         state = .Equal;
@@ -250,7 +411,10 @@ pub const Tokenizer = struct {
                         state = .Plus;
                     },
                     '<' => {
-                        state = .AngleBracketLeft;
+                        if (self.prev_tok_id == .Keyword_include)
+                            state = .MacroString
+                        else
+                            state = .AngleBracketLeft;
                     },
                     '>' => {
                         state = .AngleBracketRight;
@@ -442,7 +606,7 @@ pub const Tokenizer = struct {
                 .Identifier => switch (c) {
                     'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                     else => {
-                        result.id = .Identifier;
+                        result.id = Token.getKeyword(self.source.buffer[result.start..self.index], self.prev_tok_id == .Hash) orelse .Identifier;
                         break;
                     },
                 },
@@ -522,6 +686,14 @@ pub const Tokenizer = struct {
                         break;
                     },
                 },
+                .MacroString => switch (c) {
+                    '>' => {
+                        result.id = .MacroString;
+                        self.index += 1;
+                        break;
+                    },
+                    else => {},
+                },
                 .AngleBracketLeft => switch (c) {
                     '<' => {
                         state = .AngleBracketAngleBracketLeft;
@@ -859,7 +1031,7 @@ pub const Tokenizer = struct {
             switch (state) {
                 .Start => {},
                 .u, .u8, .U, .L, .Identifier => {
-                    result.id = .Identifier;
+                    result.id = Token.getKeyword(self.source.buffer[result.start..self.index], self.prev_tok_id == .Hash) orelse .Identifier;
                 },
 
                 .Cr,
@@ -876,6 +1048,7 @@ pub const Tokenizer = struct {
                 .FloatFractionHex,
                 .FloatExponent,
                 .FloatExponentDigits,
+                .MacroString,
                 => result.id = .Invalid,
 
                 .IntegerLiteralOct,
@@ -910,6 +1083,7 @@ pub const Tokenizer = struct {
             }
         }
 
+        self.prev_tok_id = result.id;
         result.end = self.index;
         return result;
     }