Commit 35362f8137

Andrew Kelley <superjoe30@gmail.com>
2016-04-22 00:48:13
better parsing of C macros
See #88
1 parent a380b80
src/c_tokenizer.cpp
@@ -0,0 +1,651 @@
+/*
+ * Copyright (c) 2016 Andrew Kelley
+ *
+ * This file is part of zig, which is MIT licensed.
+ * See http://opensource.org/licenses/MIT
+ */
+
+#include "c_tokenizer.hpp"
+#include <inttypes.h>
+
+// The macros below are case-label fragments: each one expands to a chain of
+// labels meant to be written after the `case` keyword and followed by a colon,
+// for example `case DIGIT:`.
+
+// Space, horizontal tab, vertical tab and form feed. '\n' is not part of this
+// set (and neither is '\r').
+#define WHITESPACE_EXCEPT_N \
+         ' ': \
+    case '\t': \
+    case '\v': \
+    case '\f'
+
+// The decimal digits '1' through '9'.
+#define DIGIT_NON_ZERO \
+         '1': \
+    case '2': \
+    case '3': \
+    case '4': \
+    case '5': \
+    case '6': \
+    case '7': \
+    case '8': \
+    case '9'
+
+// The decimal digits '0' through '9'.
+#define DIGIT \
+         '0': \
+    case DIGIT_NON_ZERO
+
+// The ASCII letters 'a'-'z' and 'A'-'Z', as a case-label fragment
+// (use as `case ALPHA:`).
+#define ALPHA \
+         'a': \
+    case 'b': \
+    case 'c': \
+    case 'd': \
+    case 'e': \
+    case 'f': \
+    case 'g': \
+    case 'h': \
+    case 'i': \
+    case 'j': \
+    case 'k': \
+    case 'l': \
+    case 'm': \
+    case 'n': \
+    case 'o': \
+    case 'p': \
+    case 'q': \
+    case 'r': \
+    case 's': \
+    case 't': \
+    case 'u': \
+    case 'v': \
+    case 'w': \
+    case 'x': \
+    case 'y': \
+    case 'z': \
+    case 'A': \
+    case 'B': \
+    case 'C': \
+    case 'D': \
+    case 'E': \
+    case 'F': \
+    case 'G': \
+    case 'H': \
+    case 'I': \
+    case 'J': \
+    case 'K': \
+    case 'L': \
+    case 'M': \
+    case 'N': \
+    case 'O': \
+    case 'P': \
+    case 'Q': \
+    case 'R': \
+    case 'S': \
+    case 'T': \
+    case 'U': \
+    case 'V': \
+    case 'W': \
+    case 'X': \
+    case 'Y': \
+    case 'Z'
+
+// Characters that may begin an identifier: a letter or an underscore.
+#define IDENT_START \
+    ALPHA: \
+    case '_'
+
+// Characters that may continue an identifier: IDENT_START plus decimal digits.
+#define IDENT \
+    IDENT_START: \
+    case DIGIT
+
+
+// Appends a new token of kind `id` to ctok->tokens and makes it the current
+// token. No token may be in progress when this is called. Token kinds that
+// carry text (string literals and symbols) get their Buf zeroed and reset to
+// length 0; the other kinds leave the payload untouched.
+static void begin_token(CTokenize *ctok, CTokId id) {
+    assert(ctok->cur_tok == nullptr);
+    ctok->tokens.add_one();
+    CTok *tok = &ctok->tokens.last();
+    ctok->cur_tok = tok;
+    tok->id = id;
+
+    // Pick the Buf payload, if this kind of token has one.
+    Buf *text = nullptr;
+    switch (id) {
+        case CTokIdStrLit:
+            text = &tok->data.str_lit;
+            break;
+        case CTokIdSymbol:
+            text = &tok->data.symbol;
+            break;
+        case CTokIdCharLit:
+        case CTokIdNumLitInt:
+        case CTokIdNumLitFloat:
+        case CTokIdMinus:
+            break;
+    }
+
+    if (text != nullptr) {
+        memset(text, 0, sizeof(Buf));
+        buf_resize(text, 0);
+    }
+}
+
+// Marks the current token as finished by clearing ctok->cur_tok. The token
+// itself stays in ctok->tokens.
+static void end_token(CTokenize *ctok) {
+    ctok->cur_tok = nullptr;
+}
+
+// Records that the input could not be tokenized by setting ctok->error.
+static void mark_error(CTokenize *ctok) {
+    ctok->error = true;
+}
+
+// Stores the character `c` in the current token and selects the state that
+// follows it. For a character literal the value is recorded and the closing
+// quote is expected next; for a string literal the character is appended and
+// scanning of the string continues. Any other token kind is a logic error.
+static void add_char(CTokenize *ctok, uint8_t c) {
+    assert(ctok->cur_tok);
+    CTok *tok = ctok->cur_tok;
+    switch (tok->id) {
+        case CTokIdCharLit:
+            tok->data.char_lit = c;
+            ctok->state = CTokStateExpectEndQuot;
+            return;
+        case CTokIdStrLit:
+            buf_append_char(&tok->data.str_lit, c);
+            ctok->state = CTokStateString;
+            return;
+        default:
+            zig_unreachable();
+    }
+}
+
+// Folds one hexadecimal digit (`value` is used as an index into a 16-entry
+// digit table) into the integer value of the current token and appends its
+// lowercase spelling to ctok->buf.
+static void hex_digit(CTokenize *ctok, uint8_t value) {
+    static const uint8_t lower_hex[] = "0123456789abcdef";
+
+    // TODO @mul_with_overflow
+    // TODO @add_with_overflow
+    uint64_t shifted = ctok->cur_tok->data.num_lit_int * 16;
+    ctok->cur_tok->data.num_lit_int = shifted + value;
+
+    buf_append_char(&ctok->buf, lower_hex[value]);
+}
+
+// Finishes a floating point token: converts the literal text accumulated in
+// ctok->buf with strtod, stores the result in the current token, ends the
+// token and returns the tokenizer to the start state.
+static void end_float(CTokenize *ctok) {
+    // TODO detect errors, overflow, and underflow
+    ctok->cur_tok->data.num_lit_float = strtod(buf_ptr(&ctok->buf), nullptr);
+    end_token(ctok);
+    ctok->state = CTokStateStart;
+}
+
+/*
+ * Tokenizes the replacement text of one C macro definition.
+ *
+ * `c` points at NUL-terminated text. Scanning stops at the NUL terminator or
+ * at the first newline that is neither escaped with a backslash nor inside a
+ * block comment. Previous contents of ctok->tokens are discarded and replaced
+ * by the tokens found. If the text cannot be tokenized, ctok->error is set
+ * and the contents of ctok->tokens must not be relied upon.
+ *
+ * Recognized tokens: character and string literals (simple escapes only),
+ * identifiers, decimal/octal/hexadecimal integers with optional u/l suffixes,
+ * decimal and hexadecimal floating point literals, and a lone '-'.
+ * Line and block comments are skipped.
+ */
+void tokenize_c_macro(CTokenize *ctok, const uint8_t *c) {
+    ctok->tokens.resize(0);
+    ctok->state = CTokStateStart;
+    ctok->error = false;
+    ctok->cur_tok = nullptr;
+
+    buf_resize(&ctok->buf, 0);
+
+    // States that need to look at the current character again after ending a
+    // token do `c -= 1; continue;` so that the loop increment re-visits it.
+    for (; *c; c += 1) {
+        switch (ctok->state) {
+            case CTokStateStart:
+                switch (*c) {
+                    case WHITESPACE_EXCEPT_N:
+                    case '\r':
+                        // '\r' is skipped so that CRLF line endings do not
+                        // turn every macro into an error.
+                        break;
+                    case '\'':
+                        ctok->state = CTokStateExpectChar;
+                        begin_token(ctok, CTokIdCharLit);
+                        break;
+                    case '\"':
+                        ctok->state = CTokStateString;
+                        begin_token(ctok, CTokIdStrLit);
+                        break;
+                    case '/':
+                        ctok->state = CTokStateOpenComment;
+                        break;
+                    case '\\':
+                        ctok->state = CTokStateBackslash;
+                        break;
+                    case '\n':
+                        goto found_end_of_macro;
+                    case '-':
+                        // Single-character token; there is nothing to
+                        // accumulate, so it is ended immediately.
+                        begin_token(ctok, CTokIdMinus);
+                        end_token(ctok);
+                        break;
+                    case IDENT_START:
+                        ctok->state = CTokStateIdentifier;
+                        begin_token(ctok, CTokIdSymbol);
+                        buf_append_char(&ctok->cur_tok->data.symbol, *c);
+                        break;
+                    case DIGIT_NON_ZERO:
+                        ctok->state = CTokStateDecimal;
+                        ctok->unsigned_suffix = false;
+                        ctok->long_suffix = false;
+                        begin_token(ctok, CTokIdNumLitInt);
+                        ctok->cur_tok->data.num_lit_int = *c - '0';
+                        buf_resize(&ctok->buf, 0);
+                        buf_append_char(&ctok->buf, *c);
+                        break;
+                    case '0':
+                        ctok->state = CTokStateGotZero;
+                        ctok->unsigned_suffix = false;
+                        ctok->long_suffix = false;
+                        begin_token(ctok, CTokIdNumLitInt);
+                        ctok->cur_tok->data.num_lit_int = 0;
+                        buf_resize(&ctok->buf, 0);
+                        buf_append_char(&ctok->buf, '0');
+                        break;
+                    case '.':
+                        begin_token(ctok, CTokIdNumLitFloat);
+                        ctok->state = CTokStateFloat;
+                        buf_init_from_str(&ctok->buf, "0.");
+                        break;
+                    default:
+                        return mark_error(ctok);
+                }
+                break;
+            case CTokStateFloat:
+                switch (*c) {
+                    case 'e':
+                    case 'E':
+                        buf_append_char(&ctok->buf, 'e');
+                        ctok->state = CTokStateExpSign;
+                        break;
+                    case 'f':
+                    case 'F':
+                    case 'l':
+                    case 'L':
+                        end_float(ctok);
+                        break;
+                    case DIGIT:
+                        buf_append_char(&ctok->buf, *c);
+                        break;
+                    default:
+                        c -= 1;
+                        end_float(ctok);
+                        continue;
+                }
+                break;
+            case CTokStateExpSign:
+                switch (*c) {
+                    case '+':
+                    case '-':
+                        ctok->state = CTokStateFloatExpFirst;
+                        buf_append_char(&ctok->buf, *c);
+                        break;
+                    case DIGIT:
+                        ctok->state = CTokStateFloatExp;
+                        buf_append_char(&ctok->buf, *c);
+                        break;
+                    default:
+                        return mark_error(ctok);
+                }
+                break;
+            case CTokStateFloatExpFirst:
+                switch (*c) {
+                    case DIGIT:
+                        buf_append_char(&ctok->buf, *c);
+                        ctok->state = CTokStateFloatExp;
+                        break;
+                    default:
+                        return mark_error(ctok);
+                }
+                break;
+            case CTokStateFloatExp:
+                switch (*c) {
+                    case DIGIT:
+                        buf_append_char(&ctok->buf, *c);
+                        break;
+                    case 'f':
+                    case 'F':
+                    case 'l':
+                    case 'L':
+                        end_float(ctok);
+                        break;
+                    default:
+                        c -= 1;
+                        end_float(ctok);
+                        continue;
+                }
+                break;
+            case CTokStateDecimal:
+                switch (*c) {
+                    case DIGIT:
+                        buf_append_char(&ctok->buf, *c);
+
+                        // TODO @mul_with_overflow
+                        ctok->cur_tok->data.num_lit_int *= 10;
+                        // TODO @add_with_overflow
+                        ctok->cur_tok->data.num_lit_int += *c - '0';
+                        break;
+                    case '\'':
+                        break;
+                    case 'u':
+                    case 'U':
+                        ctok->unsigned_suffix = true;
+                        ctok->state = CTokStateIntSuffix;
+                        break;
+                    case 'l':
+                    case 'L':
+                        ctok->long_suffix = true;
+                        ctok->state = CTokStateIntSuffixLong;
+                        break;
+                    case '.':
+                        buf_append_char(&ctok->buf, '.');
+                        ctok->cur_tok->id = CTokIdNumLitFloat;
+                        ctok->state = CTokStateFloat;
+                        break;
+                    case 'e':
+                    case 'E':
+                        // An exponent without a decimal point (e.g. 1e5) is
+                        // still a floating point literal.
+                        buf_append_char(&ctok->buf, 'e');
+                        ctok->cur_tok->id = CTokIdNumLitFloat;
+                        ctok->state = CTokStateExpSign;
+                        break;
+                    default:
+                        c -= 1;
+                        end_token(ctok);
+                        ctok->state = CTokStateStart;
+                        continue;
+                }
+                break;
+            case CTokStateIntSuffix:
+                switch (*c) {
+                    case 'l':
+                    case 'L':
+                        if (ctok->long_suffix) {
+                            return mark_error(ctok);
+                        }
+                        ctok->long_suffix = true;
+                        ctok->state = CTokStateIntSuffixLong;
+                        break;
+                    case 'u':
+                    case 'U':
+                        if (ctok->unsigned_suffix) {
+                            return mark_error(ctok);
+                        }
+                        ctok->unsigned_suffix = true;
+                        break;
+                    default:
+                        c -= 1;
+                        end_token(ctok);
+                        ctok->state = CTokStateStart;
+                        continue;
+                }
+                break;
+            case CTokStateIntSuffixLong:
+                switch (*c) {
+                    case 'l':
+                    case 'L':
+                        ctok->state = CTokStateIntSuffix;
+                        break;
+                    case 'u':
+                    case 'U':
+                        if (ctok->unsigned_suffix) {
+                            return mark_error(ctok);
+                        }
+                        ctok->unsigned_suffix = true;
+                        break;
+                    default:
+                        c -= 1;
+                        end_token(ctok);
+                        ctok->state = CTokStateStart;
+                        continue;
+                }
+                break;
+            case CTokStateGotZero:
+                switch (*c) {
+                    case 'x':
+                    case 'X':
+                        // Keep the prefix in the text buffer so that a hex
+                        // float (0x1p3) is later handed to strtod intact.
+                        buf_append_char(&ctok->buf, 'x');
+                        ctok->state = CTokStateHex;
+                        break;
+                    case '.':
+                        ctok->state = CTokStateFloat;
+                        ctok->cur_tok->id = CTokIdNumLitFloat;
+                        buf_append_char(&ctok->buf, '.');
+                        break;
+                    default:
+                        c -= 1;
+                        ctok->state = CTokStateOctal;
+                        continue;
+                }
+                break;
+            case CTokStateOctal:
+                switch (*c) {
+                    case '0':
+                    case '1':
+                    case '2':
+                    case '3':
+                    case '4':
+                    case '5':
+                    case '6':
+                    case '7':
+                        // TODO @mul_with_overflow
+                        ctok->cur_tok->data.num_lit_int *= 8;
+                        // TODO @add_with_overflow
+                        ctok->cur_tok->data.num_lit_int += *c - '0';
+                        break;
+                    case '8':
+                    case '9':
+                        return mark_error(ctok);
+                    case '\'':
+                        break;
+                    case 'u':
+                    case 'U':
+                        // Suffixes apply to octal literals (and plain 0) too.
+                        ctok->unsigned_suffix = true;
+                        ctok->state = CTokStateIntSuffix;
+                        break;
+                    case 'l':
+                    case 'L':
+                        ctok->long_suffix = true;
+                        ctok->state = CTokStateIntSuffixLong;
+                        break;
+                    default:
+                        c -= 1;
+                        end_token(ctok);
+                        ctok->state = CTokStateStart;
+                        continue;
+                }
+                break;
+            case CTokStateHex:
+                switch (*c) {
+                    case DIGIT:
+                        hex_digit(ctok, *c - '0');
+                        break;
+                    case 'a':
+                    case 'b':
+                    case 'c':
+                    case 'd':
+                    case 'e':
+                    case 'f':
+                        hex_digit(ctok, *c - 'a' + 10);
+                        break;
+                    case 'A':
+                    case 'B':
+                    case 'C':
+                    case 'D':
+                    case 'E':
+                    case 'F':
+                        hex_digit(ctok, *c - 'A' + 10);
+                        break;
+                    case 'p':
+                    case 'P':
+                        // Binary exponent of a hex float; strtod needs the
+                        // 'p' to be present in the text.
+                        buf_append_char(&ctok->buf, 'p');
+                        ctok->cur_tok->id = CTokIdNumLitFloat;
+                        ctok->state = CTokStateExpSign;
+                        break;
+                    case 'u':
+                    case 'U':
+                        // Suffixes apply to hex literals too (e.g. 0x80u).
+                        ctok->unsigned_suffix = true;
+                        ctok->state = CTokStateIntSuffix;
+                        break;
+                    case 'l':
+                    case 'L':
+                        ctok->long_suffix = true;
+                        ctok->state = CTokStateIntSuffixLong;
+                        break;
+                    default:
+                        c -= 1;
+                        end_token(ctok);
+                        ctok->state = CTokStateStart;
+                        continue;
+                }
+                break;
+            case CTokStateIdentifier:
+                switch (*c) {
+                    case IDENT:
+                        buf_append_char(&ctok->cur_tok->data.symbol, *c);
+                        break;
+                    default:
+                        c -= 1;
+                        end_token(ctok);
+                        ctok->state = CTokStateStart;
+                        continue;
+                }
+                break;
+            case CTokStateString:
+                switch (*c) {
+                    case '\\':
+                        ctok->state = CTokStateCharEscape;
+                        break;
+                    case '\"':
+                        end_token(ctok);
+                        ctok->state = CTokStateStart;
+                        break;
+                    case '\n':
+                        // A raw newline cannot appear inside a string
+                        // literal; stop instead of scanning into later lines.
+                        return mark_error(ctok);
+                    default:
+                        buf_append_char(&ctok->cur_tok->data.str_lit, *c);
+                }
+                break;
+            case CTokStateExpectChar:
+                switch (*c) {
+                    case '\\':
+                        ctok->state = CTokStateCharEscape;
+                        break;
+                    case '\'':
+                    case '\n':
+                        // Empty literal, or a literal broken by a newline.
+                        return mark_error(ctok);
+                    default:
+                        ctok->cur_tok->data.char_lit = *c;
+                        ctok->state = CTokStateExpectEndQuot;
+                }
+                break;
+            case CTokStateCharEscape:
+                // Shared by character and string literals; add_char() picks
+                // the follow-up state from the kind of the current token.
+                switch (*c) {
+                    case '\'':
+                    case '"':
+                    case '?':
+                    case '\\':
+                        add_char(ctok, *c);
+                        break;
+                    case 'a':
+                        add_char(ctok, '\a');
+                        break;
+                    case 'b':
+                        add_char(ctok, '\b');
+                        break;
+                    case 'f':
+                        add_char(ctok, '\f');
+                        break;
+                    case 'n':
+                        add_char(ctok, '\n');
+                        break;
+                    case 'r':
+                        add_char(ctok, '\r');
+                        break;
+                    case 't':
+                        add_char(ctok, '\t');
+                        break;
+                    case 'v':
+                        add_char(ctok, '\v');
+                        break;
+                    case DIGIT:
+                        zig_panic("TODO octal");
+                        break;
+                    case 'x':
+                        zig_panic("TODO hex");
+                        break;
+                    case 'u':
+                        zig_panic("TODO unicode");
+                        break;
+                    case 'U':
+                        zig_panic("TODO Unicode");
+                        break;
+                    default:
+                        return mark_error(ctok);
+                }
+                break;
+            case CTokStateExpectEndQuot:
+                switch (*c) {
+                    case '\'':
+                        end_token(ctok);
+                        ctok->state = CTokStateStart;
+                        break;
+                    default:
+                        return mark_error(ctok);
+                }
+                break;
+            case CTokStateOpenComment:
+                switch (*c) {
+                    case '/':
+                        ctok->state = CTokStateLineComment;
+                        break;
+                    case '*':
+                        ctok->state = CTokStateComment;
+                        break;
+                    default:
+                        return mark_error(ctok);
+                }
+                break;
+            case CTokStateLineComment:
+                if (*c == '\n') {
+                    ctok->state = CTokStateStart;
+                    goto found_end_of_macro;
+                }
+                break;
+            case CTokStateComment:
+                switch (*c) {
+                    case '*':
+                        ctok->state = CTokStateCommentStar;
+                        break;
+                    default:
+                        break;
+                }
+                break;
+            case CTokStateCommentStar:
+                switch (*c) {
+                    case '/':
+                        ctok->state = CTokStateStart;
+                        break;
+                    case '*':
+                        break;
+                    default:
+                        ctok->state = CTokStateComment;
+                        break;
+                }
+                break;
+            case CTokStateBackslash:
+                switch (*c) {
+                    case '\r':
+                        // First half of a CRLF line continuation; keep
+                        // waiting for the '\n'.
+                        break;
+                    case '\n':
+                        ctok->state = CTokStateStart;
+                        break;
+                    default:
+                        return mark_error(ctok);
+                }
+                break;
+        }
+    }
+found_end_of_macro:
+
+    // Finish (or reject) whatever was in progress when the input ended.
+    switch (ctok->state) {
+        case CTokStateStart:
+        case CTokStateLineComment:
+            // A line comment that runs up to the NUL terminator is complete;
+            // no token is open in either of these states.
+            break;
+        case CTokStateIdentifier:
+        case CTokStateDecimal:
+        case CTokStateHex:
+        case CTokStateOctal:
+        case CTokStateGotZero:
+        case CTokStateIntSuffix:
+        case CTokStateIntSuffixLong:
+            end_token(ctok);
+            break;
+        case CTokStateFloat:
+        case CTokStateFloatExp:
+            end_float(ctok);
+            break;
+        case CTokStateExpectChar:
+        case CTokStateExpectEndQuot:
+        case CTokStateOpenComment:
+        case CTokStateComment:
+        case CTokStateCommentStar:
+        case CTokStateCharEscape:
+        case CTokStateBackslash:
+        case CTokStateString:
+        case CTokStateExpSign:
+        case CTokStateFloatExpFirst:
+            return mark_error(ctok);
+    }
+
+    assert(ctok->cur_tok == nullptr);
+}
src/c_tokenizer.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2016 Andrew Kelley
+ *
+ * This file is part of zig, which is MIT licensed.
+ * See http://opensource.org/licenses/MIT
+ */
+
+
+#ifndef ZIG_C_TOKENIZER_HPP
+#define ZIG_C_TOKENIZER_HPP
+
+#include "buffer.hpp"
+
+// Kinds of token stored in CTokenize::tokens.
+enum CTokId {
+    CTokIdCharLit,      // character literal; value in CTok::data.char_lit
+    CTokIdStrLit,       // string literal; contents in CTok::data.str_lit
+    CTokIdNumLitInt,    // integer literal; value in CTok::data.num_lit_int
+    CTokIdNumLitFloat,  // floating point literal; value in CTok::data.num_lit_float
+    CTokIdSymbol,       // identifier; text in CTok::data.symbol
+    CTokIdMinus,        // no payload
+};
+
+// One token. `id` selects which member of `data` is meaningful.
+struct CTok {
+    enum CTokId id;
+    union {
+        uint8_t char_lit;       // CTokIdCharLit
+        Buf str_lit;            // CTokIdStrLit
+        uint64_t num_lit_int;   // CTokIdNumLitInt
+        double num_lit_float;   // CTokIdNumLitFloat
+        Buf symbol;             // CTokIdSymbol
+    } data;
+};
+
+// States of the state machine in tokenize_c_macro().
+enum CTokState {
+    CTokStateStart,          // between tokens
+    CTokStateExpectChar,     // after the opening ' of a character literal
+    CTokStateCharEscape,     // after a backslash inside a character or string literal
+    CTokStateExpectEndQuot,  // character literal has its value; closing ' must follow
+    CTokStateOpenComment,    // after '/'; '/' or '*' must follow
+    CTokStateLineComment,    // inside a // comment
+    CTokStateComment,        // inside a /* comment
+    CTokStateCommentStar,    // inside a /* comment, just after '*'
+    CTokStateBackslash,      // after a backslash outside any literal
+    CTokStateString,         // inside a string literal
+    CTokStateIdentifier,     // inside an identifier
+    CTokStateDecimal,        // inside a decimal integer literal
+    CTokStateOctal,          // inside an integer literal that began with 0
+    CTokStateGotZero,        // just after a leading 0
+    CTokStateHex,            // after 0x / 0X
+    CTokStateIntSuffix,      // inside the u/l suffix of an integer literal
+    CTokStateIntSuffixLong,  // inside an integer suffix, just after an 'l'
+    CTokStateFloat,          // after the decimal point of a float literal
+    CTokStateExpSign,        // after the exponent marker; sign or digit must follow
+    CTokStateFloatExp,       // inside the exponent digits
+    CTokStateFloatExpFirst,  // after the exponent sign; a digit must follow
+};
+
+// Tokenizer state and output. tokenize_c_macro() resets tokens, state, error,
+// cur_tok and buf at the start of each call, so one instance can be reused.
+struct CTokenize {
+    ZigList<CTok> tokens;   // output: tokens of the most recent call
+    CTokState state;        // current state of the state machine
+    bool error;             // output: set when the input could not be tokenized
+    CTok *cur_tok;          // token currently being built, or nullptr
+    Buf buf;                // text of the numeric literal being scanned
+    bool unsigned_suffix;   // a u/U suffix was seen on the current integer literal
+    bool long_suffix;       // an l/L suffix was seen on the current integer literal
+};
+
+/*
+ * Tokenizes the NUL-terminated text at `c`, stopping at the NUL terminator or
+ * at the first newline that is neither preceded by a backslash nor inside a
+ * block comment. The tokens replace the contents of ctok->tokens; ctok->error
+ * is set when the text could not be tokenized.
+ */
+void tokenize_c_macro(CTokenize *ctok, const uint8_t *c);
+
+#endif
src/parseh.cpp
@@ -12,6 +12,7 @@
 #include "parser.hpp"
 #include "all_types.hpp"
 #include "tokenizer.hpp"
+#include "c_tokenizer.hpp"
 #include "analyze.hpp"
 
 #include <clang/Frontend/ASTUnit.h>
@@ -176,6 +177,19 @@ static AstNode *create_str_lit_node(Context *c, Buf *buf) {
     return node;
 }
 
+// Creates a number literal AST node of kind NumLitFloat holding `x`.
+static AstNode *create_num_lit_float(Context *c, double x) {
+    AstNode *float_lit = create_node(c, NodeTypeNumberLiteral);
+    float_lit->data.number_literal.data.x_float = x;
+    float_lit->data.number_literal.kind = NumLitFloat;
+    return float_lit;
+}
+
+// Creates a float literal node for `x`; when `negative` is set the literal is
+// wrapped in a PrefixOpNegation node.
+static AstNode *create_num_lit_float_negative(Context *c, double x, bool negative) {
+    AstNode *literal = create_num_lit_float(c, x);
+    if (negative) {
+        return create_prefix_node(c, PrefixOpNegation, literal);
+    }
+    return literal;
+}
+
 static AstNode *create_num_lit_unsigned(Context *c, uint64_t x) {
     AstNode *node = create_node(c, NodeTypeNumberLiteral);
     node->data.number_literal.kind = NumLitUInt;
@@ -183,6 +197,12 @@ static AstNode *create_num_lit_unsigned(Context *c, uint64_t x) {
     return node;
 }
 
+// Creates an unsigned integer literal node for `x`; when `negative` is set the
+// literal is wrapped in a PrefixOpNegation node.
+static AstNode *create_num_lit_unsigned_negative(Context *c, uint64_t x, bool negative) {
+    AstNode *literal = create_num_lit_unsigned(c, x);
+    if (negative) {
+        return create_prefix_node(c, PrefixOpNegation, literal);
+    }
+    return literal;
+}
+
 static AstNode *create_num_lit_signed(Context *c, int64_t x) {
     if (x >= 0) {
         return create_num_lit_unsigned(c, x);
@@ -1244,209 +1264,70 @@ static void render_macros(Context *c) {
     }
 }
 
-static int parse_c_char_lit(Buf *value, uint8_t *out_c) {
-    enum State {
-        StateExpectStartQuot,
-        StateExpectChar,
-        StateExpectEndQuot,
-        StateExpectEnd,
-    };
-    State state = StateExpectStartQuot;
-    for (int i = 0; i < buf_len(value); i += 1) {
-        uint8_t c = buf_ptr(value)[i];
-        switch (state) {
-            case StateExpectStartQuot:
-                switch (c) {
-                    case '\'':
-                        state = StateExpectChar;
-                        break;
-                    default:
-                        return -1;
-                }
-                break;
-            case StateExpectChar:
-                switch (c) {
-                    case '\\':
-                    case '\'':
-                        return -1;
-                    default:
-                        *out_c = c;
-                        state = StateExpectEndQuot;
-                }
-                break;
-            case StateExpectEndQuot:
-                switch (c) {
-                    case '\'':
-                        state = StateExpectEnd;
-                        break;
-                    default:
-                        return -1;
-                }
-                break;
-            case StateExpectEnd:
-                return -1;
-        }
-    }
-    return (state == StateExpectEnd) ? 0 : -1;
-}
-
-static int parse_c_num_lit_unsigned(Buf *buf, uint64_t *out_val) {
-    char *temp;
-    *out_val = strtoull(buf_ptr(buf), &temp, 0);
-
-    if (temp == buf_ptr(buf) || *temp != 0 || *out_val == ULLONG_MAX) {
-        return -1;
-    }
-
-    return 0;
-}
-
-static bool is_simple_symbol(Buf *buf) {
-    bool first = true;
-    for (int i = 0; i < buf_len(buf); i += 1) {
-        uint8_t c = buf_ptr(buf)[i];
-        bool valid_alpha = (c >= 'a' && c <= 'z') ||
-            (c >= 'A' && c <= 'Z') || c == '_';
-        bool valid_digit = (c >= '0' && c <= '9');
-
-        bool ok = (valid_alpha || (!first && valid_digit));
-        first = false;
-
-        if (!ok) {
-            return false;
-        }
-    }
-    return true;
-}
-
-enum ParseCStrState {
-    ParseCStrStateExpectQuot,
-    ParseCStrStateNormal,
-    ParseCStrStateEscape,
-};
-
-static int parse_c_str_lit(Buf *buf, Buf *out_str) {
-    ParseCStrState state = ParseCStrStateExpectQuot;
-    buf_resize(out_str, 0);
-
-    for (int i = 0; i < buf_len(buf); i += 1) {
-        uint8_t c = buf_ptr(buf)[i];
-        switch (state) {
-            case ParseCStrStateExpectQuot:
-                if (c == '"') {
-                    state = ParseCStrStateNormal;
-                } else {
-                    return -1;
-                }
-                break;
-            case ParseCStrStateNormal:
-                switch (c) {
-                    case '\\':
-                        state = ParseCStrStateEscape;
-                        break;
-                    case '\n':
-                        return -1;
-                    case '"':
-                        return 0;
-                    default:
-                        buf_append_char(out_str, c);
-                }
-                break;
-            case ParseCStrStateEscape:
-                switch (c) {
-                    case '\'':
-                        buf_append_char(out_str, '\'');
-                        state = ParseCStrStateNormal;
-                        break;
-                    case '"':
-                        buf_append_char(out_str, '"');
-                        state = ParseCStrStateNormal;
-                        break;
-                    case '?':
-                        buf_append_char(out_str, '\?');
-                        state = ParseCStrStateNormal;
-                        break;
-                    case '\\':
-                        buf_append_char(out_str, '\\');
-                        state = ParseCStrStateNormal;
-                        break;
-                    case 'a':
-                        buf_append_char(out_str, '\a');
-                        state = ParseCStrStateNormal;
-                        break;
-                    case 'b':
-                        buf_append_char(out_str, '\b');
-                        state = ParseCStrStateNormal;
-                        break;
-                    case 'f':
-                        buf_append_char(out_str, '\f');
-                        state = ParseCStrStateNormal;
-                        break;
-                    case 'n':
-                        buf_append_char(out_str, '\n');
-                        state = ParseCStrStateNormal;
-                        break;
-                    case 'r':
-                        buf_append_char(out_str, '\r');
-                        state = ParseCStrStateNormal;
-                        break;
-                    case 't':
-                        buf_append_char(out_str, '\t');
-                        state = ParseCStrStateNormal;
-                        break;
-                    case 'v':
-                        buf_append_char(out_str, '\v');
-                        state = ParseCStrStateNormal;
-                        break;
-                    default:
-                        // TODO octal escape sequence, hexadecimal escape sequence, and
-                        // universal character name
-                        return -1;
-                }
-                break;
-        }
-    }
-
-    return -1;
-}
-
-static void process_macro(Context *c, Buf *name, Buf *value) {
-    //fprintf(stderr, "macro '%s' = '%s'\n", buf_ptr(name), buf_ptr(value));
+// Translates one C macro into a declaration when its replacement text is simple
+// enough to understand. `char_ptr` is the text to tokenize; `ctok` is scratch
+// tokenizer state that is overwritten on every call.
+//
+// Handled forms:
+//   - a single character or string literal  -> entry in c->macro_table
+//   - an integer or float literal as the last token, negated when the first
+//     token was a minus                      -> entry in c->macro_table
+//   - a single identifier other than `name` -> entry in c->macro_symbols
+// Macros named like a zig keyword, text that fails to tokenize, and every
+// other token sequence are ignored.
+static void process_macro(Context *c, CTokenize *ctok, Buf *name, const char *char_ptr) {
     if (is_zig_keyword(name)) {
         return;
     }
 
-    // maybe it's a character literal
-    uint8_t ch;
-    if (!parse_c_char_lit(value, &ch)) {
-        AstNode *var_node = create_var_decl_node(c, buf_ptr(name), create_char_lit_node(c, ch));
-        c->macro_table.put(name, var_node);
-        return;
-    }
-    // maybe it's a string literal
-    Buf str_lit = BUF_INIT;
-    if (!parse_c_str_lit(value, &str_lit)) {
-        AstNode *var_node = create_var_decl_node(c, buf_ptr(name), create_str_lit_node(c, &str_lit));
-        c->macro_table.put(name, var_node);
-        return;
-    }
+    tokenize_c_macro(ctok, (const uint8_t *)char_ptr);
 
-    // maybe it's an unsigned integer
-    uint64_t uint;
-    if (!parse_c_num_lit_unsigned(value, &uint)) {
-        AstNode *var_node = create_var_decl_node(c, buf_ptr(name), create_num_lit_unsigned(c, uint));
-        c->macro_table.put(name, var_node);
+    if (ctok->error) {
         return;
     }
 
-    // maybe it's a symbol
-    if (is_simple_symbol(value)) {
-        // if it equals itself, ignore. for example, from stdio.h:
-        // #define stdin stdin
-        if (buf_eql_buf(name, value)) {
-            return;
+    bool negate = false;
+    for (int i = 0; i < ctok->tokens.length; i += 1) {
+        bool is_first = (i == 0);
+        bool is_last = (i == ctok->tokens.length - 1);
+        CTok *tok = &ctok->tokens.at(i);
+        switch (tok->id) {
+            case CTokIdCharLit:
+                if (is_last && is_first) {
+                    AstNode *var_node = create_var_decl_node(c, buf_ptr(name),
+                            create_char_lit_node(c, tok->data.char_lit));
+                    c->macro_table.put(name, var_node);
+                }
+                return;
+            case CTokIdStrLit:
+                if (is_last && is_first) {
+                    AstNode *var_node = create_var_decl_node(c, buf_ptr(name),
+                            create_str_lit_node(c, &tok->data.str_lit));
+                    c->macro_table.put(name, var_node);
+                }
+                return;
+            case CTokIdNumLitInt:
+                if (is_last) {
+                    AstNode *var_node = create_var_decl_node(c, buf_ptr(name),
+                            create_num_lit_unsigned_negative(c, tok->data.num_lit_int, negate));
+                    c->macro_table.put(name, var_node);
+                }
+                return;
+            case CTokIdNumLitFloat:
+                if (is_last) {
+                    AstNode *var_node = create_var_decl_node(c, buf_ptr(name),
+                            create_num_lit_float_negative(c, tok->data.num_lit_float, negate));
+                    c->macro_table.put(name, var_node);
+                }
+                return;
+            case CTokIdSymbol:
+                if (is_last && is_first) {
+                    // if it equals itself, ignore. for example, from stdio.h:
+                    // #define stdin stdin
+                    Buf *symbol_name = buf_create_from_buf(&tok->data.symbol);
+                    if (buf_eql_buf(name, symbol_name)) {
+                        return;
+                    }
+                    c->macro_symbols.append({name, symbol_name});
+                    return;
+                }
+                // A symbol followed by more tokens is not understood. Return
+                // here rather than falling into the minus case, which would
+                // treat a leading symbol as a negation sign.
+                return;
+            case CTokIdMinus:
+                if (is_first) {
+                    negate = true;
+                    break;
+                } else {
+                    return;
+                }
         }
-        c->macro_symbols.append({name, value});
     }
 }
 
@@ -1473,6 +1354,8 @@ static void process_symbol_macros(Context *c) {
 }
 
 static void process_preprocessor_entities(Context *c, ASTUnit &unit) {
+    CTokenize ctok = {{0}};
+
     for (PreprocessedEntity *entity : unit.getLocalPreprocessingEntities()) {
         switch (entity->getKind()) {
             case PreprocessedEntity::InvalidKind:
@@ -1494,16 +1377,7 @@ static void process_preprocessor_entities(Context *c, ASTUnit &unit) {
                     }
 
                     const char *end_c = c->source_manager->getCharacterData(end_loc);
-                    Buf *value = buf_alloc();
-                    while (*end_c && *end_c != '\n') {
-                        buf_append_char(value, *end_c);
-                        if (end_c[0] == '\\' && end_c[1] == '\n') {
-                            end_c += 2;
-                        } else {
-                            end_c += 1;
-                        }
-                    }
-                    process_macro(c, buf_create_from_str(name), value);
+                    process_macro(c, &ctok, buf_create_from_str(name), end_c);
                 }
         }
     }
test/run_tests.cpp
@@ -1390,6 +1390,10 @@ extern void (*fn_ptr)(void);
     add_parseh_case("__cdecl doesn't mess up function pointers", R"SOURCE(
 void foo(void (__cdecl *fn_ptr)(void));
     )SOURCE", 1, "pub extern fn foo(fn_ptr: ?extern fn());");
+
+    add_parseh_case("comment after integer literal", R"SOURCE(
+#define SDL_INIT_VIDEO 0x00000020  /**< SDL_INIT_VIDEO implies SDL_INIT_EVENTS */
+    )SOURCE", 1, "pub const SDL_INIT_VIDEO = 32;");
 }
 
 static void run_self_hosted_test(void) {
CMakeLists.txt
@@ -40,6 +40,7 @@ set(ZIG_SOURCES
     "${CMAKE_SOURCE_DIR}/src/ast_render.cpp"
     "${CMAKE_SOURCE_DIR}/src/bignum.cpp"
     "${CMAKE_SOURCE_DIR}/src/tokenizer.cpp"
+    "${CMAKE_SOURCE_DIR}/src/c_tokenizer.cpp"
     "${CMAKE_SOURCE_DIR}/src/parser.cpp"
     "${CMAKE_SOURCE_DIR}/src/eval.cpp"
     "${CMAKE_SOURCE_DIR}/src/analyze.cpp"