Commit 1b24f4c73c

Andrew Kelley <superjoe30@gmail.com>
2015-11-24 05:30:12
parsing hello.zig example with recursive descent
that was easy
1 parent 6b911f1
src/grammar.txt
@@ -1,57 +0,0 @@
-Root<node> : many(FnDecl) token(EOF) {
-    $$ = ast_create_root($1);
-};
-
-FnDecl<node> : token(Fn) token(Symbol) token(LParen) list(ParamDecl, token(Comma)) token(RParen) token(Arrow) Type Block {
-    $$ = ast_create_fn_decl($2, $4, $7, $8);
-} | token(Fn) token(Symbol) token(LParen) list(ParamDecl, token(Comma)) token(RParen) Block {
-    $$ = ast_create_void_fn_decl($2, $4, $6);
-};
-
-ParamDecl<node> : token(Symbol) token(Colon) Type {
-    $$ = ast_create_param_decl($1, $2);
-};
-
-Type<node> : token(Symbol) {
-    $$ = ast_create_symbol_type($1);
-} | PointerType {
-    $$ = $1;
-};
-
-PointerType<node> : token(Star) token(Const) Type {
-    $$ = ast_create_pointer_type($2, $3);
-} | token(Star) token(Mut) Type {
-    $$ = ast_create_pointer_type($2, $3);
-};
-
-Block<node> : token(LBrace) many(Statement) Expression token(RBrace) {
-    $$ = ast_create_expr_block($2, $3);
-} | token(LBrace) many(Statement) token(RBrace) {
-    $$ = ast_create_block($2);
-};
-
-Statement<node> : ExpressionStatement {
-    $$ = $1;
-} | ReturnStatement {
-    $$ = $1;
-};
-
-ExpressionStatement<node> : Expression token(Semicolon) {
-    $$ = ast_create_expression_statement($1);
-};
-
-ReturnStatement<node> : token(Return) Expression token(Semicolon) {
-    $$ = ast_create_return_statement($2);
-};
-
-Expression<node> : token(Number) {
-    $$ = ast_create_number($1);
-} | token(String) {
-    $$ = ast_create_string($1);
-} | FnCall {
-    $$ = $1;
-};
-
-FnCall<node> : token(Symbol) token(LParen) list(Expression, token(Comma)) token(RParen) {
-    $$ = ast_create_fn_call($1, $3);
-};
src/main.cpp
@@ -89,6 +89,8 @@ static int build(const char *arg0, const char *in_file, const char *out_file, Zi
 
     AstNode *root = ast_parse(in_data, tokens);
     assert(root);
+    fprintf(stderr, "\nAST:\n");
+    fprintf(stderr, "------\n");
     ast_print(root, 0);
 
 
@@ -135,7 +137,7 @@ int main(int argc, char **argv) {
         } else {
             switch (cmd) {
                 case CmdNone:
-                    zig_panic("unreachable");
+                    zig_unreachable();
                 case CmdBuild:
                     if (!in_file) {
                         in_file = arg;
@@ -154,6 +156,6 @@ int main(int argc, char **argv) {
             return build(arg0, in_file, out_file, &include_paths);
     }
 
-    zig_panic("unreachable");
+    zig_unreachable();
 }
 
src/parser.cpp
@@ -41,7 +41,7 @@ const char *node_type_str(NodeType node_type) {
         case NodeTypeFnCall:
             return "FnCall";
     }
-    zig_panic("unreachable");
+    zig_unreachable();
 }
 
 void ast_print(AstNode *node, int indent) {
@@ -82,28 +82,321 @@ void ast_print(AstNode *node, int indent) {
 struct ParseContext {
     Buf *buf;
     AstNode *root;
+    ZigList<Token> *tokens;
 };
 
-AstNode *ast_create_root(void) {
-    zig_panic("TODO create root");
+static AstNode *ast_create_node(NodeType type) {
+    AstNode *node = allocate<AstNode>(1);
+    node->type = type;
+    return node;
 }
 
-void ast_invalid_token_error(Buf *buf, Token *token) {
+static void ast_buf_from_token(ParseContext *pc, Token *token, Buf *buf) {
+    buf_init_from_mem(buf, buf_ptr(pc->buf) + token->start_pos, token->end_pos - token->start_pos);
+}
+
+static void ast_invalid_token_error(ParseContext *pc, Token *token) {
     Buf token_value = {0};
-    buf_init_from_mem(&token_value, buf_ptr(buf) + token->start_pos, token->end_pos - token->start_pos);
+    ast_buf_from_token(pc, token, &token_value);
     ast_error(token, "invalid token: '%s'", buf_ptr(&token_value));
 }
 
-void ast_parse_fn_decls(ParseContext *pc, ZigList<AstNode *> *fn_decls) {
-    zig_panic("TODO parse fn decls");
+static AstNode *ast_parse_expression(ParseContext *pc, int token_index, int *new_token_index);
+
+
+static void ast_expect_token(ParseContext *pc, Token *token, TokenId token_id) {
+    if (token->id != token_id) {
+        ast_invalid_token_error(pc, token);
+    }
+}
+
+/*
+Type : token(Symbol) | PointerType;
+PointerType : token(Star) token(Const) Type  | token(Star) token(Mut) Type;
+*/
+static AstNode *ast_parse_type(ParseContext *pc, int token_index, int *new_token_index) {
+    AstNode *node = ast_create_node(NodeTypeType);
+
+    Token *token = &pc->tokens->at(token_index);
+    token_index += 1;
+
+    if (token->id == TokenIdSymbol) {
+        node->data.type.type = AstNodeTypeTypePrimitive;
+        ast_buf_from_token(pc, token, &node->data.type.primitive_name);
+    } else if (token->id == TokenIdStar) {
+        Token *const_or_mut = &pc->tokens->at(token_index);
+        token_index += 1;
+        if (const_or_mut->id == TokenIdKeywordMut) {
+            node->data.type.is_const = false;
+        } else if (const_or_mut->id == TokenIdKeywordConst) {
+            node->data.type.is_const = true;
+        } else {
+            ast_invalid_token_error(pc, const_or_mut);
+        }
+
+        node->data.type.child_type = ast_parse_type(pc, token_index, &token_index);
+    } else {
+        ast_invalid_token_error(pc, token);
+    }
+
+    *new_token_index = token_index;
+    return node;
+}
+
+/*
+ParamDecl<node> : token(Symbol) token(Colon) Type {
+};
+*/
+static AstNode *ast_parse_param_decl(ParseContext *pc, int token_index, int *new_token_index) {
+    AstNode *node = ast_create_node(NodeTypeParamDecl);
+
+    Token *param_name = &pc->tokens->at(token_index);
+    token_index += 1;
+    ast_expect_token(pc, param_name, TokenIdSymbol);
+
+    ast_buf_from_token(pc, param_name, &node->data.param_decl.name);
+
+    Token *colon = &pc->tokens->at(token_index);
+    token_index += 1;
+    ast_expect_token(pc, colon, TokenIdColon);
+
+    node->data.param_decl.type = ast_parse_type(pc, token_index, &token_index);
+
+    *new_token_index = token_index;
+    return node;
+}
+
+
+static void ast_parse_param_decl_list(ParseContext *pc, int token_index, int *new_token_index,
+        ZigList<AstNode *> *params)
+{
+    Token *l_paren = &pc->tokens->at(token_index);
+    token_index += 1;
+    ast_expect_token(pc, l_paren, TokenIdLParen);
+
+    Token *token = &pc->tokens->at(token_index);
+    if (token->id == TokenIdRParen) {
+        token_index += 1;
+        *new_token_index = token_index;
+        return;
+    }
+
+    for (;;) {
+        AstNode *param_decl_node = ast_parse_param_decl(pc, token_index, &token_index);
+        params->append(param_decl_node);
+
+        Token *token = &pc->tokens->at(token_index);
+        token_index += 1;
+        if (token->id == TokenIdRParen) {
+            *new_token_index = token_index;
+            return;
+        } else {
+            ast_expect_token(pc, token, TokenIdComma);
+        }
+    }
+    zig_unreachable();
+}
+
+static void ast_parse_fn_call_param_list(ParseContext *pc, int token_index, int *new_token_index,
+        ZigList<AstNode*> *params)
+{
+    Token *l_paren = &pc->tokens->at(token_index);
+    token_index += 1;
+    ast_expect_token(pc, l_paren, TokenIdLParen);
+
+    Token *token = &pc->tokens->at(token_index);
+    if (token->id == TokenIdRParen) {
+        token_index += 1;
+        *new_token_index = token_index;
+        return;
+    }
+
+    for (;;) {
+        AstNode *expr = ast_parse_expression(pc, token_index, &token_index);
+        params->append(expr);
+
+        Token *token = &pc->tokens->at(token_index);
+        token_index += 1;
+        if (token->id == TokenIdRParen) {
+            *new_token_index = token_index;
+            return;
+        } else {
+            ast_expect_token(pc, token, TokenIdComma);
+        }
+    }
+    zig_unreachable();
+}
+
+/*
+FnCall : token(Symbol) token(LParen) list(Expression, token(Comma)) token(RParen) ;
+*/
+static AstNode *ast_parse_fn_call(ParseContext *pc, int token_index, int *new_token_index) {
+    AstNode *node = ast_create_node(NodeTypeFnCall);
+
+    Token *fn_name = &pc->tokens->at(token_index);
+    token_index += 1;
+    ast_expect_token(pc, fn_name, TokenIdSymbol);
+
+    ast_buf_from_token(pc, fn_name, &node->data.fn_call.name);
+
+    ast_parse_fn_call_param_list(pc, token_index, &token_index, &node->data.fn_call.params);
+
+    *new_token_index = token_index;
+    return node;
+}
+
+static AstNode *ast_parse_expression(ParseContext *pc, int token_index, int *new_token_index) {
+    AstNode *node = ast_create_node(NodeTypeExpression);
+
+    Token *token = &pc->tokens->at(token_index);
+    if (token->id == TokenIdSymbol) {
+        node->data.expression.type = AstNodeExpressionTypeFnCall;
+        node->data.expression.data.fn_call = ast_parse_fn_call(pc, token_index, &token_index);
+    } else if (token->id == TokenIdNumberLiteral) {
+        node->data.expression.type = AstNodeExpressionTypeNumber;
+        ast_buf_from_token(pc, token, &node->data.expression.data.number);
+        token_index += 1;
+    } else if (token->id == TokenIdStringLiteral) {
+        node->data.expression.type = AstNodeExpressionTypeString;
+        ast_buf_from_token(pc, token, &node->data.expression.data.string);
+        token_index += 1;
+    } else {
+        ast_invalid_token_error(pc, token);
+    }
+
+    *new_token_index = token_index;
+    return node;
+}
+
+/*
+Statement : ExpressionStatement  | ReturnStatement ;
+
+ExpressionStatement : Expression token(Semicolon) ;
+
+ReturnStatement : token(Return) Expression token(Semicolon) ;
+
+Expression : token(Number)  | token(String)  | FnCall ;
+
+FnCall : token(Symbol) token(LParen) list(Expression, token(Comma)) token(RParen) ;
+*/
+static AstNode *ast_parse_statement(ParseContext *pc, int token_index, int *new_token_index) {
+    AstNode *node = ast_create_node(NodeTypeStatement);
+
+    Token *token = &pc->tokens->at(token_index);
+    if (token->id == TokenIdKeywordReturn) {
+        token_index += 1;
+        node->data.statement.type = AstNodeStatementTypeReturn;
+        node->data.statement.data.retrn.expression = ast_parse_expression(pc, token_index, &token_index);
+
+        Token *semicolon = &pc->tokens->at(token_index);
+        token_index += 1;
+        ast_expect_token(pc, semicolon, TokenIdSemicolon);
+    } else if (token->id == TokenIdSymbol ||
+               token->id == TokenIdStringLiteral ||
+               token->id == TokenIdNumberLiteral)
+    {
+        node->data.statement.type = AstNodeStatementTypeExpression;
+        node->data.statement.data.expr.expression = ast_parse_expression(pc, token_index, &token_index);
+
+        Token *semicolon = &pc->tokens->at(token_index);
+        token_index += 1;
+        ast_expect_token(pc, semicolon, TokenIdSemicolon);
+    } else {
+        ast_invalid_token_error(pc, token);
+    }
+
+    *new_token_index = token_index;
+    return node;
+}
+
+/*
+Block : token(LBrace) many(Statement) token(RBrace);
+*/
+static AstNode *ast_parse_block(ParseContext *pc, int token_index, int *new_token_index) {
+    AstNode *node = ast_create_node(NodeTypeBlock);
+
+    Token *l_brace = &pc->tokens->at(token_index);
+    token_index += 1;
+    ast_expect_token(pc, l_brace, TokenIdLBrace);
+
+    for (;;) {
+        Token *token = &pc->tokens->at(token_index);
+        if (token->id == TokenIdRBrace) {
+            token_index += 1;
+            *new_token_index = token_index;
+            return node;
+        } else {
+            AstNode *statement_node = ast_parse_statement(pc, token_index, &token_index);
+            node->data.block.statements.append(statement_node);
+        }
+    }
+    zig_unreachable();
+}
+
+/*
+FnDecl : token(Fn) token(Symbol) ParamDeclList option(token(Arrow) Type) Block;
+*/
+static AstNode *ast_parse_fn_decl(ParseContext *pc, int token_index, int *new_token_index) {
+    AstNode *node = ast_create_node(NodeTypeFnDecl);
+
+    Token *fn_token = &pc->tokens->at(token_index);
+    token_index += 1;
+    ast_expect_token(pc, fn_token, TokenIdKeywordFn);
+
+    Token *fn_name = &pc->tokens->at(token_index);
+    token_index += 1;
+    ast_expect_token(pc, fn_name, TokenIdSymbol);
+
+    ast_buf_from_token(pc, fn_name, &node->data.fn_decl.name);
+
+
+    ast_parse_param_decl_list(pc, token_index, &token_index, &node->data.fn_decl.params);
+
+    Token *arrow = &pc->tokens->at(token_index);
+    token_index += 1;
+    if (arrow->id == TokenIdArrow) {
+        node->data.fn_decl.return_type = ast_parse_type(pc, token_index, &token_index);
+    } else if (arrow->id == TokenIdLBrace) {
+        node->data.fn_decl.return_type = nullptr;
+    } else {
+        ast_invalid_token_error(pc, arrow);
+    }
+
+    node->data.fn_decl.body = ast_parse_block(pc, token_index, &token_index);
+
+    *new_token_index = token_index;
+    return node;
+}
+
+
+static void ast_parse_fn_decl_list(ParseContext *pc, int token_index, ZigList<AstNode *> *fn_decls,
+        int *new_token_index)
+{
+    for (;;) {
+        Token *token = &pc->tokens->at(token_index);
+        if (token->id == TokenIdKeywordFn) {
+            AstNode *fn_decl_node = ast_parse_fn_decl(pc, token_index, &token_index);
+            fn_decls->append(fn_decl_node);
+        } else {
+            *new_token_index = token_index;
+            return;
+        }
+    }
+    zig_unreachable();
 }
 
 AstNode *ast_parse(Buf *buf, ZigList<Token> *tokens) {
     ParseContext pc = {0};
     pc.buf = buf;
-    pc.root = ast_create_root();
+    pc.root = ast_create_node(NodeTypeRoot);
+    pc.tokens = tokens;
 
-    ast_parse_fn_decls(&pc, &pc.root->data.root.fn_decls);
+    int new_token_index;
+    ast_parse_fn_decl_list(&pc, 0, &pc.root->data.root.fn_decls, &new_token_index);
+
+    if (new_token_index != tokens->length - 1) {
+        ast_invalid_token_error(&pc, &tokens->at(new_token_index));
+    }
 
     return pc.root;
 }
src/parser.hpp
@@ -44,20 +44,49 @@ enum AstNodeTypeType {
 
 struct AstNodeType {
     AstNodeTypeType type;
-    AstNode *child;
+    Buf primitive_name;
+    AstNode *child_type;
+    bool is_const;
 };
 
-struct AstNodePointerType {
-    AstNode *const_or_mut;
-    AstNode *type;
+struct AstNodeBlock {
+    ZigList<AstNode *> statements;
 };
 
-struct AstNodeBlock {
-    ZigList<AstNode *> expressions;
+enum AstNodeStatementType {
+    AstNodeStatementTypeExpression,
+    AstNodeStatementTypeReturn,
+};
+
+struct AstNodeStatementExpression {
+    AstNode *expression;
+};
+
+struct AstNodeStatementReturn {
+    AstNode *expression;
+};
+
+struct AstNodeStatement {
+    AstNodeStatementType type;
+    union {
+        AstNodeStatementExpression expr;
+        AstNodeStatementReturn retrn;
+    } data;
+};
+
+enum AstNodeExpressionType {
+    AstNodeExpressionTypeNumber,
+    AstNodeExpressionTypeString,
+    AstNodeExpressionTypeFnCall,
 };
 
 struct AstNodeExpression {
-    AstNode *child;
+    AstNodeExpressionType type;
+    union {
+        Buf number;
+        Buf string;
+        AstNode *fn_call;
+    } data;
 };
 
 struct AstNodeFnCall {
@@ -74,13 +103,14 @@ struct AstNode {
         AstNodeType type;
         AstNodeParamDecl param_decl;
         AstNodeBlock block;
+        AstNodeStatement statement;
         AstNodeExpression expression;
         AstNodeFnCall fn_call;
     } data;
 };
 
 __attribute__ ((format (printf, 2, 3)))
-void ast_error(Token *token, const char *format, ...);
+void ast_token_error(Token *token, const char *format, ...);
 void ast_invalid_token_error(Buf *buf, Token *token);
 
 
@@ -91,6 +121,4 @@ const char *node_type_str(NodeType node_type);
 
 void ast_print(AstNode *node, int indent);
 
-AstNode *ast_create_root(void);
-
 #endif
src/tokenizer.hpp
@@ -10,7 +10,6 @@
 
 #include "buffer.hpp"
 
-/*
 enum TokenId {
     TokenIdEof,
     TokenIdSymbol,
@@ -32,30 +31,6 @@ enum TokenId {
     TokenIdArrow,
     TokenIdDash,
 };
-*/
-
-// TODO: debug delete this 
-enum TokenId {
-    TokenIdLParen = 0,
-    TokenIdRParen = 1,
-    TokenIdEof = 2,
-    TokenIdStar = 3,
-    TokenIdPlus = 4,
-    TokenIdSymbol,
-    TokenIdKeywordFn,
-    TokenIdKeywordReturn,
-    TokenIdKeywordMut,
-    TokenIdKeywordConst,
-    TokenIdComma,
-    TokenIdLBrace,
-    TokenIdRBrace,
-    TokenIdStringLiteral,
-    TokenIdSemicolon,
-    TokenIdNumberLiteral,
-    TokenIdColon,
-    TokenIdArrow,
-    TokenIdDash,
-};
 
 struct Token {
     TokenId id;
src/util.hpp
@@ -20,6 +20,12 @@ void zig_panic(const char *format, ...)
     __attribute__ ((noreturn))
     __attribute__ ((format (printf, 1, 2)));
 
+__attribute__((cold))
+__attribute__ ((noreturn))
+static inline void zig_unreachable(void) {
+    zig_panic("unreachable");
+}
+
 template<typename T>
 __attribute__((malloc)) static inline T *allocate_nonzero(size_t count) {
     T *ptr = reinterpret_cast<T*>(malloc(count * sizeof(T)));
README.md
@@ -69,3 +69,31 @@ zig    | C equivalent | Description
   f128 |  long double | 128-bit IEE754 floating point
  isize |      ssize_t |   signed pointer sized integer
  usize |       size_t | unsigned pointer sized integer
+
+### Grammar
+
+```
+Root : many(FnDecl) token(EOF);
+
+FnDecl : token(Fn) token(Symbol) ParamDeclList option(token(Arrow) Type) Block;
+
+ParamDeclList : token(LParen) list(ParamDecl, token(Comma)) token(RParen);
+
+ParamDecl : token(Symbol) token(Colon) Type;
+
+Type : token(Symbol) | PointerType;
+
+PointerType : token(Star) token(Const) Type  | token(Star) token(Mut) Type;
+
+Block : token(LBrace) many(Statement) token(RBrace);
+
+Statement : ExpressionStatement  | ReturnStatement ;
+
+ExpressionStatement : Expression token(Semicolon) ;
+
+ReturnStatement : token(Return) Expression token(Semicolon) ;
+
+Expression : token(Number)  | token(String)  | FnCall ;
+
+FnCall : token(Symbol) token(LParen) list(Expression, token(Comma)) token(RParen) ;
+```