Commit ae0e9685c6
Changed files (3)
src/grammar.txt
@@ -1,46 +1,50 @@
-Root : many(FnDecl) token(EOF) {
+Root<node> : many(FnDecl) token(EOF) {
$$ = ast_create_root($1);
};
-FnDecl : token(Fn) token(Symbol) token(LParen) list(ParamDecl, token(Comma)) token(RParen) option(token(Arrow) Type) Block {
+FnDecl<node> : token(Fn) token(Symbol) token(LParen) list(ParamDecl, token(Comma)) token(RParen) option(ReturnType) Block {
$$ = ast_create_fn_decl($2, $4, $6, $7);
};
-ParamDecl : token(Symbol) token(Colon) Type {
+ReturnType<node> : token(Arrow) Type {
+ $$ = $2;
+};
+
+ParamDecl<node> : token(Symbol) token(Colon) Type {
$$ = ast_create_param_decl($1, $2);
};
-Type : token(Symbol) {
+Type<node> : token(Symbol) {
$$ = ast_create_symbol_type($1);
} | PointerType {
$$ = $1;
};
-PointerType : token(Star) token(Const) Type {
+PointerType<node> : token(Star) token(Const) Type {
$$ = ast_create_pointer_type($2, $3);
} | token(Star) token(Mut) Type {
$$ = ast_create_pointer_type($2, $3);
};
-Block : token(LBrace) many(Statement) option(Expression) token(RBrace) {
+Block<node> : token(LBrace) many(Statement) option(Expression) token(RBrace) {
$$ = ast_create_block($2, $3);
};
-Statement : ExpressionStatement {
+Statement<node> : ExpressionStatement {
$$ = $1;
} | ReturnStatement {
$$ = $1;
};
-ExpressionStatement : Expression token(Semicolon) {
+ExpressionStatement<node> : Expression token(Semicolon) {
$$ = ast_create_expression_statement($1);
};
-ReturnStatement : token(Return) Expression token(Semicolon) {
+ReturnStatement<node> : token(Return) Expression token(Semicolon) {
$$ = ast_create_return_statement($2);
};
-Expression : token(Number) {
+Expression<node> : token(Number) {
$$ = ast_create_number($1);
} | token(String) {
$$ = ast_create_string($1);
@@ -48,6 +52,6 @@ Expression : token(Number) {
$$ = $1;
};
-FnCall : token(Symbol) token(LParen) list(Expression, token(Comma)) token(RParen) {
+FnCall<node> : token(Symbol) token(LParen) list(Expression, token(Comma)) token(RParen) {
$$ = ast_create_fn_call($1, $3);
};
src/parsergen.cpp
@@ -136,6 +136,7 @@ struct RuleTuple {
Buf name;
ZigList<RuleNode *> children;
Buf body;
+ Buf union_field_name;
};
struct RuleMany {
@@ -161,6 +162,9 @@ struct RuleList {
struct RuleSubRule {
RuleNode *child;
+
+ // for lexer use only
+ Buf name;
};
enum RuleNodeType {
@@ -175,6 +179,8 @@ enum RuleNodeType {
struct RuleNode {
RuleNodeType type;
+ int lex_line;
+ int lex_column;
union {
RuleTuple tuple;
RuleMany many;
@@ -205,6 +211,7 @@ struct CodeGenCapture {
Buf *body;
bool is_root;
Buf *field_names;
+ Buf *union_field_name;
};
struct CodeGen {
@@ -225,6 +232,8 @@ struct ParserState {
enum LexState {
LexStateStart,
LexStateRuleName,
+ LexStateRuleFieldNameStart,
+ LexStateRuleFieldName,
LexStateWaitForColon,
LexStateTupleRule,
LexStateFnName,
@@ -232,6 +241,7 @@ enum LexState {
LexStateToken,
LexStateBody,
LexStateEndOrOr,
+ LexStateSubTupleName,
};
struct LexStack {
@@ -258,6 +268,8 @@ struct Gen {
int lex_token_name_begin;
int lex_body_begin;
int lex_body_end;
+ int lex_sub_tuple_begin;
+ int lex_field_name_begin;
};
static ParserState *create_state(Gen *g) {
@@ -303,12 +315,13 @@ static void state_add_push_node(ParserState *state) {
state_add_code(state, code);
}
-static CodeGen *codegen_create_capture(Buf *body, bool is_root, int field_name_count) {
+static CodeGen *codegen_create_capture(Buf *body, bool is_root, int field_name_count, Buf *union_field_name) {
CodeGen *code = allocate<CodeGen>(1);
code->type = CodeGenTypeCapture;
code->capture.body = body;
code->capture.is_root = is_root;
code->capture.field_names = allocate<Buf>(field_name_count);
+ code->capture.union_field_name = union_field_name;
return code;
}
@@ -325,6 +338,7 @@ static void state_add_eat_token(ParserState *state) {
}
static void gen(Gen *g, RuleNode *node, Buf *out_field_name) {
+ assert(node);
switch (node->type) {
case RuleNodeTypeToken:
{
@@ -346,13 +360,14 @@ static void gen(Gen *g, RuleNode *node, Buf *out_field_name) {
break;
case RuleNodeTypeTuple:
{
- buf_init_from_str(out_field_name, "node");
+ buf_init_from_buf(out_field_name, &node->tuple.union_field_name);
state_add_push_node(g->cur_state);
bool is_root = (node == g->root);
int field_name_count = node->tuple.children.length;
- CodeGen *code = codegen_create_capture(&node->tuple.body, is_root, field_name_count);
+ CodeGen *code = codegen_create_capture(&node->tuple.body, is_root, field_name_count,
+ &node->tuple.union_field_name);
for (int i = 0; i < node->tuple.children.length; i += 1) {
RuleNode *child = node->tuple.children.at(i);
@@ -411,7 +426,7 @@ static void lex_error(Gen *g, const char *format, ...) {
va_list ap;
va_start(ap, format);
- fprintf(stderr, "Error: Line %d, column %d: ", line, column);
+ fprintf(stderr, "Grammar Error: Line %d, column %d: ", line, column);
vfprintf(stderr, format, ap);
fprintf(stderr, "\n");
va_end(ap);
@@ -428,10 +443,16 @@ static void lex_pop_stack(Gen *g) {
g->lex_stack.pop();
}
+static RuleNode *create_rule_node(Gen *g) {
+ RuleNode *node = allocate<RuleNode>(1);
+ node->lex_line = g->lex_line;
+ node->lex_column = g->lex_column;
+ return node;
+}
static void begin_rule(Gen *g) {
assert(!g->lex_cur_rule);
- g->lex_cur_rule = allocate<RuleNode>(1);
+ g->lex_cur_rule = create_rule_node(g);
g->lex_cur_rule->type = RuleNodeTypeTuple;
g->lex_cur_rule_begin = g->lex_pos;
@@ -452,6 +473,18 @@ static void end_rule_name(Gen *g) {
buf_init_from_mem(&g->lex_cur_rule->tuple.name, ptr, len);
}
+static void begin_rule_field_name(Gen *g) {
+ assert(g->lex_cur_rule);
+ g->lex_field_name_begin = g->lex_pos;
+}
+
+static void end_rule_field_name(Gen *g) {
+ assert(g->lex_cur_rule);
+ char *ptr = &buf_ptr(g->in_buf)[g->lex_field_name_begin];
+ int len = g->lex_pos - g->lex_field_name_begin;
+ buf_init_from_mem(&g->lex_cur_rule->tuple.union_field_name, ptr, len);
+}
+
static void begin_fn_name(Gen *g) {
g->lex_fn_name_begin = g->lex_pos;
lex_push_stack(g);
@@ -478,15 +511,13 @@ static void end_token_name(Gen *g) {
buf_init_from_mem(&token_name, ptr, len);
Token *token = find_or_create_token(g, &token_name);
- RuleNode *node = allocate<RuleNode>(1);
+ RuleNode *node = create_rule_node(g);
node->type = RuleNodeTypeToken;
node->token.token = token;
assert(g->lex_cur_rule->type == RuleNodeTypeTuple);
g->lex_cur_rule->tuple.children.append(node);
- g->biggest_tuple_len = max(g->biggest_tuple_len, g->lex_cur_rule->tuple.children.length);
-
lex_pop_stack(g);
}
@@ -504,6 +535,36 @@ static void end_tuple_body(Gen *g) {
buf_init_from_mem(&g->lex_cur_rule->tuple.body, ptr, len);
}
+static void begin_sub_tuple(Gen *g) {
+ g->lex_sub_tuple_begin = g->lex_pos;
+ lex_push_stack(g);
+}
+
+static void end_sub_tuple(Gen *g) {
+ assert(g->lex_cur_rule->type == RuleNodeTypeTuple);
+ char *ptr = &buf_ptr(g->in_buf)[g->lex_sub_tuple_begin];
+ int len = g->lex_pos - g->lex_sub_tuple_begin;
+
+ RuleNode *node = create_rule_node(g);
+ node->type = RuleNodeTypeSubRule;
+ buf_init_from_mem(&node->sub_rule.name, ptr, len);
+
+ g->lex_cur_rule->tuple.children.append(node);
+
+ lex_pop_stack(g);
+}
+
+static RuleNode *find_rule_node(Gen *g, Buf *name) {
+ for (int i = 0; i < g->rules.length; i += 1) {
+ RuleNode *node = g->rules.at(i);
+ assert(node->type == RuleNodeTypeTuple);
+ if (buf_eql_buf(&node->tuple.name, name)) {
+ return node;
+ }
+ }
+ return nullptr;
+}
+
static void initialize_rules(Gen *g) {
g->lex_state = LexStateStart;
for (g->lex_pos = 0; g->lex_pos < buf_len(g->in_buf); g->lex_pos += 1) {
@@ -524,18 +585,36 @@ static void initialize_rules(Gen *g) {
break;
case LexStateRuleName:
switch (c) {
- case WHITESPACE:
+ case '<':
end_rule_name(g);
- g->lex_state = LexStateWaitForColon;
+ g->lex_state = LexStateRuleFieldNameStart;
break;
- case ':':
- end_rule_name(g);
- g->lex_state = LexStateTupleRule;
+ case SYMBOL_CHAR:
+ // ok
break;
+ default:
+ lex_error(g, "expected '<', not '%c'", c);
+ }
+ break;
+ case LexStateRuleFieldNameStart:
+ switch (c) {
case SYMBOL_CHAR:
+ begin_rule_field_name(g);
+ g->lex_state = LexStateRuleFieldName;
break;
default:
- lex_error(g, "invalid char: '%c'", c);
+ lex_error(g, "expected field name, not '%c'", c);
+ }
+ break;
+ case LexStateRuleFieldName:
+ switch (c) {
+ case SYMBOL_CHAR:
+ // ok
+ break;
+ case '>':
+ end_rule_field_name(g);
+ g->lex_state = LexStateWaitForColon;
+ break;
}
break;
case LexStateWaitForColon:
@@ -559,12 +638,16 @@ static void initialize_rules(Gen *g) {
begin_fn_name(g);
g->lex_state = LexStateFnName;
break;
+ case UPPER_ALPHA:
+ begin_sub_tuple(g);
+ g->lex_state = LexStateSubTupleName;
+ break;
case '{':
begin_tuple_body(g);
g->lex_state = LexStateBody;
break;
default:
- lex_error(g, "invalid char: '%c'", c);
+ lex_error(g, "expected rule, not '%c'", c);
}
break;
case LexStateFnName:
@@ -589,7 +672,7 @@ static void initialize_rules(Gen *g) {
g->lex_state = LexStateToken;
break;
default:
- lex_error(g, "invalid char '%c'", c);
+ lex_error(g, "expected token name, not '%c'", c);
}
break;
case LexStateToken:
@@ -601,7 +684,7 @@ static void initialize_rules(Gen *g) {
end_token_name(g);
break;
default:
- lex_error(g, "invalid char '%c'", c);
+ lex_error(g, "expected token name or ')', not '%c'", c);
}
break;
case LexStateBody:
@@ -627,6 +710,20 @@ static void initialize_rules(Gen *g) {
default:
lex_error(g, "expected ';' or '|'");
}
+ break;
+ case LexStateSubTupleName:
+ switch (c) {
+ case ALPHA:
+ // ignore
+ break;
+ case WHITESPACE:
+ end_sub_tuple(g);
+ assert(g->lex_state == LexStateTupleRule);
+ break;
+ default:
+ lex_error(g, "expected rule name, not '%c'", c);
+ }
+ break;
}
if (c == '\n') {
g->lex_line += 1;
@@ -647,9 +744,40 @@ static void initialize_rules(Gen *g) {
case LexStateTokenStart:
case LexStateToken:
case LexStateBody:
+ case LexStateSubTupleName:
+ case LexStateRuleFieldNameStart:
+ case LexStateRuleFieldName:
lex_error(g, "unexpected EOF");
break;
}
+
+ // Resolve child references into pointers
+ for (int tuple_i = 0; tuple_i < g->rules.length; tuple_i += 1) {
+ RuleNode *node = g->rules.at(tuple_i);
+ assert(node->type == RuleNodeTypeTuple);
+
+ for (int child_i = 0; child_i < node->tuple.children.length; child_i += 1) {
+ RuleNode *child = node->tuple.children.at(child_i);
+ if (child->type == RuleNodeTypeSubRule) {
+ int line = child->lex_line + 1;
+ int column = child->lex_column + 1;
+ RuleNode *referenced_node = find_rule_node(g, &child->sub_rule.name);
+ if (!referenced_node) {
+ fprintf(stderr, "Grammar Error: Line %d, column %d: Rule not defined: '%s'\n",
+ line, column, buf_ptr(&child->sub_rule.name));
+ }
+ child->sub_rule.child = referenced_node;
+ }
+ }
+ }
+
+
+ // calculate the biggest tuple len
+ for (int i = 0; i < g->rules.length; i += 1) {
+ RuleNode *node = g->rules.at(i);
+ assert(node->type == RuleNodeTypeTuple);
+ g->biggest_tuple_len = max(g->biggest_tuple_len, node->tuple.children.length);
+ }
}
enum TemplateState {
@@ -828,6 +956,8 @@ int main(int argc, char **argv) {
fprintf(out_f, " state = transition[%d][token->id];\n", state->index);
break;
case CodeGenTypeError:
+ fprintf(out_f, " token_index -= 1;\n");
+ fprintf(out_f, " token = &tokens->at(token_index);\n");
fprintf(out_f, " ast_error(token, \"%s\");\n", buf_ptr(code->error.msg));
break;
case CodeGenTypeSave:
@@ -843,7 +973,12 @@ int main(int argc, char **argv) {
fprintf(out_f, "%s\n", buf_ptr(code_text));
fprintf(out_f, " return root;\n");
} else {
- zig_panic("TODO capture non-root");
+ fprintf(out_f, " ParserGenNode *parent_node = stack.at(stack.length - 2);\n");
+ Buf *dest = buf_sprintf("parent_node->data[parent_node->next_index++].%s",
+ buf_ptr(code->capture.union_field_name));
+ Buf *code_text = fill_template(code->capture.body, buf_ptr(dest),
+ code->capture.field_names);
+ fprintf(out_f, "%s\n", buf_ptr(code_text));
}
break;
case CodeGenTypePopNode:
src/tokenizer.hpp
@@ -36,15 +36,15 @@ enum TokenId {
// TODO: debug delete this
enum TokenId {
- TokenIdStar = 0,
- TokenIdLParen = 1,
+ TokenIdLParen = 0,
+ TokenIdRParen = 1,
TokenIdEof = 2,
+ TokenIdStar = 3,
TokenIdSymbol,
TokenIdKeywordFn,
TokenIdKeywordReturn,
TokenIdKeywordMut,
TokenIdKeywordConst,
- TokenIdRParen,
TokenIdComma,
TokenIdLBrace,
TokenIdRBrace,