/* * Expression parsing routines for the bootstrap compiler. * * Copyright © 2025 Samuel Lidén Borell * * SPDX-License-Identifier: EUPL-1.2+ OR LGPL-2.1-or-later */ #include #include #include #include "compiler.h" #include "token.h" static bool is_operand(enum Token t); static void link_call_arg(struct Expr *callexpr, struct Expr *subexpr) { struct CallArg *arg = malloc(sizeof(struct CallArg)); NO_NULL(arg); arg->next = NULL; arg->expr = subexpr; *callexpr->u.call->nextptr = arg; callexpr->u.call->nextptr = &arg->next; } #define OPSTACK_TO_OUT do { \ struct Expr *tmp = opstack; \ opstack = tmp->rpnnext; \ tmp->rpnnext = NULL; \ out_top = tmp; \ *out_nextptr = tmp; \ out_nextptr = &tmp->rpnnext; \ } while (0) struct OpInfo { unsigned precedence : 6; unsigned right_assoc : 1; unsigned no_mixing : 1; unsigned is_addsub : 1; }; #define CALL_PRECEDENCE 6 #define COMPARISON_PRECEDENCE 3 /* Precedence levels: 0: Not an operator 1: Assignment (TODO += -= *= /=) 2: Boolean 3: Comparison 4: Call, access of field 5: + - * / mod (mixing not allowed, except for + and -) */ #define NOMIX 1 #define ADDSUB 1 static const struct OpInfo opinfo[NUM_EXPRKINDS] = { /* E_GROUP_TEMP */ { 0, 0, 0, 0 }, /* E_SEQPOINT */ { 0, 0, 0, 0 }, /* E_NONE */ { 0, 0, 0, 0 }, /* E_FALSE */ { 0, 0, 0, 0 }, /* E_TRUE */ { 0, 0, 0, 0 }, /* E_THIS */ { 0, 0, 0, 0 }, /* E_INTEGER */ { 0, 0, 0, 0 }, /* E_STRING */ { 0, 0, 0, 0 }, /* E_LOCALVAR */ { 0, 0, 0, 0 }, /* E_INSTVAR */ { 0, 0, 0, 0 }, /* E_ARRAY */ { 0, 0, 0, 0 }, /* E_CALL */ { 4, 0, NOMIX, 0 }, /* Unary operators */ /* E_METHODCALL */ { 4, 0, NOMIX, 0 }, /* E_NEGATE is not supported in the bootstrap compiler. Otherwise, it would have been at precedence level 5. */ /* E_BOOL_NOT */{ 2, 0, NOMIX, 0 }, /* Binary operators */ /* E_ADD */ { 5, 0, NOMIX, ADDSUB }, /* E_SUB */ { 5, 0, NOMIX, ADDSUB }, /* E_MUL */ { 5, 0, NOMIX, 0 }, /* E_DIV */ { 5, 0, NOMIX, 0 }, /* E_MOD */ { 5, 0, NOMIX, 0 }, /* E_EQUAL */ { 3, 0, NOMIX, 0 }, /* E_NOT_EQUAL */{ 3, 0, NOMIX, 0 }, /* E_LESS */ { 3, 0, NOMIX, 0 }, /* E_GREATER */ { 3, 0, NOMIX, 0 }, /* E_LESS_EQUAL */ { 3, 0, NOMIX, 0 }, /* E_GREATER_EQUAL */ { 3, 0, NOMIX, 0 }, /* E_BOOL_AND */{ 2, 0, NOMIX, 0 }, /* E_BOOL_OR */ { 2, 0, NOMIX, 0 } }; #undef NOMIX #undef ADDSUB #define IS_CALL(e) ((e)->kind == E_CALL || (e)->kind == E_METHODCALL) /** * Parses an expression into RPN (Reverse Polish Notation). * * The algorithm is based on the "Shunting-yard" algorithm by Edsger Dijkstra. * https://en.wikipedia.org/wiki/Shunting-yard_algorithm */ struct Expr *parse_expr(void) { /** Operator stack */ struct Expr *opstack; /** RPN output */ struct Expr *out, *out_top; struct Expr **out_nextptr; bool operator_expected = false; bool start_of_subexpr = true; int expr_id = 0; /* TODO multi-expression sequences, e.g. [ 1 2 ]: - use return [a b] for multi-return syntax - could extend on the function arg handling to also handle array literals. */ opstack = NULL; out = out_top = NULL; out_nextptr = &out; for (;;) { struct LexemeInfo li; enum Token tok = tokenize(&li); struct Expr *e; switch ((int)tok) { case T_EOL: if (opstack && IS_CALL(opstack)) { /* End of function call at EOL */ operator_expected = true; } else if (!operator_expected) { error(start_of_subexpr ? "Expected expression before end of line" : "Expected operand before end of line"); } goto finished; } e = malloc(sizeof(struct Expr)); NO_NULL(e); assert(expr_id < INT_MAX); e->id = expr_id++; e->rpnnext = NULL; e->typeref = NULL; /* TODO set/link e->tr? */ if (!operator_expected) { switch ((int)tok) { /* Terminal expressions */ case T_KW_false: e->kind = E_FALSE; break; case T_KW_true: e->kind = E_TRUE; break; case T_KW_none: e->kind = E_NONE; break; case T_KW_this: if (!current_type) { error("Can't use `this` keyword outside of a class"); } e->kind = E_THIS; break; case T_Integer: e->kind = E_INTEGER; e->u.intval.num = li.num; break; case T_String: e->kind = E_STRING; /* TODO long strings: maybe only allow long strings at the end of an expr then it could be fairly simple: string s = """ """ but how about indentation? perhaps the least indented line could decide? perhaps only allow indents in steps of 4? */ e->u.strval = new_string_literal(li.string, li.len); break; case T_LowerIdent: { struct ExprCall *call; if (current_func) { struct Var *var = lookup_local_var(li.string, li.len); if (var) { e->kind = E_LOCALVAR; e->u.ident.namelen = 0; e->u.ident.u.var = var; break; } if (current_type) { var = lookup_instance_var(li.string, li.len); if (var) { e->kind = E_INSTVAR; e->u.ident.namelen = 0; e->u.ident.u.var = var; break; } } } /* Assume it's a function call */ call = malloc(sizeof(struct ExprCall)); NO_NULL(call); call->ident.namelen = li.len; call->ident.u.name = dupmemz(li.string, li.len); call->args = NULL; call->nextptr = &call->args; e->kind = E_CALL; e->u.call = call; e->rpnnext = opstack; opstack = e; goto token_processed; } #if 0 /* TODO Typeidents. Note that there would be an ambiguity if dot was used: a .x y a.x y but colon (or something else) should work: a :x y a.x y maybe all missing identifiers could be looked up as typeidents? that would remove the need for a symbol at all (but it might be harder to understand the code): a x y a.x y In the case above, x can ONLY be either a local variable or a typeident, but consider the following case: r = x In that case, x could be a local variable, a method or a typeident (enum value or constructor). */ case T_SYM_Colon: e->kind = E_TYPEIDENT; expect(&li, T_LowerIdent, "Expected identifier after `:`"); /* FIXME actually, (at least in theory) it can be followed by either an operator (for e.g. .enum_value or .const) or an operand (for e.g. .constructor 1 2 3). BUT operators aren't really applicable to enum values (except maybe for members, BUT then there's no typescope anyway) */ ... break; #endif /* Unary prefix operators */ case T_SYM_Minus: if (opstack && IS_CALL(opstack)) { error("Negation in parameter must be written as (-x)"); } error("Signed numbers are not supported in the " "bootstrap compiler"); break; case T_KW_not: e->kind = E_BOOL_NOT; e->rpnnext = opstack; opstack = e; goto token_processed; /* Grouping symbols */ case T_SYM_LBracket: if (operator_expected) { error("Unexpected `[`. Arrays indexing does not need []"); } e->u.grouptemp = GK_ARRAY_LITERAL; start_of_subexpr = false; goto grouping_start; case T_SYM_LParen: e->u.grouptemp = GK_GROUPING_PAREN; start_of_subexpr = true; grouping_start: /* Argument and element list (exprlist) parsing. exprlists are stored like this on the out stack: value1 value2 value3 ( TERM TERM TERM CALL */ e->kind = E_GROUP_TEMP; /* Move operators with higher precedence to output stack */ while (opstack) { struct OpInfo st = opinfo[opstack->kind]; if (st.precedence <= CALL_PRECEDENCE) break; OPSTACK_TO_OUT; } e->rpnnext = opstack; opstack = e; operator_expected = false; continue; /* don't clear start_of_subexpr */ case T_SYM_RBracket: case T_SYM_RParen: { bool popped_ops, found; struct Expr *popped; grouping_end: popped_ops = false; found = false; if (opstack && IS_CALL(opstack)) { /* For example, the end of `(f x y)` */ assert(!operator_expected); operator_expected = true; } /* Pop everything off the stack until a ( or [ is found */ while (opstack) { if (opstack->kind == E_GROUP_TEMP) { found = true; break; } OPSTACK_TO_OUT; popped_ops = true; } if (!found) { error(tok == T_SYM_RBracket ? "Too many `]`" : "Too many `)`"); } if (!operator_expected && popped_ops) { /* E.g. (1 + ) */ error(tok == T_SYM_RBracket ? "Missing operand before `]`" : "Missing operand before `)`"); } /* Pop the grouping ( or [ */ assert(opstack->kind == E_GROUP_TEMP); if (tok == T_SYM_RBracket && opstack->u.grouptemp == GK_GROUPING_PAREN) { error("Expected `)` not `]` here"); } else if (tok == T_SYM_RParen && opstack->u.grouptemp == GK_ARRAY_LITERAL) { error("Expected `]` not `)` after array literal"); } popped = opstack; opstack = opstack->rpnnext; free(popped); if (tok == T_SYM_RBracket) { /* Array literal */ e->kind = E_ARRAY; /* TODO link elements to array? */ } else { /* Grouping parenthesis. It's not added to the output */ assert(tok == T_SYM_RParen); free(e); goto dont_add_operand_to_output; } break; } default: error("Expected an operand"); } expr_to_outstack: assert(e->kind != E_GROUP_TEMP); out_top = e; *out_nextptr = e; out_nextptr = &e->rpnnext; dont_add_operand_to_output: if (opstack && IS_CALL(opstack)) { link_call_arg(opstack, out_top); operator_expected = false; } else { operator_expected = true; } } else { bool push_seqpoint = false; /* Operator_expected */ /* TODO operators should not be allowed in all contexts (but always inside a parenthesis) */ switch ((int)tok) { case T_SYM_SingleEqual: if (opstack) { error("Assignment inside expression"); } /* Assignment is a statement */ free(e); unread_token(); goto finished; case T_SYM_DoubleEqual: e->kind = E_EQUAL; break; case T_SYM_NotEqual: e->kind = E_NOT_EQUAL; break; case T_SYM_Less: e->kind = E_LESS; break; case T_SYM_Greater: e->kind = E_GREATER; break; case T_SYM_LessEqual: e->kind = E_LESS_EQUAL; break; case T_SYM_GreaterEqual: e->kind = E_GREATER_EQUAL; break; case T_SYM_Plus: e->kind = E_ADD; break; case T_SYM_Minus: /* Negation is handled above, with operator_expected==0 */ e->kind = E_SUB; break; case T_SYM_Asterisk: e->kind = E_MUL; break; case T_SYM_Slash: e->kind = E_DIV; break; case T_KW_and: e->kind = E_BOOL_AND; /* TODO varstates (e.g. `if o <> none and o.is_xyz` ) */ push_seqpoint = true; break; case T_KW_mod: e->kind = E_MOD; break; case T_KW_not: error("Binary operator expected here"); case T_KW_or: e->kind = E_BOOL_OR; /* TODO varstates (e.g. `if o == none or o.is_xyz` ) */ push_seqpoint = true; break; case T_SYM_LBracket: error("Unexpected `[`. Array indexing does not need []"); case T_SYM_LParen: error("Unexpected `(`. Function calls do not need ()"); case T_SYM_RBracket: case T_SYM_RParen: goto grouping_end; case T_SYM_Dot: { /* This can be either a field access or a method call. But field accesses are syntactically identical to zero-argument method calls. */ struct ExprMethodCall *m; e->kind = E_METHODCALL; expect(&li, T_LowerIdent, "Expected identifier after `.`"); m = malloc(sizeof(struct ExprMethodCall)); NO_NULL(m); m->call.ident.namelen = li.len; m->call.ident.u.name = dupmemz(li.string, li.len); m->call.args = NULL; m->call.nextptr = &m->call.args; e->u.mcall = m; /* XXX Check handling of ambiguities. It should only be triggered at the start of subexprs, to handle this ambiguitiy: do_stuff a.b c This should be parsed as (do_stuff (a.b) (c)) TODO Related: These should work (neither works at the moment) y == obj.f x obj.f x == y */ assert(out_top != NULL); e->u.mcall->object = out_top; /* last subexpression */ if (!opstack || opstack->kind == E_GROUP_TEMP) { if (is_operand(lookahead_token())) { /* Arguments follow */ e->rpnnext = opstack; opstack = e; operator_expected = false; goto token_processed; } } /* Field. Treat it as a terminal symbol */ goto expr_to_outstack; } /* TODO multi-expr lines, e.g. `1 2` - check mode - check top of operator stack */ default: error("Expected an operator"); } { struct OpInfo op = opinfo[e->kind]; /* TODO disallow mixing of operators precedences! (may also for * /, since order matters for integer rounding and for overflow)*/ while (opstack) { struct OpInfo st = opinfo[opstack->kind]; if (!st.precedence) break; if (op.precedence == st.precedence) { if (op.no_mixing && opstack->kind != e->kind && !(op.is_addsub && st.is_addsub) && op.precedence != COMPARISON_PRECEDENCE) { error("Ambiguous mix of operators"); } else if (st.right_assoc) { break; } } else if (op.precedence > st.precedence) { break; } OPSTACK_TO_OUT; } } assert(out_top != NULL); e->u.binary.left_id = out_top->id; e->rpnnext = opstack; opstack = e; if (push_seqpoint) { /* for boolean and/or */ struct Expr *seqp = malloc(sizeof(struct Expr)); NO_NULL(seqp); seqp->id = e->id; seqp->rpnnext = NULL; seqp->typeref = NULL; seqp->kind = E_SEQPOINT; seqp->u.seqpoint_end = e; out_top = seqp; *out_nextptr = seqp; out_nextptr = &seqp->rpnnext; } operator_expected = false; } token_processed: start_of_subexpr = false; } finished: /* Move remaining tokens on the operator stack to the output stack */ while (opstack) { if (opstack->kind == E_GROUP_TEMP) { error(opstack->u.grouptemp == GK_ARRAY_LITERAL ? "Missing `]`" : "Missing `)`"); } OPSTACK_TO_OUT; } if (!out || !operator_expected) { error("Incomplete expression"); } return out; } static bool is_operand(enum Token t) { switch ((int)t) { /* Terminals */ case T_KW_none: case T_KW_false: case T_KW_true: case T_KW_this: case T_Integer: case T_String: case T_LowerIdent: /* Grouping symbols */ case T_SYM_LBracket: case T_SYM_LParen: return true; default: ; } return false; }