/* * Expression parsing routines for the bootstrap compiler. * * Copyright © 2025 Samuel Lidén Borell * * SPDX-License-Identifier: EUPL-1.2+ */ #include #include #include "compiler.h" #include "token.h" struct ExprString *string_constants = NULL; struct ExprString *empty_string = NULL; unsigned next_string_const_id = 1; static bool is_value_start_token(enum Token tok) { switch ((int)tok) { TOKEN_CASES_VALUE_START return true; default: return false; } } static void link_call_arg(struct Expr *callexpr, struct Expr *subexpr) { struct CallArg *arg = malloc(sizeof(struct CallArg)); NO_NULL(arg); arg->next = NULL; arg->expr = subexpr; *callexpr->u.call->nextptr = arg; callexpr->u.call->nextptr = &arg->next; } #define OPSTACK_TO_OUT do { \ struct Expr *tmp = opstack; \ opstack = tmp->rpnnext; \ tmp->rpnnext = NULL; \ out_top = tmp; \ *out_nextptr = tmp; \ out_nextptr = &tmp->rpnnext; \ } while (0) struct OpInfo { unsigned precedence : 6; unsigned right_assoc : 1; unsigned no_mixing : 1; unsigned is_addsub : 1; }; #define CALL_PRECEDENCE 6 #define COMPARISON_PRECEDENCE 3 /* Precedence levels: 0: Not an operator 1: Assignment (TODO += -= *= /=) 2: Boolean 3: Comparison 4: + - * / mod (mixing not allowed, except for + and -) 5: Negation 6: Call, access of field, access of optional value */ #define RIGHT 1 #define NOMIX 1 #define ADDSUB 1 static const struct OpInfo opinfo[NUM_EXPRKINDS] = { /* E_GROUP_TEMP */ { 0, 0, 0, 0 }, /* E_SEQPOINT */ { 0, 0, 0, 0 }, /* E_NONE */ { 0, 0, 0, 0 }, /* E_FALSE */ { 0, 0, 0, 0 }, /* E_TRUE */ { 0, 0, 0, 0 }, /* E_INTEGER */ { 0, 0, 0, 0 }, /* E_STRING */ { 0, 0, 0, 0 }, /* E_IDENT */ { 0, 0, 0, 0 }, /* E_MEMBER */ { 0, 0, 0, 0 }, /* E_ARRAY */ { 0, 0, 0, 0 }, /* E_CALL */ { 0, 0, 0, 0 }, /* Unary operators */ /* E_NEGATE */ { 5, 0, NOMIX, 0 }, /* E_BOOL_NOT */{ 2, 0, NOMIX, 0 }, /* Binary operators */ /* E_ADD */ { 4, 0, NOMIX, ADDSUB }, /* E_SUB */ { 4, 0, NOMIX, ADDSUB }, /* E_MUL */ { 4, 0, NOMIX, 0 }, /* E_DIV */ { 4, 0, NOMIX, 0 }, /* E_MOD */ { 4, 0, NOMIX, 0 }, /* E_EQUAL */ { 3, 0, NOMIX, 0 }, /* E_NOT_EQUAL */{ 3, 0, NOMIX, 0 }, /* E_LESS */ { 3, 0, NOMIX, 0 }, /* E_GREATER */ { 3, 0, NOMIX, 0 }, /* E_LESS_EQUAL */ { 3, 0, NOMIX, 0 }, /* E_GREATER_EQUAL */ { 3, 0, NOMIX, 0 }, /* E_BOOL_AND */{ 2, 0, NOMIX, 0 }, /* E_BOOL_OR */ { 2, 0, NOMIX, 0 }, /* E_ASSIGN */ { 1, RIGHT, 0, 0 }, /* TODO l-value. will need special handling */ /* E_ASSIGN_FINAL */{ 1, RIGHT, 0, 0 } /* TODO l-value. will need special handling */ }; #undef RIGHT #undef NOMIX #undef ADDSUB /** * Parses an expression into RPN (Reverse Polish Notation). * * The algorithm is based on the "Shunting-yard" algorithm by Edsger Dijkstra. * https://en.wikipedia.org/wiki/Shunting-yard_algorithm */ struct Expr *parse_expr(void) { /** Operator stack */ struct Expr *opstack; /** RPN output */ struct Expr *out, *out_top; struct Expr **out_nextptr; bool operator_expected = false; bool start_of_subexpr = true; int expr_id = 0; /* TODO multi-expression sequences, e.g. [ 1 2 ]: - use return [a b] for multi-return syntax - could extend on the function arg handling to also handle array literals. XXX are 0-arg function calls needed? - perhaps "f obj a b c" syntax could be used for methods, in which case 0-arg calls would be "f obj" - 0-arg calls could also be e.g. "f []", "f void" or "f -" or "f :" - or have different naming conventions (or sigills) for: - functions/methods - fields / instance variables - local variables - (and already: Types) in a "multi-expr" context, operators are forbidden. (such exprs must go inside parentheses) in a non-"multi-expr" context, repeated non-operators are forbidden. */ opstack = NULL; out = out_top = NULL; out_nextptr = &out; for (;;) { struct LexemeInfo li; enum Token tok = tokenize(&li); struct Expr *e; switch ((int)tok) { case T_EOL: if (opstack && opstack->kind == E_CALL) { /* End of function call at EOL */ operator_expected = true; } else if (!operator_expected) { error(start_of_subexpr ? "Expected expression before end of line" : "Expected operand before end of line"); } goto finished; } e = malloc(sizeof(struct Expr)); NO_NULL(e); e->id = expr_id++; e->rpnnext = NULL; e->typeref = NULL; /* TODO set/link e->tr? */ if (!operator_expected) { switch ((int)tok) { /* Terminal expressions */ case T_KW_false: e->kind = E_FALSE; break; case T_KW_true: e->kind = E_TRUE; break; case T_KW_none: e->kind = E_NONE; break; case T_Integer: e->kind = E_INTEGER; e->u.intval.num = li.num; break; case T_String: { struct ExprString *str; e->kind = E_STRING; /* TODO long strings: maybe only allow long strings at the end of an expr then it could be fairly simple: string s = """ """ but how about indentation? perhaps the least indented line could decide? perhaps only allow indents in steps of 4? */ if (li.len == 0 && empty_string) { str = empty_string; } else { str = malloc(sizeof(struct ExprString)); NO_NULL(str); str->next = string_constants; str->len = li.len; /* TODO unescape string */ str->s = (li.len != 0 ? memzdup(li.string, li.len) : NULL); str->id = next_string_const_id++; string_constants = str; if (li.len == 0) { assert(empty_string == NULL); empty_string = str; } } e->u.strval = str; break; } case T_LowerIdent: { const char *name; size_t len; /* XXX should `f x` be allowed if `f` is a local variable? - that allows function refs - that might allow some kind of functional programming? But it also leads to an additional copy. Related: How should ident lookups work? - local idents are easy (could better be looked up here) - method idents are trickier: - just store string, and bind late in `out*.c`? - tree of idents, with trees of types inside? */ len = li.len; assert(len != 0); name = memzdup(li.string, len); /* At the start of subexpression, an identifier followed by a value (not an operator, end of expression, right paren, etc) means a function call. */ if (start_of_subexpr) { tok = tokenize(&li); unread_token(); if (is_value_start_token(tok)) { struct ExprCall *call; /* Function call */ call = malloc(sizeof(struct ExprCall)); NO_NULL(call); call->ident.namelen = len; call->ident.u.name = name; call->args = NULL; call->nextptr = &call->args; e->kind = E_CALL; e->u.call = call; e->rpnnext = opstack; opstack = e; operator_expected = false; goto token_processed; } } e->kind = E_IDENT; e->u.ident.namelen = len; e->u.ident.u.name = name; break; } /* Unary prefix operators */ case T_SYM_Minus: if (opstack && opstack->kind == E_CALL) { error("Negation in parameter must be written as (-x)"); } e->kind = E_NEGATE; e->rpnnext = opstack; opstack = e; operator_expected = false; goto token_processed; case T_KW_not: e->kind = E_BOOL_NOT; e->rpnnext = opstack; opstack = e; operator_expected = false; goto token_processed; /* Grouping symbols */ case T_SYM_LBracket: if (operator_expected) { error("Unexpected `[`. Arrays indexing does not need []"); } e->u.grouptemp = GK_ARRAY_LITERAL; start_of_subexpr = false; goto grouping_start; case T_SYM_LParen: e->u.grouptemp = GK_GROUPING_PAREN; start_of_subexpr = true; grouping_start: /* Argument and element list (exprlist) parsing. exprlists are stored like this on the out stack: value1 value2 value3 ( TERM TERM TERM CALL */ e->kind = E_GROUP_TEMP; /* Move operators with higher precedence to output stack */ while (opstack) { struct OpInfo st = opinfo[opstack->kind]; if (st.precedence <= CALL_PRECEDENCE) break; OPSTACK_TO_OUT; } e->rpnnext = opstack; opstack = e; operator_expected = false; continue; /* don't clear start_of_subexpr */ case T_SYM_RBracket: case T_SYM_RParen: { bool popped_ops, found; struct Expr *popped; grouping_end: popped_ops = false; found = false; if (opstack && opstack->kind == E_CALL) { /* For example, the end of `(f x y)` */ assert(!operator_expected); operator_expected = true; } /* Pop everything off the stack until a ( or [ is found */ while (opstack) { if (opstack->kind == E_GROUP_TEMP) { found = true; break; } OPSTACK_TO_OUT; popped_ops = true; } if (!found) { error(tok == T_SYM_RBracket ? "Too many `]`" : "Too many `)`"); } if (!operator_expected && popped_ops) { /* E.g. (1 + ) */ error(tok == T_SYM_RBracket ? "Missing operand before `]`" : "Missing operand before `)`"); } /* Pop the grouping ( or [ */ assert(opstack->kind == E_GROUP_TEMP); if (tok == T_SYM_RBracket && opstack->u.grouptemp == GK_GROUPING_PAREN) { error("Expected `)` not `]` here"); } else if (tok == T_SYM_RParen && opstack->u.grouptemp == GK_ARRAY_LITERAL) { error("Expected `]` not `)` after array literal"); } popped = opstack; opstack = opstack->rpnnext; free(popped); if (tok == T_SYM_RBracket) { /* Array literal */ e->kind = E_ARRAY; /* TODO link elements to array? */ } else { /* Grouping parenthesis. It's not added to the output */ assert(tok == T_SYM_RParen); free(e); goto dont_add_operand_to_output; } break; } default: error("Expected an operand"); } assert(e->kind != E_GROUP_TEMP); out_top = e; *out_nextptr = e; out_nextptr = &e->rpnnext; dont_add_operand_to_output: if (opstack && opstack->kind == E_CALL) { link_call_arg(opstack, out_top); operator_expected = false; } else { operator_expected = true; } } else { bool push_seqpoint = false; /* Operator_expected */ /* TODO operators should not be allowed in all contexts (but always inside a parenthesis) */ switch ((int)tok) { case T_SYM_RArrow: e->kind = E_ASSIGN; break; case T_SYM_SingleEqual: e->kind = E_ASSIGN_FINAL; break; case T_SYM_DoubleEqual: e->kind = E_EQUAL; break; case T_SYM_NotEqual: e->kind = E_NOT_EQUAL; break; case T_SYM_Less: e->kind = E_LESS; break; case T_SYM_Greater: e->kind = E_GREATER; break; case T_SYM_LessEqual: e->kind = E_LESS_EQUAL; break; case T_SYM_GreaterEqual: e->kind = E_GREATER_EQUAL; break; case T_SYM_Plus: e->kind = E_ADD; break; case T_SYM_Minus: /* Negation is handled above, with operator_expected==0 */ e->kind = E_SUB; break; case T_SYM_Asterisk: e->kind = E_MUL; break; case T_SYM_Slash: e->kind = E_DIV; break; case T_KW_and: e->kind = E_BOOL_AND; /* TODO varstates (e.g. if o <> none and o.is_xyz */ push_seqpoint = true; break; case T_KW_mod: e->kind = E_MOD; break; case T_KW_not: error("Binary operator expected here"); case T_KW_or: e->kind = E_BOOL_OR; /* TODO varstates (e.g. if o == none or o.is_xyz */ push_seqpoint = true; break; case T_SYM_LBracket: error("Unexpected `[`. Array indexing does not need []"); case T_SYM_LParen: error("Unexpected `(`. Function calls do not need ()"); case T_SYM_RBracket: case T_SYM_RParen: goto grouping_end; /* TODO */ /* TODO multi-expr lines, e.g. `1 2` - check mode - check top of operator stack */ default: error("Expected an operator"); } { struct OpInfo op = opinfo[e->kind]; /* TODO disallow mixing of operators precedences! (may also for * /, since order matters for integer rounding and for overflow)*/ while (opstack) { struct OpInfo st = opinfo[opstack->kind]; if (!st.precedence) break; if (op.precedence == st.precedence) { if (op.no_mixing && opstack->kind != e->kind && !(op.is_addsub && st.is_addsub) && op.precedence != COMPARISON_PRECEDENCE) { error("Ambiguous mix of operators"); } else if (st.right_assoc) { break; } } else if (op.precedence > st.precedence) { break; } OPSTACK_TO_OUT; } } assert(out_top != NULL); e->u.binary.left_id = out_top->id; e->rpnnext = opstack; opstack = e; if (push_seqpoint) { /* for boolean and/or */ struct Expr *seqp = malloc(sizeof(struct Expr)); NO_NULL(seqp); seqp->id = e->id; seqp->rpnnext = NULL; seqp->typeref = NULL; seqp->kind = E_SEQPOINT; seqp->u.seqpoint_end = e; out_top = seqp; *out_nextptr = seqp; out_nextptr = &seqp->rpnnext; } operator_expected = false; } token_processed: start_of_subexpr = false; } finished: /* Move remaining tokens on the operator stack to the output stack */ while (opstack) { if (opstack->kind == E_GROUP_TEMP) { error(opstack->u.grouptemp == GK_ARRAY_LITERAL ? "Missing `]`" : "Missing `)`"); } OPSTACK_TO_OUT; } if (!out || !operator_expected) { error("Incomplete expression"); } return out; }