/* * Tokenization routines for the bootstrap compiler. * * Copyright © 2025 Samuel Lidén Borell * * SPDX-License-Identifier: EUPL-1.2+ */ #include #include "compiler.h" #include "token.h" #define DONE_1CH(token) do { t = (token); goto done_1ch; } while (0) #define DONE_2CH(token) do { t = (token); goto done_2ch; } while (0) static FILE *f; static char line[SOURCELINE_MAX]; static const char *s; static const char *last_token_start; static enum Token tok_alphanum(struct LexemeInfo *li_out); static enum Token tok_number(struct LexemeInfo *li_out); static enum Token tok_string(struct LexemeInfo *li_out); void tokenizer_init(FILE *file) { f = file; s = NULL; } bool tokenizer_next_line(void) { if (s && *s) { error("Expected end of line"); } s = line; /* Don't strip comments here, because '#' might appear inside strings, which read_source_line() doesn't know about. */ return read_source_line(f, line, NULL, KEEP_COMMENTS); } bool tokenizer_line_is_indented(void) { return line[0] == ' ' || line[0] == '\t'; } enum Token tokenize(struct LexemeInfo *li_out) { enum Token t; size_t len; char c; memset(li_out, 99, sizeof(*li_out)); while (*s == ' ' || *s == '\t') s++; last_token_start = s; c = *s; if (c == '\0' || c == '#') { /* read_source_line() strips \n and \r */ t = T_EOL; s = ""; len = 0; goto done; } else if (c <= 31 || c >= 127) { error("Non-ASCII (or control) character outside string/comment"); } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_') { return tok_alphanum(li_out); } else if (c >= '0' && c <= '9') { return tok_number(li_out); } else if (c == '"') { return tok_string(li_out); } else { s++; switch (c) { case '=': if (*s == '=') DONE_2CH(T_SYM_DoubleEqual); DONE_1CH(T_SYM_SingleEqual); case '<': if (*s == '-') DONE_2CH(T_SYM_LArrow); if (*s == '>') DONE_2CH(T_SYM_NotEqual); if (*s == '=') DONE_2CH(T_SYM_LessEqual); DONE_1CH(T_SYM_Less); case '>': if (*s == '=') DONE_2CH(T_SYM_GreaterEqual); DONE_1CH(T_SYM_Greater); case '.': /* TODO "..." for imcomplete stuff? */ DONE_1CH(T_SYM_Dot); case '+': DONE_1CH(T_SYM_Plus); case '-': if (*s == '>') DONE_2CH(T_SYM_RArrow); DONE_1CH(T_SYM_Minus); case '*': DONE_1CH(T_SYM_Asterisk); case '/': DONE_1CH(T_SYM_Slash); case '(': DONE_1CH(T_SYM_LParen); case ')': DONE_1CH(T_SYM_RParen); case '[': DONE_1CH(T_SYM_LBracket); case ']': DONE_1CH(T_SYM_RBracket); default: s--; error("Special character outside string/comment"); } } unreachable(); done_1ch: len = 1; goto done; done_2ch: len = 2; s++; goto done; done: li_out->len = len; return t; } #define CMP_KW(length, keyword) \ if (memcmp(start, #keyword, (length)) == 0) { \ t = T_KW_##keyword; \ goto is_keyword; \ } static enum Token tok_alphanum(struct LexemeInfo *li_out) { enum Token t; size_t len = 0; const char *start = s; char c; do { c = *(++s); len++; } while ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_'); switch (len) { case 2: CMP_KW(2, if) CMP_KW(2, in) CMP_KW(2, io) CMP_KW(2, of) CMP_KW(2, or) CMP_KW(2, to) break; case 3: CMP_KW(3, and) CMP_KW(3, end) CMP_KW(3, for) CMP_KW(3, int) CMP_KW(3, mod) CMP_KW(3, not) /* XXX should there be `and`/`or`, or `all_of`/`any_of`/`none_of`? maybe there should be a T... safe-vararg, like in Java. */ CMP_KW(3, var) break; case 4: CMP_KW(4, bool) CMP_KW(4, byte) CMP_KW(4, case) CMP_KW(4, code) CMP_KW(4, elif) CMP_KW(4, else) CMP_KW(4, enum) CMP_KW(4, from) CMP_KW(4, func) CMP_KW(4, long) CMP_KW(4, none) CMP_KW(4, sets) CMP_KW(4, true) break; case 5: CMP_KW(5, break) CMP_KW(5, class) CMP_KW(5, false) CMP_KW(5, local) CMP_KW(5, reads) CMP_KW(5, trait) CMP_KW(5, while) break; case 6: CMP_KW(6, assert) CMP_KW(6, export) CMP_KW(6, record) CMP_KW(6, return) CMP_KW(6, signed) CMP_KW(6, switch) break; case 7: CMP_KW(7, aliased) CMP_KW(7, default) CMP_KW(7, loopend) CMP_KW(7, returns) CMP_KW(7, section) break; case 8: CMP_KW(8, continue) CMP_KW(8, modifies) CMP_KW(8, unsigned) CMP_KW(8, volatile) CMP_KW(8, wrapping) break; case 9: CMP_KW(9, loopempty) CMP_KW(9, templates) break; case 10: CMP_KW(10, calledfrom) break; } /* No keyword matches. It's an identifier */ li_out->len = len; li_out->string = start; return *start > 'Z' ? T_LowerIdent : T_UpperIdent; is_keyword: return t; } static enum Token tok_number(struct LexemeInfo *li_out) { uint64_t num = 0, limit; unsigned base; const char *num_start; base = 10; limit = 0x1999999999999999; /* 2^64 / 10 */ if (*s == '0') { switch (s[1]) { case 'b': base = 2; limit = 0x8000000000000000; /* 2^64 / 2 */ goto skip_base; case 'x': base = 16; limit = 0x1000000000000000; /* 2^64 / 16 */ skip_base: s += 2; if (*s == '_') { error("Leading underscore in number"); } break; case 'B': case 'X': error("Base (`x` or `b`) must be lowercase"); } } num_start = s; for (;; s++) { char c = *s; unsigned digit; if (c >= '0' && c <= '9') { digit = (unsigned)c - '0'; have_digit: if (digit >= base) { if (base == 10 && digit == 14 /* "e" */) { goto float_num; } error("Invalid digit in number"); } if (num > limit) { error("Number too large (too many digits)"); } num *= base; if (num > 0xFFFFFFFFFFFFFFFF - digit) { error("Number too large"); } num += digit; } else if (c >= 'A' && c <= 'Z') { digit = 10 + (unsigned)(c - 'A'); goto have_digit; } else if (c >= 'a' && c <= 'z') { digit = 10 + (unsigned)(c - 'a'); goto have_digit; } else if (c == '.') { float_num: error("Sorry, floating point is not implemented in the bootstrap compiler"); } else if (c == '_') { int next = s[1]; if (next == '_') { error("Repeated underscores in number"); } else if ((next < '0' || next > '9') && (next < 'a' || next > 'z') && (next < 'A' || next > 'Z')) { error("Trailing `_` in number"); } } else { break; } } if (s == num_start) { error("No digits after base prefix"); } li_out->num = num; return T_Integer; } static enum Token tok_string(struct LexemeInfo *li_out) { const char *start = ++s; for (;;) { char c = *(s++); if (c == '\0') { error("Missing terminating \" of string"); } else if (c == '"') { /* End of string */ break; } else if (c == '\\') { /* Escape sequence - Handled in parser */ if (*(s++) == '\0') { error("Unexpected end of string in escape sequence"); } } else { /* Regular character */ } } li_out->string = start; li_out->len = (size_t)(s - start - 1); return T_String; } void unread_token(void) { s = last_token_start; } void expect(struct LexemeInfo *li_out, enum Token expected, const char *errmsg) { enum Token t = tokenize(li_out); if (t != expected) { error(errmsg); } } void expect_next_line(void) { struct LexemeInfo li; expect(&li, T_EOL, "Expected line break"); if (!tokenizer_next_line()) { error("Unexpected end of file"); } }