/*
 * Tokenization routines for the bootstrap compiler.
 *
 * Copyright © 2025 Samuel Lidén Borell <samuel@kodafritt.se>
 *
 * SPDX-License-Identifier: EUPL-1.2+ OR LGPL-2.1-or-later
 */
#include <string.h>
#include "compiler.h"
#include "token.h"

/* Helpers for tokenize(): store the token kind in its local variable `t` and
   jump to the label that records a lexeme length of 1 or 2 characters. */
#define DONE_1CH(token) do { t = (token); goto done_1ch; } while (0)
#define DONE_2CH(token) do { t = (token); goto done_2ch; } while (0)

/* Input file. Set by tokenizer_init() and read by tokenizer_next_line(). */
static FILE *f;
/* Buffer that holds the source line currently being tokenized. */
static char line[SOURCELINE_MAX];
/* Current read position. NULL after tokenizer_init(); afterwards it points
   into `line`, or at an empty string once tokenize() has returned T_EOL. */
static const char *s;
/* Position where the most recent tokenize() call found the start of its
   token (after skipping blanks). Restored by unread_token(). */
static const char *last_token_start;

/* Sub-tokenizers that tokenize() dispatches to, based on the first character
   of the token. */
static enum Token tok_alphanum(struct LexemeInfo *li_out);
static enum Token tok_number(struct LexemeInfo *li_out);
static enum Token tok_string(struct LexemeInfo *li_out);

/*
 * Prepares the tokenizer for reading from `file`.
 *
 * The read position is reset to NULL, so tokenizer_next_line() has to be
 * called before the first call to tokenize().
 */
void tokenizer_init(FILE *file)
{
    /* No line has been loaded yet */
    s = NULL;
    f = file;
}

/*
 * Loads the next source line that is not inside an `ignore` block.
 *
 * A line consisting of exactly "ignore" opens an ignore block (blocks nest),
 * and a line consisting of exactly "end" closes the innermost one. Lines in
 * such blocks, including the "ignore"/"end" lines themselves, are skipped.
 *
 * It is an error if the previous line still has unconsumed characters.
 *
 * Returns true if a line was loaded, or false if read_source_line() could
 * not supply another line. A warning is emitted in the latter case if an
 * ignore block is still open.
 */
bool tokenizer_next_line(void)
{
    int depth = 0; /* nesting level of `ignore` blocks */

    if (s != NULL && *s != '\0') {
        error("Expected end of line");
    }
    s = line;

    /* Comments are kept here, because '#' might appear inside strings,
       which read_source_line() doesn't know about. */
    while (read_source_line(f, line, NULL, KEEP_COMMENTS)) {
        if (strcmp(line, "ignore") == 0) {
            if (depth++ > 1000) {
                error("Suspiciously deep ignore block. Aborting");
            }
            continue;
        }
        if (depth == 0) {
            /* First line outside of any ignore block */
            return true;
        }
        if (strcmp(line, "end") == 0) {
            depth--;
        }
    }

    if (depth != 0) {
        warning("`ignore` block not closed with `end`");
    }
    return false;
}

/*
 * Tells whether the current line starts with a space or a tab character.
 */
bool tokenizer_line_is_indented(void)
{
    switch (line[0]) {
    case ' ':
    case '\t':
        return true;
    default:
        return false;
    }
}


/*
 * Reads the next token from the current line.
 *
 * Blanks (spaces and tabs) in front of the token are skipped. The start of
 * the token is remembered so that unread_token() can rewind to it.
 *
 * li_out is first filled with the byte value 99 and then receives the
 * details of the token:
 *  - T_EOL (end of line, or start of a `#` comment): len = 0
 *  - symbol tokens: len = 1 or 2
 *  - identifiers/keywords, numbers and strings: see tok_alphanum(),
 *    tok_number() and tok_string()
 *
 * Control characters, non-ASCII characters and unknown symbols are reported
 * with error().
 */
enum Token tokenize(struct LexemeInfo *li_out)
{
    enum Token t;
    size_t len;
    char c;

    memset(li_out, 99, sizeof(*li_out));

    /* Skip blanks in front of the token */
    for (; *s == ' ' || *s == '\t'; s++) { }

    last_token_start = s;
    c = *s;

    if (c == '\0' || c == '#') {
        /* read_source_line() strips \n and \r, so the line ends at the NUL.
           Pointing s at an empty string makes the rest of the line (i.e.
           the comment, if any) count as consumed. */
        s = "";
        len = 0;
        t = T_EOL;
        goto done;
    }

    if (c <= 31 || c >= 127) {
        error("Non-ASCII (or control) character outside string/comment");
    } else if (c == '"') {
        return tok_string(li_out);
    } else if (c >= '0' && c <= '9') {
        return tok_number(li_out);
    } else if (c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
        return tok_alphanum(li_out);
    } else {
        /* Symbol token. After the increment, *s is the lookahead character
           that is used for detecting two-character symbols. */
        s++;
        switch (c) {
        case '=':
            if (*s != '=') DONE_1CH(T_SYM_SingleEqual);
            DONE_2CH(T_SYM_DoubleEqual);
        case '<':
            if (*s == '=') DONE_2CH(T_SYM_LessEqual);
            if (*s == '>') DONE_2CH(T_SYM_NotEqual);
            DONE_1CH(T_SYM_Less);
        case '>':
            if (*s != '=') DONE_1CH(T_SYM_Greater);
            DONE_2CH(T_SYM_GreaterEqual);
        case '(': DONE_1CH(T_SYM_LParen);
        case ')': DONE_1CH(T_SYM_RParen);
        case '[': DONE_1CH(T_SYM_LBracket);
        case ']': DONE_1CH(T_SYM_RBracket);
        case '+': DONE_1CH(T_SYM_Plus);
        case '-': DONE_1CH(T_SYM_Minus);
        case '*': DONE_1CH(T_SYM_Asterisk);
        case '/': DONE_1CH(T_SYM_Slash);
        case '.': DONE_1CH(T_SYM_Dot);
        case '!': DONE_1CH(T_SYM_ExclMark);
        default:
            /* Step back to the offending character before reporting it */
            s--;
            error("Special character outside string/comment");
        }
    }
    unreachable();

  done_2ch:
    /* Consume the second character of the symbol */
    s++;
    len = 2;
    goto done;
  done_1ch:
    len = 1;
  done:
    li_out->len = len;
    return t;
}


/* Compares the word at `start` with a keyword of the given length and, on a
   match, returns the corresponding T_KW_* token from the enclosing function.
   The keyword is only used with # and ##, so it is never macro-expanded. */
#define CMP_KW(length, keyword)                     \
    if (memcmp(start, #keyword, (length)) == 0) {   \
        return T_KW_##keyword;                      \
    }

/*
 * Tokenizes a keyword or an identifier that starts at `s`.
 *
 * The first character is accepted unconditionally (tokenize() only calls
 * this function for a letter or an underscore). The word then extends over
 * all following letters, digits and underscores.
 *
 * For keywords, the matching T_KW_* token is returned and li_out is left
 * untouched. For identifiers, li_out->string and li_out->len describe the
 * word (which is not NUL-terminated), and the result is T_UpperIdent if the
 * first character is an uppercase letter and T_LowerIdent otherwise.
 */
static enum Token tok_alphanum(struct LexemeInfo *li_out)
{
    const char *const start = s;
    const char *p = start + 1;
    size_t len;

    while ((*p >= 'a' && *p <= 'z') ||
           (*p >= 'A' && *p <= 'Z') ||
           (*p >= '0' && *p <= '9') || *p == '_') {
        p++;
    }
    s = p;
    len = (size_t)(p - start);

    /* Keywords are grouped by length, so that only candidates of the
       right length have to be compared. */
    switch (len) {
    case 2:
        CMP_KW(2, if)
        CMP_KW(2, in)
        CMP_KW(2, io)
        CMP_KW(2, of)
        CMP_KW(2, or)
        CMP_KW(2, to)
        break;
    case 3:
        /* XXX should there be `and`/`or`, or `all_of`/`any_of`/`none_of`?
               maybe there should be a T... safe-vararg, like in Java. */
        CMP_KW(3, and)
        CMP_KW(3, end)
        CMP_KW(3, for)
        CMP_KW(3, int)
        CMP_KW(3, mod)
        CMP_KW(3, not)
        CMP_KW(3, var)
        break;
    case 4:
        CMP_KW(4, bool)
        CMP_KW(4, byte)
        CMP_KW(4, case)
        CMP_KW(4, code)
        CMP_KW(4, elif)
        CMP_KW(4, else)
        CMP_KW(4, enum)
        CMP_KW(4, from)
        CMP_KW(4, func)
        CMP_KW(4, long)
        CMP_KW(4, none)
        CMP_KW(4, sets)
        CMP_KW(4, true)
        break;
    case 5:
        CMP_KW(5, break)
        CMP_KW(5, class)
        CMP_KW(5, false)
        CMP_KW(5, local)
        CMP_KW(5, reads)
        CMP_KW(5, trait)
        CMP_KW(5, while)
        break;
    case 6:
        CMP_KW(6, assert)
        CMP_KW(6, export)
        CMP_KW(6, ignore)
        CMP_KW(6, record)
        CMP_KW(6, return)
        CMP_KW(6, signed)
        CMP_KW(6, switch)
        break;
    case 7:
        CMP_KW(7, aliased)
        CMP_KW(7, default)
        CMP_KW(7, loopend)
        CMP_KW(7, returns)
        CMP_KW(7, section)
        break;
    case 8:
        CMP_KW(8, continue)
        CMP_KW(8, modifies)
        CMP_KW(8, unsigned)
        CMP_KW(8, volatile)
        CMP_KW(8, wrapping)
        break;
    case 9:
        CMP_KW(9, loopempty)
        CMP_KW(9, templates)
        break;
    case 10:
        CMP_KW(10, calledfrom)
        break;
    default:
        break;
    }

    /* Not a keyword, so it is an identifier. Everything up to and including
       'Z' counts as uppercase here; lowercase letters and '_' sort after
       'Z' in ASCII. */
    li_out->string = start;
    li_out->len = len;
    return *start <= 'Z' ? T_UpperIdent : T_LowerIdent;
}

static enum Token tok_number(struct LexemeInfo *li_out)
{
    SlulInt num = 0, limit;
    unsigned base;
    const char *num_start;

    base = 10;
    limit = 0x19999999; /* 2^32 / 10 */
    if (*s == '0') {
        switch (s[1]) {
        case 'b':
            base = 2;
            limit = 0x80000000; /* 2^32 / 2 */
            goto skip_base;
        case 'x':
            base = 16;
            limit = 0x10000000; /* 2^32 / 16 */
          skip_base:
            s += 2;
            if (*s == '_') {
                error("Leading underscore in number");
            }
            break;
        case 'B':
        case 'X':
            error("Base (`x` or `b`) must be lowercase");
        }
    }

    num_start = s;
    for (;; s++) {
        char c = *s;
        unsigned digit;
        if (c >= '0' && c <= '9') {
            digit = (unsigned)c - '0';
          have_digit:
            if (digit >= base) {
                if (base == 10 && digit == 14 /* "e" */) {
                    goto float_num;
                }
                error("Invalid digit in number");
            }
            if (num > limit) {
                error("Number too large (too many digits)");
            }
            num *= base;
            if (num > 0xFFFFFFFF - digit) {
                error("Number too large");
            }
            num += digit;
        } else if (c >= 'A' && c <= 'Z') {
            digit = 10 + (unsigned)(c - 'A');
            goto have_digit;
        } else if (c >= 'a' && c <= 'z') {
            digit = 10 + (unsigned)(c - 'a');
            goto have_digit;
        } else if (c == '.') {
          float_num:
            error("Floating point is not supported in the bootstrap compiler");
        } else if (c == '_') {
            int next = s[1];
            if (next == '_') {
                error("Repeated underscores in number");
            } else if ((next < '0' || next > '9') &&
                       (next < 'a' || next > 'z') &&
                       (next < 'A' || next > 'Z')) {
                error("Trailing `_` in number");
            }
        } else {
            break;
        }
    }
    if (s == num_start) {
        error("No digits after base prefix");
    }
    li_out->num = num;
    return T_Integer;
}

/*
 * Tokenizes a string literal. `s` must point at the opening `"`.
 *
 * On return, `s` points just past the closing `"`. li_out->string points at
 * the first character after the opening quote and li_out->len is the number
 * of characters up to (but not including) the closing quote. The contents
 * are not copied or NUL-terminated, and escape sequences are left as-is.
 *
 * A missing closing quote is reported with error().
 */
static enum Token tok_string(struct LexemeInfo *li_out)
{
    const char *const start = s + 1;
    char c;

    s = start;
    while ((c = *s++) != '"') {
        if (c == '\0') {
            error("Missing terminating \" of string");
        }
        /* Escape sequences are handled in the parser. The escaped character
           just has to be skipped, so that \" does not end the string. */
        if (c == '\\' && *s++ == '\0') {
            error("Unexpected end of string in escape sequence");
        }
    }

    /* s is now one past the closing quote */
    li_out->len = (size_t)(s - 1 - start);
    li_out->string = start;
    return T_String;
}

/*
 * Rewinds the read position to where the most recent tokenize() call found
 * the start of its token, so that the next tokenize() call scans from
 * there again.
 *
 * Only a single position is remembered, so calling this several times in a
 * row rewinds to the same place.
 */
void unread_token(void)
{
    s = last_token_start;
}

/*
 * Reads the next token and requires it to be of the kind `expected`.
 *
 * li_out receives the token details from tokenize(). If a different kind of
 * token is found, `errmsg` is passed to error().
 */
void expect(struct LexemeInfo *li_out, enum Token expected,
            const char *errmsg)
{
    if (tokenize(li_out) != expected) {
        error(errmsg);
    }
}

void expect_next_line(void)
{
    struct LexemeInfo li;
    expect(&li, T_EOL, "Expected line break");
    if (!tokenizer_next_line()) {
        error("Unexpected end of file");
    }
}