/*
 * Declaration parsing routines for the bootstrap compiler.
 *
 * Copyright © 2025 Samuel Lidén Borell
 *
 * SPDX-License-Identifier: EUPL-1.2+
 */
#include <stdio.h>
#include <stdlib.h>
#include "compiler.h"
#include "token.h"

int num_sources = 0;
char *(sources[MAX_SOURCES]) = { 0 };
static bool implicit_class = false;

static void parse_func(void);
static enum Token parse_paramlist(struct Var **list_out, size_t *count_out);
static struct TypeRef *parse_type_usage(void);

void parse_source_index(FILE *f)
{
    char line[SOURCELINE_MAX];
    size_t len;
    while (read_source_line(f, line, &len, STRIP_COMMENTS)) {
        if (num_sources == MAX_SOURCES) {
            FAIL("Too many sources for bootstrap compiler.");
        }
        check_filename(line);
        sources[num_sources++] = memzdup(line, len);
    }
}

/** Creates an implicit class when the filename begins with an uppercase letter. */
static void create_implicit_class(const char *basename)
{
    if (*basename >= 'A' && *basename <= 'Z') {
        const char *fileext = strrchr(basename, '.');
        size_t len;
        NO_NULL(fileext);
        assert(fileext >= basename);
        assert(fileext-basename <= SOURCELINE_MAX);
        len = (size_t)(fileext-basename);
        type_start(basename, len);
        implicit_class = true;
    } else {
        implicit_class = false;
    }
}

/* TODO this should create a record/struct / class / interface
   it would be nice if this could be determined by the filename.

   How Java does it:
   - classes/records have any names
   - interfaces end with -able/-ble
   How C# does it:
   - classes/records have any names
   - interfaces begin with "I"
   How Pascal does it:
   - all types begin with T
   How C does it:
   - built-in types end with _t

   Possible options:
   * Use English-language suffixes like Java:
       -able/-ble = interface
       -er = class (e.g. FileReader)
       others = record/struct
   * Use Pascal/C#-like prefixes:
       I = interface
       C = class
       others = record/struct
     - problems: needs to handle words starting with C, e.g. CssRule
   * Use case of filename to distinguish:
       Uppercase.slul - class with name = filename
       lowercase.slul - mixed no-namespace contents
   * Use directories:
       interfaces/...
       classes/...
       records/...
   * Use file extensions:
       *.if.slul - interfaces
       *.cl.slul - classes
       *.rc.slul - records
       *.slul    - mixed no-namespace file

   Allow putting the full class hierarchy (i.e. incl. subclasses) in a
   single file?
   Use filenames for namespacing only and allow LRL-like "here" definitions?

   Keywords to use?
   - Want to have the same "kind" of word for all of struct/class/interface.
   - It's good if they are the same as in other languages.
       struct record data state
       class thing object
       interface abstract
   Or, use "class" for all, but have keywords for the subtype:
       unique (but that makes it sound like a singleton)
       identity
       interface
       abstract (maybe even it could work like implicit interfaces?)
   - can abstract classes of "identity" (like File) and "non-identity"
     (like Point) appear as base classes at the same time?
   - when is multiple-inheritance fine?
     - interfaces are fine, and interfaces have these properties:
       - multiple-inheritance allowed
       - cannot be constructed
       - do not have data
       - calling super.do_stuff() is not possible in implementing classes
     - abstract classes:
       - multiple-inheritance allowed
       - cannot be constructed
       - CAN have data, but perhaps require it to be private?
         (i.e. not accessible by implementors of the interface)
         - if private, then it should only be accessible from concrete
           methods... otherwise, it would create a "feature imparity"
           between default implementations and overridden implementations.
       - Disallow super.do_stuff().
       - Allow three types of methods/functions:
         - abstract / unimplemented
         - abstract / with default implementation
         - concrete and final
     - concrete classes:
       - perhaps disallow inheritance from concrete classes
         (i.e. all concrete classes are final)

   Simple solution:
   - have records
   - have classes (w/o inheritance)
   - have traits/interfaces

   More useful types:
   - unions/variants/choice
     - how to make it possible to access (and pass around) the enum value
       BUT ALSO not have to repeat oneself in the declaration?
       - perhaps the enum type could be created implicitly.
       - or it could be some kind of generic type perhaps? e.g. if the tail
         of classes could be allowed to be a generic parameter.
   - enums
     - these could have singleton values, like in Java.

   Alternative solution:
   - record, class and enum types
   - Macros for type generation (and usage) of more complex types:
     - unions (is it possible?)
     - interfaces (needs macro for access also)

   Solution with unified classes:
   - trait Abstract = interface/trait
   - trait Data = trait Equal,Copy,HashCode,StableCompare = record
     - perhaps record should provide default impls for equal/copy.
     - and "trait Data" could
   - Closed types?
   - Can runtime type information be avoided?
     - E.g. in a list of abstract-typed elements.
     - Want to avoid in-object RTTI
     - Want to avoid fat pointers (because they would make generic types
       fat as well)
       - but could be optimized with a per-generic-object size flag
         (this could also allow for embedded elements, which would avoid
         an extra indirection - good!)
         - would require one size field per generic param.
         - multiplication factor (to obtain offset) is no longer constant
           (= it's slower)
           - could also allow only {1,8,16,32,64,128} bits
         - could allow for String == List of Byte
           (or perhaps String.aliased_to_list() and String.aliased_from_list())
           - or perhaps skip the aliasing stuff, as long as the String and
             the List are immutable.
     - Want to avoid "concretization wrappers"
       (because of indirection AND because of lifetime / memory alloc)
   - Reference comparison ability:
     - same_as operator for all objects?
     - or trait SameAs? trait Identity?
   - trait SomeAbstractClass = implements that class
   - Uppercase filename = implicit class
   - An initial line with only enum/record = turns Uppercase filename class
     into an enum/record instead of the default of class

   Solution with "here" names:
   - `class file` at start for classes
   - `record file` at start for records
   - `trait file` at start for traits/interfaces
   - `enum file` at start for enums
   Maybe skip the `file` part?

   Other necessary attributes:
   - public/export

   Nested identifiers/namespaces (e.g. directories) and identifier lookup
   at the nested levels.
   - just skip namespacing? it's often more annoying than it helps
     (as an example, consider all the `import` statements in Java source
     code, which are often hidden by default in IDEs anyway; namespacing
     also has a tendency to result in libraries using generic names for
     things, which makes it impossible to know what a Date or ArrayList is
     without checking the imports or by mouse-over in the IDE)
*/
void parse_file(FILE *f, const char *basename)
{
    create_implicit_class(basename);
    /*reset_local_idents();*/
    /* TODO keywords for file-local/module-local/exported classes/functions?
       - local / (none) / export
       - (none) / visible / export
       - (none) / module / global
       - file identifiers only make sense as either module-local or exported.

       Or it could be module-export/file-export sections of each file?
       (similar to `interface` in Pascal)

           contents
           moduleexport
               SomeClass since 0.1
               OtherClass since 0.2
           fileexport
               ModuleInternal
           end

       Or, when prototyping:

           contents auto

           contents
           fileexport all
           end

       Or, shorter:

           module_export SomeClass end
           file_export AnotherClass end

       (although it's still not trivially greppable)

       Alternative keywords:

           exports SomeClass end
           provides AnotherClass end

       Or, with a keyword per line:

           provides SomeClass
           provides AnotherClass

       Or, use a per-project text file:

           file things.slul
               SomeClass
               AnotherClass

       But there needs to be a clear distinction between `moduleexport`
       and `fileexport`. Module exports shouldn't be done accidentally!
       BUT most things will be "provided"/"file_exported" in lowercase
       files. So it might make more sense to just have local for local
       things. At least for types. And a `calledfrom` for functions?

       Regarding module interfaces:
       - Should the module interface be generated with a command?
         (e.g. with `slul make-interface 1.2.0 1.1.34`)
       - Should the module interface be manually created but checked by
         the compiler?
    */
    /* XXX class-local calls vs typeident calls (or enum values):
       (this is only a problem in e.g. `y = b x`, where `b` could be either
       a class-local call (`this.b`) or it could be a constructor in the
       typescope of the type of `y`).
       solutions:
       - `calls` section in function decl, analogous to `modifies`.
         (slul2 would really become a "safetynaut" language :D )
         - and actually, `modifies` is of limited use if it doesn't track
           access from nested function calls.
           - on the other hand, adding a `modifies` line to some deeply
             nested "helper" function could cause a time-consuming "ripple"
             effect, where all (nested!) callers would have to be updated too
           - as a pragmatic solution, perhaps have `calls` and `modifies`
             apply only to the function at hand, and not nested calls.
       - have different naming conventions for typeidents
         (e.g. uppercase, prefix with `new`/`create`/`make`, ...)
         - but it would have to work for enum values, too.
       - related: maybe the `own` qualifier/keyword could be used for
         making ownership transfer clear.
         (ownership and generics is a bit tricky, though... converting
         between "List of own T" and "List of T" is not safe)
       - or just defer bindings until the whole class/file has been parsed.
         - could accumulate all unprefixed (no dot) identifiers in a tree.
         - at the end of the class/file, the tree could be traversed, and
           the identifiers could be searched in this order:
           1. the class
           2. the type (but this isn't known, so this step would need to
              be deferred)
    */
    tokenizer_init(f);
    while (tokenizer_next_line()) {
        enum Token t;
        struct LexemeInfo li;
        if (tokenizer_line_is_indented()) {
            error("Too many `end`s or unexpected indentation at top level");
        }
        t = tokenize(&li);
        switch ((int)t) {
        case T_EOL:
            /* E.g. line with only a comment */
            break;
        case T_KW_func:
            /* TODO only allow inside a type?
               (could still allow extending types, even in "utility"
               source files)
               perhaps allow static methods in types?
               or allow namespace-only "types"?
               or allow it, and have imports go into namespaces?
            */
            parse_func();
            break;
        /* TODO more top-level keywords */
        default:
            error("Unexpected token at top level");
        }
        /* printf("tok =");
        for (;;) {
            t = tokenize(&li);
            if (t == T_EOL) break;
            printf(" %d", t);
        }
        printf("\n");*/
        /* TODO check type of top-level type */
    }
    if (implicit_class) {
        type_end();
    }
}

static void parse_func(void)
{
    struct LexemeInfo li;
    enum SectionKind { PARAMS, RETURNS, CODE };
    enum SectionKind section;
    expect(&li, T_LowerIdent, "Expected function name (lowercase ident)");
    func_start(li.string, li.len);
    expect_next_line();
    section = PARAMS;
    for (;;) {
        enum Token t;
        enum SectionKind next_section;
        switch (section) {
        case PARAMS:
            t = parse_paramlist(&current_func->params,
                                &current_func->num_params);
            break;
        case RETURNS:
            t = parse_paramlist(&current_func->returns,
                                &current_func->num_returns);
            break;
        case CODE:
            parse_func_body();
            goto end;
        }
        switch ((int)t) {
        case T_KW_end:
            goto end;
        case T_KW_returns:
            next_section = RETURNS;
            break;
        case T_KW_code:
            next_section = CODE;
            break;
        default:
            error("Unexpected symbol in function definition");
        }
        expect_next_line();
        if (next_section == section) {
            error("Duplicate section in function declaration");
        } else if (next_section < section) {
            error("Wrong order of sections in function declaration");
        }
        section = next_section;
    }
end:
    func_end();
}

static enum Token parse_paramlist(struct Var **list_out, size_t *count_out)
{
    struct LexemeInfo li;
    size_t count = 0;
    struct Var **nextptr;
    *list_out = NULL;
    nextptr = list_out;
    for (;;) {
        enum Token t;
        struct Var *var;
        t = tokenize(&li);
        /* TODO var/out parameters
           (again, there's a difference between modifying an object and
           replacing it...) */
        switch ((int)t) {
        case T_UpperIdent:
        case T_KW_bool:
        case T_KW_byte:
        case T_KW_int:
        case T_KW_long:
        TOKEN_CASES_QUALIFIERS
            break;
        default:
            if (count_out) {
                *count_out = count;
            }
            return t;
        }
        unread_token();
        var = parse_var(&current_func->vars, VAR_DECL_ONLY, nextptr);
        nextptr = &var->next;
        var->is_funcparam = 1;
        expect_next_line();
        count++;
        if (count > FUNCPARAMS_MAX) {
            error("Too many parameters");
        }
    }
}

/* TODO parsing of type decls. this needs to handle:
   - both top-level and nested types
       struct Type **inspoint;
       inspoint = (current_type ? &current_type->inner_types : &module->types);
   - generic types
*/
struct QualifierInfo {
    unsigned qual;
    int position;
};
static struct TypeRef *parse_type_usage(void)
{
    enum Token tok;
    struct LexemeInfo li;
    struct TypeRef *tr = malloc(sizeof(struct TypeRef));
    int qualifier_position = 0;
    static const struct QualifierInfo qualinfos[NUM_QUALIFIERS] = {
        /* TODO "modifiable object" vs "replaceable object"
           could use <- vs = operators for "replaceable", but that doesn't
           work inside structs!
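
           (The second field of each entry below is the qualifier's position
           in the canonical ordering: var < aliased/volatile < wrapping <
           signed/unsigned. The TOKEN_CASES_QUALIFIERS handler further down
           rejects a repeat of an already-used position and qualifiers given
           out of order, e.g. `unsigned signed int` or `aliased var SomeClass`.)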
        */
        { Q_VAR, 1 },
        { Q_ALIASED, 2 },
        { Q_VOLATILE, 2 },
        { Q_WRAPPING, 3 },
        { Q_SIGNED, 4 },
        { Q_UNSIGNED, 4 }
    };
    unsigned quals = 0;
    NO_NULL(tr);
    /* TODO needs to handle:
       - arrays
       - optional types
       - generic types */
    for (;;) {
        tok = tokenize(&li);
        switch ((int)tok) {
        /* Types */
        case T_UpperIdent:
            tr->kind = TR_CLASS;
            tr->u.class_ = map_named_type(li.string, li.len);
            goto done;
        case T_LowerIdent:
            if ((quals & (Q_WRAPPING|Q_SIGNED|Q_UNSIGNED)) == 0) {
                error("Expected a type");
            }
            unread_token();
            goto implicit_int;
        case T_KW_bool:
            tr->kind = TR_BOOL;
            goto done;
        case T_KW_byte:
            if ((quals & Q_UNSIGNED) != 0) {
                error("The `byte` type always has the full unsigned range");
            }
            tr->kind = TR_INT; /* TODO set range */
            goto done;
        case T_KW_int:
            if ((quals & (Q_WRAPPING|Q_SIGNED|Q_UNSIGNED)) != 0) {
                error("Redundant `int` keyword after wrapping/signed/unsigned");
            }
        implicit_int:
            tr->kind = TR_INT; /* TODO set range */
            goto done;
        /* XXX separate size_t type?
           - but the platform dependence makes it trickier
             (behaviour should be the same across platforms)
           - maybe the size_t type could trap on overflow?
             (basically, a variant of OoM, except it's the address space)
           - a size type also makes sense for constraining the length to up
             to the system size, even if the "logical" maximum is larger
             (e.g. if the application allows up to e.g. 2^40 but is compiled
             on a 32 bit platform).
           - BUT with an unknown range of size types, it becomes inconvenient,
             because size can't be converted to int without an assert/if.
           - have a "large-array" type? which could optionally be compressed
             and/or "windowed memory mapped"?
           - a size_t type could also serve as documentation */
        case T_KW_long:
            tr->kind = TR_INT; /* TODO set range */
            goto done;
        TOKEN_CASES_QUALIFIERS
        {
            struct QualifierInfo info = qualinfos[tok - FIRST_QUALIFIER];
            if (qualifier_position >= info.position) {
                fprintf(stderr, "qp=%d, ip=%d\n",
                        qualifier_position, info.position);
                error(info.position == qualifier_position ?
                      "Conflicting type qualifier" :
                      "Wrong order of type qualifiers");
            } else if (info.position == 4 && (quals&Q_WRAPPING) != 0) {
                error("wrapping integers are implicitly unsigned");
            }
            qualifier_position = info.position;
            quals |= info.qual;
            break;
        }
        default:
            error("Expected a type here");
        }
    }
done:
    if (tr->kind == TR_BOOL) {
        if ((quals & (unsigned)~Q_VAR) != 0) {
            error("bool cannot have any qualifiers other than `var`");
        }
    } else if (tr->kind == TR_CLASS) {
        /* TODO also arrays etc.
        */
        if ((quals & (Q_WRAPPING|Q_SIGNED|Q_UNSIGNED)) != 0) {
            error("wrapping/signed/unsigned are not applicable for objects");
        }
    } else {
        if ((quals & (Q_ALIASED|Q_VOLATILE)) != 0) {
            error("aliased/volatile are not applicable for int/bool/long");
        }
    }
    if ((quals & Q_WRAPPING) != 0) {
        quals |= Q_UNSIGNED;
    }
    tr->quals = quals;
    return tr;
}

struct Var *parse_var(struct TreeNode **root, enum VarType vartype,
                      struct Var **list_out)
{
    struct LexemeInfo li;
    struct TreeNode *insresult;
    struct Var *var = malloc(sizeof(struct Var));
    NO_NULL(var);
    /* Parse type */
    var->typeref = parse_type_usage();
    var->next = NULL;
    var->initval = NULL;
    /* Parse identifier */
    expect(&li, T_LowerIdent, "Expected an identifier (lowercase ident)");
    insresult = tree_insert_str(
            root, li.string, li.len, &var->ident.node, sizeof(struct Var));
    if (!insresult->is_new) {
        error("Identifier of variable already in use");
    }
    assert(insresult == &var->ident.node);
    insresult->is_defined = true;
    if (vartype == VAR_ALLOW_INITVAL) {
        enum Token t = tokenize(&li);
        if (t == T_SYM_SingleEqual) {
            var->is_initially_final = 1;
        } else if (t == T_SYM_LArrow) {
            var->is_initially_final = 0;
        } else {
            unread_token();
            goto end_of_initval;
        }
        var->initval = parse_expr();
    }
end_of_initval:
    assert(*list_out == NULL);
    *list_out = var;
    return var;
}
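
/*
 * Illustrative usage sketch, not part of the compiler proper: how a driver
 * might hand a source file to parse_file(). The filename "Point.slul" is a
 * hypothetical example; because the basename starts with an uppercase letter,
 * create_implicit_class() opens an implicit class named after the part of
 * the filename before the extension (here "Point").
 *
 *     FILE *f = fopen("Point.slul", "rb");
 *     if (f != NULL) {
 *         parse_file(f, "Point.slul");
 *         fclose(f);
 *     }
 */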