/*
 * Declaration parsing routines for the bootstrap compiler.
 *
 * Copyright © 2025 Samuel Lidén Borell
 *
 * SPDX-License-Identifier: EUPL-1.2+
 */
#include <stdio.h>
#include <stdlib.h>
#include "compiler.h"
#include "token.h"

int num_sources = 0;
char *(sources[MAX_SOURCES]) = { 0 };
static bool implicit_class = false;

static void parse_func(void);
static enum Token parse_paramlist(struct Var **list_out, size_t *count_out);
static struct TypeRef *parse_type_usage(void);

void parse_source_index(FILE *f)
{
    char line[SOURCELINE_MAX];
    size_t len;
    while (read_source_line(f, line, &len, STRIP_COMMENTS)) {
        if (num_sources == MAX_SOURCES) {
            FAIL("Too many sources for bootstrap compiler.");
        }
        check_filename(line);
        sources[num_sources++] = memzdup(line, len);
    }
}

/** Creates an implicit class when the filename begins with an uppercase letter. */
static void create_implicit_class(const char *basename)
{
    if (*basename >= 'A' && *basename <= 'Z') {
        const char *fileext = strrchr(basename, '.');
        size_t len;
        NO_NULL(fileext);
        assert(fileext >= basename);
        assert(fileext-basename <= SOURCELINE_MAX);
        len = (size_t)(fileext-basename);
        type_start(basename, len);
        implicit_class = true;
    } else {
        implicit_class = false;
    }
}

/* TODO this should create a record/struct / class / interface
   it would be nice if this could be determined by the filename.

   How Java does it:
   - classes/records have any names
   - interfaces end with -able/-ble
   How C# does it:
   - classes/records have any names
   - interfaces begin with "I"
   How Pascal does it:
   - all types begin with T
   How C does it:
   - built-in types end with _t

   Possible options:
   * Use English-language suffixes like Java:
       -able/-ble = interface
       -er = class (e.g. FileReader)
       others = record/struct
   * Use Pascal/C#-like prefixes:
       I = interface
       C = class
       others = record/struct
     - problems: needs to handle words starting with C, e.g. CssRule
   * Use case of filename to distinguish:
       Uppercase.slul - class with name = filename
       lowercase.slul - mixed no-namespace contents
   * Use directories:
       interfaces/...
       classes/...
       records/...
   * Use file extensions:
       *.if.slul - interfaces
       *.cl.slul - classes
       *.rc.slul - records
       *.slul    - mixed no-namespace file

   Allow putting the full class hierarchy (i.e. incl. subclasses) in a
   single file?
   Use filenames for namespacing only and allow LRL-like "here" definitions?

   Keywords to use?
   - Want to have the same "kind" of word for all of struct/class/interface.
   - It's good if they are the same as in other languages.
       struct record data state
       class thing object
       interface abstract
   Or, use "class" for all, but have keywords for the subtype:
       unique (but that makes it sound like a singleton)
       identity
       interface
       abstract (maybe even it could work like implicit interfaces?)
   - can abstract classes of "identity" (like File) and "non-identity"
     (like Point) appear as base classes at the same time?
   - when is multiple-inheritance fine?
     - interfaces are fine, and interfaces have these properties:
       - multiple-inheritance allowed
       - cannot be constructed
       - do not have data
       - calling super.do_stuff() is not possible in implementing classes
     - abstract classes:
       - multiple-inheritance allowed
       - cannot be constructed
       - CAN have data, but perhaps require it to be private?
         (i.e. not accessible by implementors of the interface)
         - if private, then it should only be accessible from concrete
           methods... otherwise, it would create a "feature imparity"
           between default implementations and overridden implementations.
       - Disallow super.do_stuff().
       - Allow three types of methods/functions:
         - abstract / unimplemented
         - abstract / with default implementation
         - concrete and final
     - concrete classes:
       - perhaps disallow inheritance from concrete classes
         (i.e. all concrete classes are final)

   Simple solution:
   - have records
   - have classes (w/o inheritance)
   - have traits/interfaces

   More useful types:
   - unions/variants/choice
     - how to make it possible to access (and pass around) the enum value
       BUT ALSO not have to repeat oneself in the declaration?
       - perhaps the enum type could be created implicitly.
       - or it could be some kind of generic type perhaps? e.g. if the tail
         of classes could be allowed to be a generic parameter.
   - enums
     - these could have singleton values, like in Java.

   Alternative solution:
   - record, class and enum types
   - Macros for type generation (and usage) of more complex types:
     - unions (is it possible?)
     - interfaces (needs macro for access also)

   Solution with unified classes:
   - trait Abstract = interface/trait
   - trait Data = trait Equal,Copy,HashCode,StableCompare = record
     - perhaps record should provide default impls for equal/copy.
     - and "trait Data" could
   - Closed types?
   - Can runtime type information be avoided?
     - E.g. in a list of abstract-typed elements.
     - Want to avoid in-object RTTI
     - Want to avoid fat pointers (because they would make generic types
       fat as well)
       - but could be optimized with a per-generic-object size flag
         (this could also allow for embedded elements, which would avoid
         an extra indirection - good!)
         - would require one size field per generic param.
         - multiplication factor (to obtain offset) is no longer constant
           (= it's slower)
           - could also allow only {1,8,16,32,64,128} bits
         - could allow for String == List of Byte
           (or perhaps String.aliased_to_list() and String.aliased_from_list())
           - or perhaps skip the aliasing stuff, as long as the String and
             the List are immutable.
     - Want to avoid "concretization wrappers"
       (because of indirection AND because of lifetime / memory alloc)
   - Reference comparison ability:
     - same_as operator for all objects?
     - or trait SameAs? trait Identity?
   - trait SomeAbstractClass = implements that class
   - Uppercase filename = implicit class
   - An initial line with only enum/record = turns Uppercase filename class
     into an enum/record instead of the default of class

   Solution with "here" names:
   - `class file` at start for classes
   - `record file` at start for records
   - `trait file` at start for traits/interfaces
   - `enum file` at start for enums
   Maybe skip the `file` part?

   Other necessary attributes:
   - public/export

   Nested identifiers/namespaces (e.g. directories) and identifier lookup
   at the nested levels.
   - just skip namespacing? it's often more annoying than it helps
     (as an example, consider all the `import` statements in Java source
     code, which are often hidden by default in IDEs anyway; namespacing
     also has a tendency to result in libraries using generic names for
     things, which makes it impossible to know what a Date or ArrayList is
     without checking the imports or by mouse-over in the IDE)
*/
void parse_file(FILE *f, const char *basename)
{
    create_implicit_class(basename);
    /*reset_local_idents();*/
    /* TODO keywords for file-local/module-local/exported classes/functions?
       - local / (none) / export
       - (none) / visible / export
       - (none) / module / global
       - file identifiers only make sense as either module-local or exported.

       Or it could be module-export/file-export sections of each file?
       (similar to `interface` in Pascal)

           contents
           moduleexport
               SomeClass since 0.1
               OtherClass since 0.2
           fileexport
               ModuleInternal
           end

       Or, when prototyping:

           contents auto

           contents
           fileexport all
           end

       Or, shorter:

           module_export SomeClass end
           file_export AnotherClass end

       (although it's still not trivially greppable)

       Alternative keywords:

           exports SomeClass end
           provides AnotherClass end

       Or, with a keyword per line:

           provides SomeClass
           provides AnotherClass

       Or, use a per-project text file:

           file things.slul
               SomeClass
               AnotherClass

       But there needs to be a clear distinction between `moduleexport`
       and `fileexport`. Module exports shouldn't be done accidentally!
       BUT most things will be "provided"/"file_exported" in lowercase
       files. So it might make more sense to just have local for local
       things. At least for types. And a `calledfrom` for functions?

       Regarding module interfaces:
       - Should the module interface be generated with a command?
         (e.g. with `slul make-interface 1.2.0 1.1.34`)
       - Should the module interface be manually created but checked by
         the compiler?
    */
    /* XXX class-local calls vs typeident calls (or enum values):
       (this is only a problem in e.g. `y = b x`, where `b` could be either
       a class-local call (`this.b`) or it could be a constructor in the
       typescope of the type of `y`).
       solutions:
       - `calls` section in function decl, analogous to `modifies`.
         (slul2 would really become a "safetynaut" language :D )
         - and actually, `modifies` is of limited use if it doesn't track
           access from nested function calls.
           - on the other hand, adding a `modifies` line to some deeply
             nested "helper" function could cause a time-consuming "ripple"
             effect, where all (nested!) callers would have to be updated too
           - as a pragmatic solution, perhaps have `calls` and `modifies`
             apply only to the function at hand, and not nested calls.
       - have different naming conventions for typeidents
         (e.g. uppercase, prefix with `new`/`create`/`make`, ...)
         - but it would have to work for enum values, too.
       - related: maybe the `own` qualifier/keyword could be used for
         making ownership transfer clear.
         (ownership and generics is a bit tricky, though... converting
         between "List of own T" and "List of T" is not safe)
       - or just defer bindings until the whole class/file has been parsed.
         - could accumulate all unprefixed (no dot) identifiers in a tree.
         - at the end of the class/file, the tree could be traversed, and
           the identifiers could be searched in this order:
           1. the class
           2. the type (but this isn't known, so this step would need to
              be deferred)
    */
    tokenizer_init(f);
    while (tokenizer_next_line()) {
        enum Token t;
        struct LexemeInfo li;
        if (tokenizer_line_is_indented()) {
            error("Too many `end`s or unexpected indentation at top level");
        }
        t = tokenize(&li);
        switch ((int)t) {
        case T_EOL:
            /* E.g. line with only a comment */
            break;
        case T_KW_func:
            /* TODO only allow inside a type?
               (could still allow extending types, even in "utility"
               source files)
               perhaps allow static methods in types?
               or allow namespace-only "types"?
               or allow it, and have imports go into namespaces?
            */
            parse_func();
            break;
        /* TODO more top-level keywords */
        default:
            error("Unexpected token at top level");
        }
        /* printf("tok =");
        for (;;) {
            t = tokenize(&li);
            if (t == T_EOL) break;
            printf(" %d", t);
        }
        printf("\n");*/
        /* TODO check type of top-level type */
    }
    if (implicit_class) {
        type_end();
    }
}

static void parse_func(void)
{
    struct LexemeInfo li;
    enum SectionKind { PARAMS, RETURNS, CODE };
    enum SectionKind section;
    expect(&li, T_LowerIdent, "Expected function name (lowercase ident)");
    func_start(li.string, li.len);
    expect_next_line();
    section = PARAMS;
    for (;;) {
        enum Token t;
        enum SectionKind next_section;
        switch (section) {
        case PARAMS:
            t = parse_paramlist(&current_func->params,
                                &current_func->num_params);
            break;
        case RETURNS:
            t = parse_paramlist(&current_func->returns,
                                &current_func->num_returns);
            break;
        case CODE:
            parse_func_body();
            goto end;
        }
        switch ((int)t) {
        case T_KW_end:
            goto end;
        case T_KW_returns:
            next_section = RETURNS;
            break;
        case T_KW_code:
            next_section = CODE;
            break;
        default:
            error("Unexpected symbol in function definition");
        }
        expect_next_line();
        if (next_section == section) {
            error("Duplicate section in function declaration");
        } else if (next_section < section) {
            error("Wrong order of sections in function declaration");
        }
        section = next_section;
    }
end:
    func_end();
}

static enum Token parse_paramlist(struct Var **list_out, size_t *count_out)
{
    struct LexemeInfo li;
    size_t count = 0;
    struct Var **nextptr;
    *list_out = NULL;
    nextptr = list_out;
    for (;;) {
        enum Token t;
        struct Var *var;
        t = tokenize(&li);
        /* TODO var/out parameters
           (again, there's a difference between modifying an object and
           replacing it...) */
        switch ((int)t) {
        case T_UpperIdent:
        case T_KW_bool:
        case T_KW_byte:
        case T_KW_int:
        case T_KW_long:
        TOKEN_CASES_QUALIFIERS
            break;
        default:
            if (count_out) {
                *count_out = count;
            }
            return t;
        }
        unread_token();
        var = parse_var(&current_func->vars, VAR_DECL_ONLY, nextptr);
        nextptr = &var->next;
        var->is_funcparam = 1;
        expect_next_line();
        count++;
        if (count > FUNCPARAMS_MAX) {
            error("Too many parameters");
        }
    }
}

/* TODO parsing of type decls. this needs to handle:
   - both top-level and nested types
       struct Type **inspoint;
       inspoint = (current_type ? &current_type->inner_types : &module->types);
   - generic types
*/
struct QualifierInfo {
    unsigned qual;
    int position;
};
static struct TypeRef *parse_type_usage(void)
{
    enum Token tok;
    struct LexemeInfo li;
    struct TypeRef *tr = malloc(sizeof(struct TypeRef));
    int qualifier_position = 0;
    static const struct QualifierInfo qualinfos[NUM_QUALIFIERS] = {
        /* TODO "modifiable object" vs "replaceable object"
           could use <- vs = operators for "replaceable", but that doesn't
           work inside structs!
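
           (The second field of each entry below is the qualifier's position
           in the canonical ordering: var < aliased/volatile < wrapping <
           signed/unsigned. The TOKEN_CASES_QUALIFIERS handler further down
           rejects a repeat of an already-used position and qualifiers given
           out of order, e.g. `unsigned signed int` or `aliased var SomeClass`.)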
        */
        { Q_VAR, 1 },
        { Q_ALIASED, 2 },
        { Q_VOLATILE, 2 },
        { Q_WRAPPING, 3 },
        { Q_SIGNED, 4 },
        { Q_UNSIGNED, 4 }
    };
    unsigned quals = 0;
    NO_NULL(tr);
    /* TODO needs to handle:
       - arrays
       - optional types
       - generic types */
    for (;;) {
        tok = tokenize(&li);
        switch ((int)tok) {
        /* Types */
        case T_UpperIdent:
            tr->kind = TR_CLASS;
            tr->u.class_ = map_named_type(li.string, li.len);
            goto done;
        case T_LowerIdent:
            if ((quals & (Q_WRAPPING|Q_SIGNED|Q_UNSIGNED)) == 0) {
                error("Expected a type");
            }
            unread_token();
            goto implicit_int;
        case T_KW_bool:
            tr->kind = TR_BOOL;
            goto done;
        case T_KW_byte:
            if ((quals & Q_UNSIGNED) != 0) {
                error("The `byte` type always has the full unsigned range");
            }
            tr->kind = TR_INT; /* TODO set range */
            goto done;
        case T_KW_int:
            if ((quals & (Q_WRAPPING|Q_SIGNED|Q_UNSIGNED)) != 0) {
                error("Redundant `int` keyword after wrapping/signed/unsigned");
            }
        implicit_int:
            tr->kind = TR_INT; /* TODO set range */
            goto done;
        /* XXX separate size_t type?
           - but the platform dependence makes it trickier
             (behaviour should be the same across platforms)
           - maybe the size_t type could trap on overflow?
             (basically, a variant of OoM, except it's the address space)
           - a size type also makes sense for constraining the length to up
             to the system size, even if the "logical" maximum is larger
             (e.g. if the application allows up to e.g. 2^40 but is compiled
             on a 32 bit platform).
           - BUT with an unknown range of size types, it becomes inconvenient,
             because size can't be converted to int without an assert/if.
           - have a "large-array" type? which could optionally be compressed
             and/or "windowed memory mapped"?
           - a size_t type could also serve as documentation */
        case T_KW_long:
            tr->kind = TR_INT; /* TODO set range */
            goto done;
        TOKEN_CASES_QUALIFIERS
        {
            struct QualifierInfo info = qualinfos[tok - FIRST_QUALIFIER];
            if (qualifier_position >= info.position) {
                fprintf(stderr, "qp=%d, ip=%d\n",
                        qualifier_position, info.position);
                error(info.position == qualifier_position ?
                      "Conflicting type qualifier" :
                      "Wrong order of type qualifiers");
            } else if (info.position == 4 && (quals&Q_WRAPPING) != 0) {
                error("wrapping integers are implicitly unsigned");
            }
            qualifier_position = info.position;
            quals |= info.qual;
            break;
        }
        default:
            error("Expected a type here");
        }
    }
done:
    if (tr->kind == TR_BOOL) {
        if ((quals & (unsigned)~Q_VAR) != 0) {
            error("bool cannot have any qualifiers other than `var`");
        }
    } else if (tr->kind == TR_CLASS) {
        /* TODO also arrays etc.
        */
        if ((quals & (Q_WRAPPING|Q_SIGNED|Q_UNSIGNED)) != 0) {
            error("wrapping/signed/unsigned are not applicable for objects");
        }
    } else {
        if ((quals & (Q_ALIASED|Q_VOLATILE)) != 0) {
            error("aliased/volatile are not applicable for int/bool/long");
        }
    }
    if ((quals & Q_WRAPPING) != 0) {
        quals |= Q_UNSIGNED;
    }
    tr->quals = quals;
    return tr;
}

struct Var *parse_var(struct TreeNode **root, enum VarType vartype,
                      struct Var **list_out)
{
    struct LexemeInfo li;
    struct TreeNode *insresult;
    struct Var *var = malloc(sizeof(struct Var));
    NO_NULL(var);
    /* Parse type */
    var->typeref = parse_type_usage();
    var->next = NULL;
    var->initval = NULL;
    /* Parse identifier */
    expect(&li, T_LowerIdent, "Expected an identifier (lowercase ident)");
    insresult = tree_insert_str(
            root, li.string, li.len, &var->ident.node, sizeof(struct Var));
    if (!insresult->is_new) {
        error("Identifier of variable already in use");
    }
    assert(insresult == &var->ident.node);
    insresult->is_defined = true;
    if (vartype == VAR_ALLOW_INITVAL) {
        enum Token t = tokenize(&li);
        if (t == T_SYM_SingleEqual) {
            var->is_initially_final = 1;
        } else if (t == T_SYM_LArrow) {
            var->is_initially_final = 0;
        } else {
            unread_token();
            goto end_of_initval;
        }
        var->initval = parse_expr();
    }
end_of_initval:
    assert(*list_out == NULL);
    *list_out = var;
    return var;
}
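
/*
 * Illustrative usage sketch, not part of the compiler proper: how a driver
 * might hand a source file to parse_file(). The filename "Point.slul" is a
 * hypothetical example; because the basename starts with an uppercase letter,
 * create_implicit_class() opens an implicit class named after the part of
 * the filename before the extension (here "Point").
 *
 *     FILE *f = fopen("Point.slul", "rb");
 *     if (f != NULL) {
 *         parse_file(f, "Point.slul");
 *         fclose(f);
 *     }
 */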