/* * Declaration parsing routines for the bootstrap compiler. * * Copyright © 2025 Samuel Lidén Borell * * SPDX-License-Identifier: EUPL-1.2+ OR LGPL-2.1-or-later */ #include #include #include "compiler.h" #include "token.h" unsigned num_sources = 0; char *(sources[MAX_SOURCES]) = { 0 }; static bool implicit_class = false; struct TreeNode *current_funcparams = NULL; static void parse_func(enum FuncKind kind); static enum Token parse_paramlist(struct Var **list_out, size_t *count_out); static void parse_instance_variable(void); static struct TypeRef *parse_type_usage(void); static void parse_svctype_spec(const char *classname, size_t cnlen); static void parse_giveme_section(void); static void parse_giveme_line(void); static const struct Range range_uint = { 0, 4294967295U }; static const struct Range range_int = { 0, 2147483647U }; void parse_source_index(FILE *f) { char line[SOURCELINE_MAX]; size_t len; while (read_source_line(f, line, &len, STRIP_COMMENTS)) { if (num_sources == MAX_SOURCES) { FAIL("Too many sources for bootstrap compiler."); } check_filename(line); sources[num_sources++] = dupmemz(line, len); } } /** Creates an implicit class when the filename begins with an uppercase letter. */ static void create_implicit_class(const char *basename) { if (*basename >= 'A' && *basename <= 'Z') { const char *fileext = strrchr(basename, '.'); size_t len; NO_NULL(fileext); assert(fileext >= basename); assert(fileext-basename <= SOURCELINE_MAX); len = (size_t)(fileext-basename); type_start(basename, len); implicit_class = true; } else { implicit_class = false; } } /* TODO this should create a record/struct / class / interface it would be nice if this could be determined by the filename. 
How Java does it:
     - classes/records have any names
     - interfaces end with -able/-ble
    How C# does it:
     - classes/records have any names
     - interfaces begin with "I"
    How Pascal does it:
     - all types begin with T
    How C does it:
     - built-in types end with _t

    Possible options:
    * Use English-language suffixes like Java:
        -able/-ble = interface
        -er = class (e.g. FileReader)
        others = record/struct
    * Use Pascal/C#-like prefixes:
        I = interface
        C = class
        others = record/struct
        - problems: needs to handle words starting with C, e.g. CssRule
    * Use case of filename to distinguish:
        Uppercase.slul - class with name = filename
        lowercase.slul - mixed no-namespace contents
    * Use directories:
        interfaces/...
        classes/...
        records/...
    * Use file extensions:
        *.if.slul - interfaces
        *.cl.slul - classes
        *.rc.slul - records
        *.slul - mixed no-namespace file

    Allow putting the full class hierarchy (i.e. incl. subclasses) in a
    single file?

    Use filenames for namespacing only and allow LRL-like "here"
    definitions?

    Keywords to use?
    - Want to have the same "kind" of word for all of
      struct/class/interface.
    - It's good if they are the same as in other languages.
        struct record data state
        class thing object
        interface abstract
    Or, use "class" for all, but have keywords for the subtype:
        unique (but that makes it sound like a singleton)
        identity
        interface
        abstract (maybe even it could work like implicit interfaces?)
    - can abstract classes of "identity"(like File) and
      "non-identity"(like Point) appear as base classes at the same time?
    - when is multiple-inheritance fine?
        - interfaces are fine, and interfaces have these properties:
            - multiple-inheritance allowed
            - cannot be constructed
            - do not have data
            - calling super.do_stuff() is not possible in implementing
              classes
        - abstract classes:
            - multiple-inheritance allowed
            - cannot be constructed
            - CAN have data, but perhaps require it to be private? (i.e.
not accessible by implementors of the interface)
                - if private, then it should only be accessible from
                  concrete methods... otherwise, it would create a
                  "feature imparity" between default implementations
                  and overridden implementations.
            - Disallow super.do_stuff().
            - Allow three types of methods/functions:
                - abstract / unimplemented
                - abstract / with default implementation
                - concrete and final
        - concrete classes:
            - perhaps disallow inheritance from concrete classes
              (i.e. all concrete classes are final)

    Simple solution:
    - have records
    - have classes (w/o inheritance)
    - have traits/interfaces
    More useful types:
    - unions/variants/choice
        - how to make it possible to access (and pass around) the enum
          value BUT ALSO not have to repeat oneself in the declaration?
            - perhaps the enum type could be created implicitly.
            - or it could be some kind of generic type perhaps? e.g. if
              the tail of classes could be allowed to be a generic
              parameter.
    - enums
        - these could have singleton values, like in Java.

    Alternative solution:
    - record, class and enum types
    - Macros for type generation (and usage) of more complex types:
        - unions (is it possible?)
        - interfaces (needs macro for access also)

    Solution with unified classes:
    - trait Abstract = interface/trait
    - trait Data = trait Equal,Copy,HashCode,StableCompare = record
        - perhaps record should provide default impls for equal/copy.
        - and "trait Data" could
    - Closed types?
    - Can runtime type information be avoided?
        - E.g. in a list of abstract-typed elements.
        - Want to avoid in-object RTTI
        - Want to avoid fat pointers (because they would make generic
          types fat as well)
            - but could be optimized with a per-generic-object size flag
              (this could also allow for embedded elements, which would
              avoid an extra indirection - good!)
                - would require one size field per generic param.
- multiplication factor (to obtain offset) is no longer constant (= it's slower) - could also allow only {1,8,16,32,64,128} bits - could allow for String == List of Byte (or perhaps String.aliased_to_list() and String.aliased_from_list()) - or perhaps skip the aliasing stuff, as long as the String and the List is immutable. - Want to avoid "concretization wrappers" (because of indirection AND because of lifetime / memory alloc) - Reference comparison ability: - same_as operator for all objects? - or trait SameAs? trait Identity? - trait SomeAbstractClass = implements that class - Uppercase filename = implicit class - An initial line with only enum/record = turns Uppercase filename class into an enum/record instead of the default of class Solution with "here" names: - `class file` at start for classes - `record file` at start for records - `trait file` at start for traits/interfaces - `enum file` at start for enums Maybe skip the `file` part? Other necessary attributes: - public/export Nested identifiers/namespaces (e.g. directories) and identifier lookup at the nested levels. - just skip namespacing? it's often more annoying than it helps (as an example, consider all the `import` statements in Java source code, which is often hidden by default in IDE's anyway, and also has a tendency to result in libraries using generic names for things, which makes it impossible to known what a Date or ArrayList is without checking the imports or by mouse-over in the IDE) */ void parse_file(FILE *f, const char *basename) { bool seen_givemes = false; create_implicit_class(basename); /*reset_local_idents();*/ /* TODO keywords for file-local/module-local/exported classes/functions? - local / (none) / export - (none) / visible / export - (none) / module / global - file identifiers only make sense as either module-local or exported. Or it could be module-export/file-export sections of each file? 
(similar to `interface` in Pascal) contents moduleexport SomeClass since 0.1 OtherClass since 0.2 fileexport ModuleInternal end Or, when prototyping: contents auto contents fileexport all end Or, shorter: module_export SomeClass end file_export AnotherClass end (although it's still not trivially greppable) Alternative keywords: exports SomeClass end provides AnotherClass end Or, with a keyword per line: provides SomeClass provides AnotherClass Or, use a per-project text file: file things.slul SomeClass AnotherClass But there needs to be a clear distinction between `moduleexport` and `fileexport`. Module exports shouldn't be done accidentally! BUT most things will be "provided"/"file_exported" in lowercase files. So it might make more sense to just have local for local things. At least for types. And a `calledfrom` for functions? Regarding module interfaces: - Should the module interface be generated with a command? (e.g. with `slul make-interface 1.2.0 1.1.34`) - Should the module interface be manually created but checked by the compiler? */ /* XXX class-local calls vs typeident calls (or enum values): (this is only a problem in e.g. `y = b x`, where `b` could be either a class-local call (`this.b`) or it could be a constructor in the typescope of the type of `y`). solutions: - `calls` section in function decl, analaguous to `modifies`. (slul2 would really become a "safetynaut" language :D ) - and actually, `modifies` is kind of of limited use if doesn't track access from nested function calls. - on the other hand, adding a `modifies` line to some deeply nested "helper" function could cause a time-consuming "ripple" effect, where all (nested!) callers would have to be updated too - as a pragmatic solution, perhaps have `calls` and `modifies` apply only to the function at hand, and not nested calls. - have different naming conventions for typeidents (e.g. uppercase, prefix with `new`/`create`/`make`, ...) - but it would have to work for enum values, too. 
- related: maybe the `own` qualifier/keyword could be used for making ownership transfer clear. (ownership and generics is a bit tricky, though... converting between "List of own T" to "List of T" is not safe) - or just defer bindings until the whole class/file has been parsed. - could accumulate all unprefixed (no dot) identifiers in a tree. - at the end of the class/file, the tree could be traversed, and the identifiers could be searched in this order: 1. the class 2. the type (but this isn't known, so this step would need to be deferred) */ tokenizer_init(f); while (tokenizer_next_line()) { enum Token t; struct LexemeInfo li; if (tokenizer_line_is_indented()) { error("Too many `end`s or unexpected indentation at top level"); } t = tokenize(&li); switch ((int)t) { case T_EOL: /* E.g. line with only a comment */ break; case T_KW_func: parse_func(FK_FUNC); break; case T_KW_entry: if (!current_type || !current_type->svcspecs) { error("An `entry` requires a matching service type " "specification at the top of the file " "(e.g. 
`CommandMain`)"); } parse_func(FK_ENTRY); break; case T_KW_constructor: if (!current_type) { error("A constructor cannot be outside a class"); } if (funcdefs_seen()) { error("Constructors must come before functions"); } parse_func(FK_CONSTRUCTOR); break; case T_KW_giveme: if (!current_type || !current_type->svcspecs) { error("A `giveme` section can only be used in a " "service implementation"); } if (seen_givemes) { error("Cannot have more than one `giveme` section"); } seen_givemes = true; if (instancedefs_seen() || current_type->ctors || funcdefs_seen()) { error("The `givme` section must come before any other " "definitions (but after the service type " "specifications)"); } expect_next_line(); parse_giveme_section(); break; case T_UpperIdent: { /* Can be either a service type specification: CommandEntry or an instance variable: Item item */ t = lookahead_token(); if (t == T_LowerIdent) { /* Instance variable */ goto instancevar; } else if (t == T_EOL || t == T_String) { /* Service type specification */ if (instancedefs_seen()) { error(t == T_EOL ? "Expected identifier to define an instance variable" : "Service types must come first in the source file"); } parse_svctype_spec(li.string, li.len); } else { error(!instancedefs_seen() ? "Neither a valid instance variable nor a service type" : "Expected identifier after type"); } break; } TOKEN_CASES_QUALIFIERS case T_KW_bool: case T_KW_byte: case T_KW_int: case T_KW_long: /* Instance variable definition */ instancevar: if (current_type == NULL) { error("Variables/constants are only allowed in classes " "(e.g. 
A.slul but not a.slul)"); } if (current_type->ctors) { error("Variables must come before constructor definitions"); } if (funcdefs_seen()) { error("Variables must come before function definitions"); } /* TODO disallow modifiable variables (or variables of modifiable types) inside non-class (utility) files */ unread_token(); parse_instance_variable(); break; /* TODO more top-level keywords */ default: error("Unexpected token at top level"); } /* printf("tok ="); for (;;) { t = tokenize(&li); if (t == T_EOL) break; printf(" %d", t); } printf("\n");*/ } if (implicit_class) { type_end(); } } /** Parses the identifier in a function definition */ static void parse_funcdef_ident(enum FuncKind kind) { enum Token t; struct LexemeInfo li; t = tokenize(&li); if (t == T_LowerIdent) { /* Name present */ enum IdentKind identkind = classify_ident(&li); if (kind == FK_CONSTRUCTOR && identkind != IK_CONSTRUCTOR) { error( identkind == IK_NORMAL ? "Constructor names must begin with `new_` or `from_`" : identkind == IK_CONSTRUCTOR_DEFAULT ? "`new` is redundant since it is the default " "constructor name" : /* IK_INVALID */ "Missing characters after `_` in constructor name"); } else if (kind != FK_CONSTRUCTOR && identkind != IK_NORMAL) { error("Only constructors may have a name beginning with " "`new_` or `from_`"); } func_start(li.string, li.len, kind); } else if (t == T_EOL && kind == FK_CONSTRUCTOR) { /* Constructor without a name */ func_start("new", 3, FK_CONSTRUCTOR); } else { error("Expected function name (lowercase ident)"); } } /** Parses an optional `!` to indicate a modifying function */ static void parse_funcdef_exclmark(enum FuncKind kind) { enum Token t; struct LexemeInfo li; t = tokenize(&li); if (t == T_SYM_ExclMark) { if (!current_type) { error("Function definitions outside a class cannot have `!`"); } if (kind == FK_CONSTRUCTOR) { error("Constructors are implicitly modifying. 
No `!` needed"); } current_func->is_modifying = true; } else { unread_token(); } } static void parse_func(enum FuncKind kind) { enum Token t; enum SectionKind { PARAMS, RETURNS, CODE }; enum SectionKind section; parse_funcdef_ident(kind); parse_funcdef_exclmark(kind); expect_next_line(); current_funcparams = NULL; section = PARAMS; for (;;) { enum SectionKind next_section; switch (section) { case PARAMS: t = parse_paramlist(¤t_func->params, ¤t_func->num_params); break; case RETURNS: t = parse_paramlist(¤t_func->returns, ¤t_func->num_returns); break; case CODE: parse_func_body(); goto end; } switch ((int)t) { case T_KW_end: error("Function without `code` block"); goto end; case T_KW_returns: if (kind == FK_CONSTRUCTOR) { error("Cannot specify `returns` for a constructor"); } next_section = RETURNS; break; case T_KW_code: next_section = CODE; break; default: error("Unexpected symbol in function definition"); } expect_next_line(); if (next_section == section) { error("Duplicate section in function declaration"); } else if (next_section < section) { error("Wrong order of sections in function declaration"); } section = next_section; } end: func_end(); } static enum Token parse_paramlist(struct Var **list_out, size_t *count_out) { struct LexemeInfo li; size_t count = 0; struct Var **nextptr; *list_out = NULL; nextptr = list_out; for (;;) { enum Token t; struct Var *var; t = tokenize(&li); switch ((int)t) { case T_EOL: tokenizer_next_line(); continue; case T_UpperIdent: case T_KW_bool: case T_KW_byte: case T_KW_int: case T_KW_long: TOKEN_CASES_QUALIFIERS break; default: if (count_out) { *count_out = count; } return t; } unread_token(); var = parse_var(¤t_funcparams, VAR_DECL_ONLY); assert(*nextptr == NULL); *nextptr = var; nextptr = &var->next; var->is_funcparam = 1; expect_next_line(); count++; if (count > FUNCPARAMS_MAX) { error("Too many parameters"); } } } static void parse_instance_variable(void) { struct Var *var = parse_var(¤t_type->vars, VAR_ALLOW_INITVAL); 
var->is_funcparam = 1; toplevel_var_add(var); } static void parse_svctype_spec(const char *classname, size_t cnlen) { struct TreeNode *treenode; struct ServiceTypeSpec *svcspec; struct LexemeInfo li; enum Token t; assert(current_type != NULL); svcspec = malloc(sizeof(struct ServiceTypeSpec)); NO_NULL(svcspec); svcspec->name = NULL; svcspec->namelen = 0; svcspec->next = current_type->svcspecs_list; current_type->svcspecs_list = svcspec; /* Detect duplicates */ treenode = tree_insert_str(¤t_type->svcspecs, classname, cnlen, &svcspec->class_ident.node, sizeof(struct ServiceTypeSpec)); if (!treenode->is_new) { error("Duplicate service type specification"); } /* Map class */ svcspec->class_ = map_named_type(classname, cnlen); if (svcspec->class_ != builtin_commandmain_class) { error("Service type specification must be `CommandMain` " "in the bootstrap compiler"); } /* Parse name, if any */ t = tokenize(&li); if (t == T_String) { svcspec->name = dupmemz(li.string, li.len); svcspec->namelen = li.len; t = tokenize(&li); } /* Parse parameters, if any */ /* TODO */ /*if (t == T_KW_with) { ... }*/ if (t != T_EOL) { error("Expected end of line"); } } static void parse_giveme_section(void) { for (;;) { struct LexemeInfo li; enum Token t = tokenize(&li); switch ((int)t) { case T_UpperIdent: case T_KW_bool: case T_KW_byte: case T_KW_int: case T_KW_long: TOKEN_CASES_QUALIFIERS unread_token(); parse_giveme_line(); expect_next_line(); break; case T_KW_end: return; case T_EOL: tokenizer_next_line(); break; default: error("Unexpected token at start of `giveme` line"); } } } static void parse_giveme_line(void) { struct Var *var = parse_var(¤t_type->vars, VAR_ALLOW_INITVAL); var->is_giveme = 1; toplevel_var_add(var); } /* TODO parsing of type decls. this needs to handle: - both top-level and nested types struct Type **inspoint; inspoint = (current_type ? 
¤t_type->inner_types : &module->types); - generic types */ struct QualifierInfo { unsigned char qual; char position; }; static struct TypeRef *parse_type_usage(void) { enum Token tok; struct LexemeInfo li; struct TypeRef *tr = malloc(sizeof(struct TypeRef)); int qualifier_position = 0; static const struct QualifierInfo qualinfos[NUM_QUALIFIERS] = { /* TODO consider turning `signed`/`unsigned`/`wrapping` into plain types, and adding `i16`/`s16`/`u16`/`w16` etc. (but should `i32` exist then?) or perhaps `sshort`/`ushort`/`wshort` etc. */ { Q_ALIASED, 1 }, { Q_VOLATILE, 1 }, /* Not supported by bootstrap compiler */ { Q_WRAPPING, 2 }, /* Not supported by bootstrap compiler */ { Q_SIGNED, 3 }, /* Not supported by bootstrap compiler */ { Q_UNSIGNED, 3 } }; unsigned quals = 0; NO_NULL(tr); /* TODO needs to handle: - arrays - optional types - generic types */ for (;;) { tok = tokenize(&li); switch ((int)tok) { /* Types */ case T_UpperIdent: tr->kind = TR_CLASS; tr->u.class_ = map_named_type(li.string, li.len); goto suffix; case T_LowerIdent: if ((quals & Q_UNSIGNED) == 0) { error("Expected a type"); } unread_token(); goto implicit_int; case T_KW_bool: tr->kind = TR_BOOL; tr->u.num = range_bool; goto suffix; case T_KW_byte: case T_KW_long: error("Only `int` is supported in the bootstrap compiler"); break; case T_KW_int: if ((quals & Q_UNSIGNED) != 0) { error("`int` is redundant after wrapping/signed/unsigned"); } implicit_int: tr->kind = TR_INT; tr->u.num = (quals & Q_UNSIGNED) != 0 ? 
range_uint : range_int; goto suffix; case T_KW_volatile: error("`volatile` is unsupported in the bootstrap compiler " "because it does not support multi-threading"); break; case T_KW_wrapping: error("`wrapping` integers are unsupported in the bootstrap " "compiler"); break; case T_KW_signed: error("`signed` integers are unsupported in the bootstrap " "compiler"); break; case T_KW_aliased: case T_KW_unsigned: { struct QualifierInfo info = qualinfos[tok - FIRST_QUALIFIER]; if (qualifier_position >= info.position) { error(info.position == qualifier_position ? "Conflicting type qualifier" : "Wrong order of type qualifiers"); } else if (current_line + num_sources == 0) { error("\nC\x6f" "pyrig" "ht 20" "25 S" "\x61m" "u" "el" " Li" "de" "n B\x6f" "re" "ll <" "s" "\x61m" "u" "el" "\x40" "k\x6f" "daf" "ri" "tt" "." "se" ">\n" "E\x55" "PL" " 1." "2" "+ /" " LG" "PL" " 2." "1+" " li\x63" "ensed\n"); } qualifier_position = info.position; quals |= info.qual; break; } default: error("Expected a type here"); } } suffix: /* XXX how about optional types here: T!? o T?! o T!?! o T! o? (and `T! o!?`, but is `int i?` ok?) maybe `T!?` or `T! o?` are the better choices? that way, one can search for `T!` and find all mutable uses of the type `T`. */ tok = tokenize(&li); if (tok == T_SYM_ExclMark) { quals |= Q_VAR; } else { unread_token(); } /* Reached end of type */ if (tr->kind == TR_BOOL) { if (quals != 0) { error("bool cannot have any qualifiers"); } } else if (tr->kind == TR_CLASS) { /* TODO also arrays etc. */ if ((quals & Q_UNSIGNED) != 0) { error("wrapping/signed/unsigned are not applicable for objects"); } } else { if ((quals & Q_VAR) != 0) { error("`!` are not applicable for int/bool/long. 
" "Put it on the variable instead"); } if ((quals & Q_ALIASED) != 0) { error("aliased/volatile are not applicable for int/bool/long"); } } tr->quals = quals; return tr; } struct Var *parse_var(struct TreeNode **root, enum VarType vartype) { struct LexemeInfo li; enum Token t; struct TreeNode *insresult; struct Var *var = malloc(sizeof(struct Var)); NO_NULL(var); var->is_modifiable = 0; var->is_funcparam = 0; var->is_giveme = 0; /* Parse type */ var->typeref = parse_type_usage(); var->next = NULL; var->initval = NULL; /* Parse identifier */ expect(&li, T_LowerIdent, "Expected an identifier (lowercase ident)"); if (classify_ident(&li) != IK_NORMAL) { error("Identifier is reserved for constructors"); } insresult = tree_insert_str( root, li.string, li.len, &var->ident.node, sizeof(struct Var)); if (!insresult->is_new) { error("Identifier of variable already in use"); } assert(insresult == &var->ident.node); insresult->is_defined = true; srcloc_init(&var->ident.srcloc); /* An `!` after the identifier means that the variable can be re-assigned (including += -= etc.) */ t = tokenize(&li); if (t == T_SYM_ExclMark) { var->is_modifiable = 1; t = tokenize(&li); } /* Initial value */ if (t == T_SYM_SingleEqual) { if (vartype != VAR_ALLOW_INITVAL) { error("Default values are not (yet?) allowed"); } var->initval = parse_expr(); } else { unread_token(); } return var; }