/* * Functions for checking API soundness / compatibility of modules * * Copyright © 2026 Samuel Lidén Borell * * SPDX-License-Identifier: EUPL-1.2+ OR LGPL-2.1-or-later */ #include #include #include "compiler.h" #include "semchk.h" void compare_vardef(const struct Var *expv, const struct Var *implv) { size_t namelen; assert(implv != NULL); if (!expv) { error_ident("Paramter in implementation doesn't exist in interface", &implv->ident); } namelen = implv->ident.node.length; if (expv->ident.node.length != namelen || memcmp(expv->ident.node.name, implv->ident.node.name, namelen)) { error_ident("Parameter does not match name in interface", &expv->ident); } check_type_compat(expv->typeref, implv->typeref, TC_EXACT); } static int identcmp(const struct Ident *a, const struct Ident *b) { size_t alen = a->node.length, blen = b->node.length; size_t commonlen = alen < blen ? alen : blen; if (commonlen) { int diff = memcmp(a->node.name, b->node.name, commonlen); if (diff) return diff; } assert(alen != blen); /* duplicate identifiers are not allowed */ return alen ? 1 : -1; } /* Versioned symbols must come in alphabetical order to have a normalised (and deterministic across implementations) order of symbols for computation of the API hash. add_versioned_type and add_versioned_func are responsible for keeping a sorted list types/functions for each API version. XXX The functions use insertion sort, and will be slow if there's a lot of symbols in a given API version. 
*/ void add_versioned_type(struct VersionDecl *ver) { struct Type **inspoint = &ver->types; const struct Ident *ident = ¤t_type->ident; for (;;) { struct Type *t = *inspoint; if (!t || identcmp(&t->ident, ident) > 0) { current_type->sincever_next = t; *inspoint = current_type; return; } inspoint = &t->sincever_next; } } static int funccmp(const struct Func *a, const struct Func *b) { if (a->class_ != b->class_) { if (!a->class_) return -1; else if (!b->class_) return 1; return identcmp(&a->class_->ident, &b->class_->ident); } return identcmp(&a->ident, &b->ident); } void add_versioned_func(struct VersionDecl *ver) { struct Func **inspoint = &ver->funcs; assert(current_func != NULL); for (;;) { struct Func *f = *inspoint; if (!f || funccmp(f, current_func) > 0) { current_func->sincever_next = f; *inspoint = current_func; return; } inspoint = &f->sincever_next; } } static int varcmp(const struct VarInVersion *a, const struct Var *b, const struct Type *b_class) { if (a->class_ != b_class) { if (!a->class_) return -1; else if (!b_class) return 1; return identcmp(&a->class_->ident, &b_class->ident); } return identcmp(&a->var->ident, &b->ident); } void add_versioned_instancevar(struct VersionDecl *ver, const struct Var *instvar) { struct VarInVersion **inspoint = &ver->instvars; assert(instvar != NULL); assert(current_type != NULL); for (;;) { struct VarInVersion *vv = *inspoint; if (!vv || varcmp(vv, instvar, current_type) > 0) { struct VarInVersion *newvar = malloc(sizeof(struct VarInVersion)); NO_NULL(newvar); newvar->var = instvar; newvar->sincever_next = vv; *inspoint = newvar; return; } inspoint = &vv->sincever_next; } } /* TODO decide on a hash function to use */ /* TODO use the "key" parameter or not? is it secure to use with a publicly-known key? 
also, it is not defined by RFC-7693, only reserved :( */ struct SomeHashState { char dummy; }; static void somehash_init(struct SomeHashState *state, size_t outlen, const unsigned char *key, size_t keylen) { (void)state; (void)outlen; (void)key; (void)keylen; } static void somehash_update(struct SomeHashState *state, const unsigned char *data, size_t datalen) { (void)state; (void)data; (void)datalen; } static void somehash_final(struct SomeHashState *state, unsigned char *out) { (void)state; (void)out; } enum RecordKind { RECORDKIND_END = 0, RECORDKIND_TYPE = 1, RECORDKIND_FUNC, RECORDKIND_VAR, RECORDKIND_INSTANCEVAR, RECORDKIND_PRECEEDING_VERSION }; static void feed_byte(struct SomeHashState *state, unsigned char b) { somehash_update(state, &b, 1); } static void feed_string(struct SomeHashState *state, const char *s, size_t len) { if (len) { somehash_update(state, (const unsigned char *)s, len); } feed_byte(state, 0); } static void feed_ident(struct SomeHashState *state, const struct Ident *ident) { feed_string(state, ident->node.name, ident->node.length); } static void feed_classref(struct SomeHashState *state, const struct Type *class_) { /* TODO optimise: emit some special value (e.g. 1) when the class name is repeated. - perhaps track the last outputted (defined or referenced) class name? */ if (class_) { assert(class_->ident.node.length != 0); feed_ident(state, &class_->ident); } else { feed_byte(state, 0); } } static void feed_typeref(struct SomeHashState *state, const struct TypeRef *tr) { unsigned char quals = 0x00; if ((tr->quals & Q_VAR) != 0) quals |= 0x01; /* TODO "io" qualifier? */ if ((tr->quals & Q_ALIASED) != 0) quals |= 0x04; if ((tr->quals & Q_VOLATILE) != 0) quals |= 0x08; /* TODO should these be qualifiers or separate types? 
*/ if ((tr->quals & Q_SIGNED) != 0) quals |= 0x10; if ((tr->quals & Q_UNSIGNED) != 0) quals |= 0x20; if ((tr->quals & Q_WRAPPING) != 0) quals |= 0x40; feed_byte(state, quals); switch (tr->kind) { case TR_CLASS: { const struct Type *t = tr->u.class_; assert(t != NULL); /* TODO external identifiers: - kind=2 (instead of 1) - API hash etc. */ feed_byte(state, 1); feed_ident(state, &t->ident); break; } case TR_BOOL: feed_byte(state, 3); break; case TR_INT: /* TODO integer range and/or different integer types */ feed_byte(state, 4); break; case TR_UNKNOWN: case TR_VOID: default: assert(0); break; } } static void feed_var(struct SomeHashState *state, const struct Var *v) { unsigned char varkind = 0x00; assert(v->ident.node.length != 0); feed_ident(state, &v->ident); if (v->is_modifiable) varkind |= 0x01; /* TODO optional etc. */ feed_byte(state, varkind); feed_typeref(state, v->typeref); } static void feed_varlist(struct SomeHashState *state, const struct Var *vars, size_t count) { const struct Var *v = vars; size_t remaining = count; while (remaining--) { assert(v != NULL); feed_byte(state, RECORDKIND_VAR); feed_var(state, v); v = v->next; } feed_byte(state, RECORDKIND_END); } #define APIHASH_SIZE 32 static void compute_api_hash(struct VersionDecl *ver) { struct SomeHashState state; struct Type *t; struct Func *f; struct VarInVersion *v; /*somehash_init(&state, 32, ver->preceeding ? 
ver->preceeding->apihash : NULL, 32);*/ somehash_init(&state, 32, NULL, 0); feed_string(&state, "SLUL API definition to be hashed", 32); feed_byte(&state, 0); /* version */ feed_string(&state, ver->node.name, ver->node.length); for (t = ver->types; t; t = t->sincever_next) { feed_byte(&state, RECORDKIND_TYPE); feed_ident(&state, &t->ident); /* TODO type parameters */ feed_byte(&state, 0); /* TODO flags such as "closed" */ } feed_byte(&state, RECORDKIND_END); for (f = ver->funcs; f; f = f->sincever_next) { unsigned char funckind; feed_byte(&state, RECORDKIND_FUNC); feed_classref(&state, f->class_); feed_ident(&state, &f->ident); assert(!f->is_entry); assert(!f->is_service_ctor); funckind = 0x00; if (f->is_modifying) funckind |= 0x01; if (f->is_constructor) funckind |= 0x02; if (f->is_noreturn) funckind |= 0x04; feed_byte(&state, funckind); feed_varlist(&state, f->params, f->num_params); feed_varlist(&state, f->returns, f->num_returns); /* XXX Perhaps add extensible functions? I.e. where more params can be added in future versions. Can it be implemented efficiently across all platforms? Is it a good idea? */ } feed_byte(&state, RECORDKIND_END); for (v = ver->instvars; v; v = v->sincever_next) { feed_byte(&state, RECORDKIND_INSTANCEVAR); feed_classref(&state, v->class_); feed_var(&state, v->var); } feed_byte(&state, RECORDKIND_END); /* TODO enum values */ /* The API hash depends on the previous hashes, so it is added last to allow for parallel implementations. */ feed_byte(&state, RECORDKIND_PRECEEDING_VERSION); feed_byte(&state, ver->preceeding != NULL ? 1 : 0); if (ver->preceeding) { somehash_update(&state, ver->preceeding->apihash, APIHASH_SIZE); } somehash_final(&state, ver->apihash); } void compute_api_hashes(void) { struct VersionDecl *ver = mod_declared_versions_list; for (; ver; ver = ver->next) { compute_api_hash(ver); } } /* TODO api hash computation. pre-requisites: 1. process exported symbols in this order: a. version order. b. 
        then kind of symbol (type vs top-level function vs
        constant/variable)
     c. then alphabetical order.
  2. blake3, in a portable way (ANSI C).
     - portable SlulInt instead of uint32_t, with masks to strip high
       bits if > 32 bits (test this with 64 bits!)
     - portable unsigned char instead of uint8_t, with masks to strip
       high bits if > 8 bits (test this with 16 bits or more).
  2 (alternative option:)
     use blake2s, since the input is probably small (make a guesstimate)
     and the complexity / I-cache usage / etc. might not be worth the
     possible speedup of blake3.
     - and it has a (finalised) RFC: RFC-7693

  regarding blake2s:
  - could use the following as the key:
    previous API-hash (or all zeros)
  - NOTE: the key is reserved but NOT defined in RFC-7693!
  - there's an IETF RFC, but it's not a NIST standard.
    also, the RFC is "informational". could this be a problem?
  - sometimes blake3 and/or blake2b is supported but not blake2s
    (for example, in the CycloneDX spec).
    are people moving away from blake2s?
  - but on the other hand, CycloneDX has only "blake3" which i think
    is the 512-bit version :( but it has several output lengths for
    blake2b

  regarding SHA-256:
  - it is perhaps considered "more standard" than blake2s,
    even though the latter has an RFC.
  - it is apparently often significantly faster than blake3/2b/2s
    due to being supported in hardware in many processors.
    (but on modern x86_64 there seems to be little difference,
    on 64 .. 4096 byte datasets at least)
  - however, is it as secure?
    compare security levels of SHA-256 vs blake2s-256
  - a more future-proof hash is preferred since it will be very
    difficult to change.

  regarding SHA-512/256 (truncated SHA-512)
  - this avoids length-extension attacks
  - 64-bit word size
  - standardized and should be future-proof security wise.
  - it is NOT simply truncated. It uses a different initialisation!
  - slow. how many % of the compilation time is used on hashing
    interfaces?
  - how complex is the code?

  regarding SHA-3 / SHAKE-256:
  - SHAKE-256 is somehow based on Keccak (which SHA-3 is based on
    as well)
  - looks like SHAKE-256 is the variable-sized output version
    (an extendable-output function), while SHA-3 is the fixed-size
    output version.
  - SHAKE-256 appears to be slower than blake2 (not sure if b or s
    or both)
  - there's a KangarooTwelve function that is faster, for smaller
    inputs
    - I assume that this means that SHA-3 is slow on small inputs?
      (and API-hash-preimages will often be small)

  regarding the hash pre-image / "to-be-hashed-data":
  - all items in lists must start with non-zero byte!
    (since zero is used as a terminator byte)
  - (non-closed) classes CAN be repeated to extend them with more
    fields. same for enums.
*/