bootstrap/apichk.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423


/*
 * Functions for checking API soundness / compatibility of modules
 *
 * Copyright © 2026 Samuel Lidén Borell <samuel@kodafritt.se>
 *
 * SPDX-License-Identifier: EUPL-1.2+ OR LGPL-2.1-or-later
 */
#include <assert.h>
#include <string.h>
#include "compiler.h"
#include "semchk.h"

/*
 * Compares a parameter of an implementation against the corresponding
 * parameter of the interface and reports any mismatch.
 *
 * expv:  the parameter as declared in the interface, or NULL if the
 *        interface has no corresponding parameter.
 * implv: the parameter in the implementation. Must not be NULL.
 *
 * An error is reported if expv is NULL or if the names differ. The types
 * are then passed to check_type_compat with TC_EXACT.
 */
void compare_vardef(const struct Var *expv, const struct Var *implv)
{
    size_t namelen;
    assert(implv != NULL);
    if (!expv) {
        error_ident("Parameter in implementation doesn't exist in interface",
                    &implv->ident);
        /* There is nothing to compare against, and expv must not be
           dereferenced below. */
        return;
    }

    /* The names match only if both the length and the bytes are equal */
    namelen = implv->ident.node.length;
    if (expv->ident.node.length != namelen ||
            memcmp(expv->ident.node.name, implv->ident.node.name, namelen)) {
        error_ident("Parameter does not match name in interface",
                    &expv->ident);
    }

    check_type_compat(expv->typeref, implv->typeref, TC_EXACT);
}

/*
 * Orders two identifiers by the bytes of their names.
 *
 * The common prefix is compared with memcmp. If it is identical, then the
 * shorter identifier sorts first (e.g. "ab" comes before "abc").
 *
 * Returns a negative value if a sorts before b, and a positive value if a
 * sorts after b. The identifiers are expected to be distinct; this is
 * asserted. In builds without assertions, 0 is returned for equal
 * identifiers.
 */
static int identcmp(const struct Ident *a, const struct Ident *b)
{
    size_t alen = a->node.length, blen = b->node.length;
    size_t commonlen = alen < blen ? alen : blen;
    if (commonlen) {
        int diff = memcmp(a->node.name, b->node.name, commonlen);
        if (diff) return diff;
    }
    assert(alen != blen); /* duplicate identifiers are not allowed */
    /* The result must depend on both lengths. Otherwise the comparison is
       not antisymmetric, and the sorted order would depend on the order
       of insertion. */
    return (alen > blen) - (alen < blen);
}

/*
   Versioned symbols must come in alphabetical order to have a normalised
   (and deterministic across implementations) order of symbols for computation
   of the API hash.

   add_versioned_type and add_versioned_func are responsible for keeping
   a sorted list of types/functions for each API version.

   XXX The functions use insertion sort, and will be slow if there are a
   lot of symbols in a given API version.
*/

/*
 * Inserts current_type into the list of types of the given API version
 * (ver->types, linked through sincever_next), keeping the list sorted
 * according to identcmp.
 *
 * current_type must not be NULL.
 */
void add_versioned_type(struct VersionDecl *ver)
{
    struct Type **inspoint = &ver->types;
    const struct Ident *ident;

    assert(current_type != NULL);
    ident = &current_type->ident;

    for (;;) {
        struct Type *t = *inspoint;

        /* Insert at the end of the list, or before the first type that
           sorts after the new one */
        if (!t || identcmp(&t->ident, ident) > 0) {
            current_type->sincever_next = t;
            *inspoint = current_type;
            return;
        }
        inspoint = &t->sincever_next;
    }
}

static int funccmp(const struct Func *a, const struct Func *b)
{
    if (a->class_ != b->class_) {
        if (!a->class_) return -1;
        else if (!b->class_) return 1;
        return identcmp(&a->class_->ident, &b->class_->ident);
    }
    return identcmp(&a->ident, &b->ident);
}

/*
 * Inserts current_func into the list of functions of the given API
 * version (ver->funcs, linked through sincever_next), keeping the list
 * sorted according to funccmp.
 *
 * current_func must not be NULL.
 */
void add_versioned_func(struct VersionDecl *ver)
{
    struct Func **link;

    assert(current_func != NULL);

    /* Skip past all functions that do not sort after the new one */
    link = &ver->funcs;
    while (*link && funccmp(*link, current_func) <= 0) {
        link = &(*link)->sincever_next;
    }

    /* Splice the new function in before *link (which may be NULL) */
    current_func->sincever_next = *link;
    *link = current_func;
}

/*
 * Orders an instance variable that is already in a version list (a)
 * against a variable b that belongs to the class b_class.
 *
 * If both belong to the same class (same pointer), they are ordered by
 * variable name. Otherwise, an entry without a class sorts first, and
 * entries of different classes are ordered by the class names.
 */
static int varcmp(const struct VarInVersion *a,
                  const struct Var *b,
                  const struct Type *b_class)
{
    const struct Type *a_class = a->class_;

    if (a_class == b_class) {
        return identcmp(&a->var->ident, &b->ident);
    }
    if (!a_class) {
        return -1;
    }
    if (!b_class) {
        return 1;
    }
    return identcmp(&a_class->ident, &b_class->ident);
}

/*
 * Adds an instance variable of current_type to the list of instance
 * variables of the given API version (ver->instvars, linked through
 * sincever_next), keeping the list sorted according to varcmp.
 *
 * A new VarInVersion list entry is allocated for the variable.
 *
 * ver:     the API version to add the variable to.
 * instvar: the instance variable. Must not be NULL.
 *
 * current_type must not be NULL.
 */
void add_versioned_instancevar(struct VersionDecl *ver,
                               const struct Var *instvar)
{
    struct VarInVersion **inspoint = &ver->instvars;

    assert(instvar != NULL);
    assert(current_type != NULL);
    for (;;) {
        struct VarInVersion *vv = *inspoint;

        /* Insert at the end of the list, or before the first entry that
           sorts after the new one */
        if (!vv || varcmp(vv, instvar, current_type) > 0) {
            struct VarInVersion *newvar = malloc(sizeof(*newvar));
            NO_NULL(newvar);
            /* The class must be recorded. It is read by varcmp when
               further variables are inserted, and by the API hash
               computation. */
            newvar->class_ = current_type;
            newvar->var = instvar;
            newvar->sincever_next = vv;
            *inspoint = newvar;
            return;
        }
        inspoint = &vv->sincever_next;
    }
}


/* TODO decide on a hash function to use */
/* TODO use the "key" parameter or not?
   is it secure to use with a publicly-known key?
   also, it is not defined by RFC-7693, only reserved :( */
/*
 * Placeholder hash implementation. No hash function has been selected
 * yet, so all of the functions below are no-ops that only mark their
 * parameters as used.
 */
struct SomeHashState {
    char dummy;
};

/* Placeholder for initialisation of the hash state. Does nothing. */
static void somehash_init(struct SomeHashState *state, size_t outlen,
                          const unsigned char *key, size_t keylen)
{
    (void)keylen;
    (void)key;
    (void)outlen;
    (void)state;
}

/* Placeholder for feeding data into the hash. Does nothing. */
static void somehash_update(struct SomeHashState *state,
                            const unsigned char *data,
                            size_t datalen)
{
    (void)datalen;
    (void)data;
    (void)state;
}

/* Placeholder for producing the hash. Does nothing; in particular,
   nothing is written to out. */
static void somehash_final(struct SomeHashState *state,
                           unsigned char *out)
{
    (void)out;
    (void)state;
}

/*
 * Tags for the records in the data that is hashed. The values are fed
 * into the hash as single bytes, so they are spelled out explicitly.
 * RECORDKIND_END (zero) terminates a list of records.
 */
enum RecordKind {
    RECORDKIND_END                = 0,
    RECORDKIND_TYPE               = 1,
    RECORDKIND_FUNC               = 2,
    RECORDKIND_VAR                = 3,
    RECORDKIND_INSTANCEVAR        = 4,
    RECORDKIND_PRECEEDING_VERSION = 5
};

/* Feeds a single byte into the hash. */
static void feed_byte(struct SomeHashState *state, unsigned char b)
{
    unsigned char buf[1];

    buf[0] = b;
    somehash_update(state, buf, sizeof(buf));
}

/*
 * Feeds a string of the given length into the hash, followed by a
 * terminating zero byte. For an empty string, only the zero byte is fed.
 */
static void feed_string(struct SomeHashState *state, const char *s, size_t len)
{
    const unsigned char *bytes = (const unsigned char *)s;

    if (len != 0) {
        somehash_update(state, bytes, len);
    }
    feed_byte(state, 0); /* terminator */
}

/* Feeds the name of an identifier into the hash as a zero-terminated
   string. */
static void feed_ident(struct SomeHashState *state, const struct Ident *ident)
{
    const char *name = ident->node.name;
    size_t length = ident->node.length;

    feed_string(state, name, length);
}

/*
 * Feeds a reference to a class into the hash.
 *
 * If class_ is NULL, a single zero byte is fed. Otherwise, the name of
 * the class (which must be non-empty) is fed as a zero-terminated string.
 */
static void feed_classref(struct SomeHashState *state,
                          const struct Type *class_)
{
    /* TODO optimise: emit some special value (e.g. 1) when the class name
            is repeated.
            - perhaps track the last outputted (defined or referenced)
              class name? */
    if (!class_) {
        feed_byte(state, 0);
        return;
    }

    assert(class_->ident.node.length != 0);
    feed_ident(state, &class_->ident);
}

/*
 * Feeds a type reference into the hash: first one byte with the
 * qualifiers, then one byte with the kind of type. For class types, the
 * kind byte is followed by the class name.
 *
 * Only class, bool and int types are handled. Other kinds trigger an
 * assertion failure.
 */
static void feed_typeref(struct SomeHashState *state, const struct TypeRef *tr)
{
    unsigned char qualbits = 0x00;

    /* Translate the qualifiers to the bit values used in the hash */
    if (tr->quals & Q_VAR) {
        qualbits |= 0x01;
    }
    /* TODO "io" qualifier? */
    if (tr->quals & Q_ALIASED) {
        qualbits |= 0x04;
    }
    if (tr->quals & Q_VOLATILE) {
        qualbits |= 0x08;
    }
    /* TODO should these be qualifiers or separate types? */
    if (tr->quals & Q_SIGNED) {
        qualbits |= 0x10;
    }
    if (tr->quals & Q_UNSIGNED) {
        qualbits |= 0x20;
    }
    if (tr->quals & Q_WRAPPING) {
        qualbits |= 0x40;
    }
    feed_byte(state, qualbits);

    switch (tr->kind) {
    case TR_BOOL:
        feed_byte(state, 3);
        break;
    case TR_INT:
        /* TODO integer range and/or different integer types */
        feed_byte(state, 4);
        break;
    case TR_CLASS: {
        const struct Type *cls = tr->u.class_;
        assert(cls != NULL);
        /* TODO external identifiers:
           - kind=2 (instead of 1)
           - API hash etc. */
        feed_byte(state, 1);
        feed_ident(state, &cls->ident);
        break; }
    case TR_UNKNOWN:
    case TR_VOID:
    default:
        /* These kinds are not expected here */
        assert(0);
        break;
    }
}

/*
 * Feeds a variable into the hash: the name (which must be non-empty),
 * then a byte of flags (bit 0 is set if the variable is modifiable),
 * and finally the type.
 */
static void feed_var(struct SomeHashState *state, const struct Var *v)
{
    unsigned char flags;

    assert(v->ident.node.length != 0);
    feed_ident(state, &v->ident);

    flags = v->is_modifiable ? 0x01 : 0x00;
    /* TODO optional etc. */
    feed_byte(state, flags);

    feed_typeref(state, v->typeref);
}

/*
 * Feeds the first count variables of a linked list (linked through the
 * next field) into the hash. Each variable is preceded by a
 * RECORDKIND_VAR byte, and the list is terminated by RECORDKIND_END.
 *
 * The list must contain at least count variables.
 */
static void feed_varlist(struct SomeHashState *state, const struct Var *vars,
                         size_t count)
{
    const struct Var *cur;
    size_t i;

    for (cur = vars, i = 0; i < count; i++, cur = cur->next) {
        assert(cur != NULL);
        feed_byte(state, RECORDKIND_VAR);
        feed_var(state, cur);
    }
    feed_byte(state, RECORDKIND_END);
}

#define APIHASH_SIZE 32

static void compute_api_hash(struct VersionDecl *ver)
{
    struct SomeHashState state;
    struct Type *t;
    struct Func *f;
    struct VarInVersion *v;

    /*somehash_init(&state, 32,
                  ver->preceeding ? ver->preceeding->apihash : NULL, 32);*/
    somehash_init(&state, 32, NULL, 0);
    feed_string(&state, "SLUL API definition to be hashed", 32);
    feed_byte(&state, 0); /* version */

    feed_string(&state, ver->node.name, ver->node.length);

    for (t = ver->types; t; t = t->sincever_next) {
        feed_byte(&state, RECORDKIND_TYPE);
        feed_ident(&state, &t->ident);
        /* TODO type parameters */

        feed_byte(&state, 0); /* TODO flags such as "closed" */
    }
    feed_byte(&state, RECORDKIND_END);

    for (f = ver->funcs; f; f = f->sincever_next) {
        unsigned char funckind;

        feed_byte(&state, RECORDKIND_FUNC);
        feed_classref(&state, f->class_);
        feed_ident(&state, &f->ident);

        assert(!f->is_entry);
        assert(!f->is_service_ctor);
        funckind = 0x00;
        if (f->is_modifying)    funckind |= 0x01;
        if (f->is_constructor)  funckind |= 0x02;
        if (f->is_noreturn)     funckind |= 0x04;
        feed_byte(&state, funckind);

        feed_varlist(&state, f->params, f->num_params);
        feed_varlist(&state, f->returns, f->num_returns);
        /* XXX Perhaps add extensible functions? I.e. where more params can be
           added in future versions. Can it be implemented efficiently across
           all platforms? Is it a good idea? */
    }
    feed_byte(&state, RECORDKIND_END);

    for (v = ver->instvars; v; v = v->sincever_next) {
        feed_byte(&state, RECORDKIND_INSTANCEVAR);
        feed_classref(&state, v->class_);
        feed_var(&state, v->var);
    }
    feed_byte(&state, RECORDKIND_END);

    /* TODO enum values */

    /* The API hash depends on the previous hashes, so it is added last to
       allow for parallel implementations. */
    feed_byte(&state, RECORDKIND_PRECEEDING_VERSION);
    feed_byte(&state, ver->preceeding != NULL ? 1 : 0);
    if (ver->preceeding) {
        somehash_update(&state, ver->preceeding->apihash, APIHASH_SIZE);
    }

    somehash_final(&state, ver->apihash);
}

/*
 * Computes the API hash of every version in mod_declared_versions_list,
 * in list order.
 *
 * NOTE(review): the hash of a version reads the apihash of its preceding
 * version, so the list is presumably ordered such that preceding versions
 * come first. Confirm where the list is built.
 */
void compute_api_hashes(void)
{
    struct VersionDecl *ver;

    for (ver = mod_declared_versions_list; ver != NULL; ver = ver->next) {
        compute_api_hash(ver);
    }
}

/*

 TODO api hash computation.

  pre-requisites:
  1. process exported symbols in this order:
    a. version order.
    b. then kind of symbol (type vs top-level function vs constant/variable)
    c. then alphabetical order.
  2. blake3, in a portable way (ANSI C).
    - portable SlulInt instead of uint32_t, with masks to strip high bits
      if > 32 bits (test this with 64 bits!)
    - portable unsigned char instead of uint8_t, with masks to strip high bits
      if > 8 bits (test this with 16 bits or more).
  2 (alternative option:) use blake2s, since the input is probably small (make
     a guesstimate) and the complexity / I-cache usage / etc. might not be
     worth the possible speedup of blake3.
        - and it has a (finalised) RFC:  RFC-7693

  regarding blake2s:

    - could use the following as the key:
        previous API-hash (or all zeros)
            - NOTE: the key is reserved but NOT defined in RFC-7693!
    - there's an IETF RFC, but it's not a NIST standard.
      also, the RFC is "informational".
      could this be a problem?
    - sometimes blake3 and/or blake2b is supported but not blake2s
      (for example, in the CycloneDX spec). are people moving away from
      blake2s?
        - but on the other hand, CycloneDX has only "blake3" which i think is
          the 512-bit version :(
          but it has several output lengths for blake2b

  regarding SHA-256:

  - it is perhaps considered "more standard" than blake2s, even though the
    latter has an RFC.
  - it is apparently often significantly faster than blake3/2b/2s due to being
    supported in hardware in many processors.
    (but on modern x86_64 there seems to be little difference,
    on 64 .. 4096 byte datasets at least)
  - however, is it as secure? compare security levels of SHA-256 vs blake2s-256
  - a more future-proof hash is preferred since it will be very difficult to
    change.

  regarding SHA-512/256 (truncated SHA-512)

  - this avoids length-extension attacks
  - 64-bit word size
  - standardized and should be future-proof security wise.
  - it is NOT simply truncated. It uses a different initialisation!
  - slow. how many % of the compilation time is used on hashing interfaces?
  - how complex is the code?

  regarding SHA-3 / SHAKE-256:

   - SHAKE-256 is somehow based on Keccak (which SHA-3 is based on as well)
        - note: SHAKE-256 is the variable-sized output version (an XOF),
          while the SHA3-nnn functions have fixed-size output.
   - SHAKE-256 appears to be slower than blake2 (not sure if b or s or both)
   - there's a KangarooTwelve function that is faster, for smaller inputs
        - I assume that this means that SHA-3 is slow on small inputs?
          (and API-hash-preimages will often be small)

  regarding the hash pre-image / "to-be-hashed-data":

   - all items in lists must start with non-zero byte!
     (since zero is used as a terminator byte)
   - (non-closed) classes CAN be repeated to extend them with more fields.
     same for enums.

 */