/* bootstrap/escape.c */


/*
 * String escape handling.
 *
 * Copyright © 2026 Samuel Lidén Borell <samuel@kodafritt.se>
 *
 * SPDX-License-Identifier: EUPL-1.2+ OR LGPL-2.1-or-later
 */
#include <stdlib.h>
#include "token.h"

/*
 * Converts a single hexadecimal digit character to its numeric value.
 *
 * Accepts '0'-'9', 'a'-'f' and 'A'-'F'. Any other character is reported
 * through error().
 */
static unsigned hexchar(char c)
{
    /*
     * Distance of c from the start of each accepted range. When c lies
     * below a range start the (int) difference is negative and wraps to a
     * huge unsigned value, so one upper-bound comparison per range covers
     * both ends of the range.
     */
    const unsigned from_digit = (unsigned)(c - '0');
    const unsigned from_lower = (unsigned)(c - 'a');
    const unsigned from_upper = (unsigned)(c - 'A');

    if (from_digit <= (unsigned)('9' - '0')) {
        return from_digit;
    }
    if (from_lower <= (unsigned)('f' - 'a')) {
        return from_lower + 10;
    }
    if (from_upper <= (unsigned)('F' - 'A')) {
        return from_upper + 10;
    }

    error("Invalid hex character");
    /* Presumably never reached (error() looks fatal); the return only
       silences a missing-return warning from tcc. */
    return (unsigned)-1;
}

/*
 * Decodes the backslash escapes of the lexeme `li` into a buffer obtained
 * from malloc().
 *
 * On return, *str_out points at the decoded bytes and *len_out holds their
 * count. No NUL terminator is appended, and the data may itself contain
 * NUL bytes (from \0 or \x00), so callers must go by *len_out. For an empty
 * lexeme nothing is allocated: *str_out is NULL and *len_out is 0.
 *
 * Supported escapes: \" \\ \n \r \t \0 and \xHH (exactly two hex digits).
 * An unknown escape, or a lexeme that ends in the middle of an escape, is
 * reported through error().
 */
void unescape_string(const struct LexemeInfo *li,
                     const char **str_out, size_t *len_out)
{
    const char *src;
    char *dst;
    size_t remaining = li->len;
    size_t produced = 0;

    if (remaining == 0) {
        *str_out = NULL;
        *len_out = 0;
        return;
    }

    src = li->string;
    /* Every escape consumes at least two input bytes and yields one output
       byte, so the input length is always a sufficient buffer size. */
    dst = malloc(remaining);
    NO_NULL(dst); /* presumably bails out on allocation failure - see token.h */
    *str_out = dst;

    while (remaining-- != 0) {
        char ch = *(src++);

        if (ch == '\\') {
            /* The byte after the backslash selects the escape. */
            if (remaining-- == 0) {
                error("Unexpected end of string in escape sequence");
                continue;
            }
            ch = *(src++);
            switch (ch) {
                case '"':
                case '\\':
                    /* These stand for themselves. */
                    break;
                case 'n':
                    ch = '\n';
                    break;
                /* XXX how many escapes should be supported?
                       Several of them are rarely used. */
                case 'r':
                    ch = '\r';
                    break;
                case 't':
                    ch = '\t';
                    break;
                case '0':
                    ch = '\0';
                    break;
                case 'x': {
                    /* \xHH: two hex digits form one byte. */
                    unsigned hi, lo;
                    if (remaining < 2) {
                        error("Unexpected end of string in escape sequence");
                    }
                    hi = hexchar(src[0]);
                    lo = hexchar(src[1]);
                    src += 2;
                    remaining -= 2;
                    ch = (char)((hi << 4U) | lo);
                    break;
                }
                /* TODO unicode escapes.
                    They should emit UTF-8 bytes.
                    Which syntax to use?
                        - variable length \u with up to 6 hexdigits?
                        - fixed-length \u with 4 and \U with 6 hexdigits?
                        - semicolon-terminated \u123;
                    Or skip unicode escapes, and rely on UTF-8 byte escapes?
                 */
                default:
                    error("Invalid escape sequence");
                    /* Nothing is emitted for an unrecognized escape. */
                    continue;
            }
        }

        /* Emit the literal byte or the decoded escape. */
        *(dst++) = ch;
        produced++;
    }
    *len_out = produced;
}