/* token.c -- Tokenization of source Copyright © 2021-2024 Samuel Lidén Borell Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include "internal.h" #include "hash.h" #include "tokencase.h" #include #include #define INTERR_TOKEN(errnum) MAKE_INTERR(errnum, INTERRBASE_TOKEN) #define INTERR_BADCHARTYPE INTERR_TOKEN(0x01) #define INTERR_BADSCRIPT INTERR_TOKEN(0x03) void error_char(struct CSlul *ctx, const char *bp, enum CSlulErrorCode errorcode) { error_char_offs(ctx, bp, 0, errorcode); } void error_char_offs(struct CSlul *ctx, const char *bp, int offset, enum CSlulErrorCode errorcode) { static const char hexchars[16] = { '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'}; int length; const char *chartext; if (errorcode != CSLUL_E_UNEXPECTEDEOF && errorcode != CSLUL_E_NOEOFNEWLINE && bp != ctx->bufferend) { unsigned char ch = *bp; if (ch >= 32 && ch < 127) { chartext = bp; length = 1; } else { char *buff = &ctx->msg_charbuff[0]; buff[0] = '\\'; buff[1] = 'x'; buff[2] = hexchars[ch >> 4]; buff[3] = hexchars[ch & 0xF]; chartext = buff; length = 4; } } else { /* Special handling for EOF */ chartext = NULL; length = 0; } error_textlen(ctx, errorcode, ctx->line, ctx->startcolumn + (bp - ctx->linestart) - ctx->mbtrailerbytes + offset, chartext, length); } /** * Skips a sequence of non-ASCII characters. Errors are reported for invalid * UTF-8, such as overlong encodings, and for disallowed codepoints (e.g. * control and surrogate characters). * * This function handles UTF-8 characters that span across multiple buffers. * * The number of "trailer bytes" are tracked, so the column number can be * calculated by subtracting the number of trailer bytes. A "trailer byte" * is a byte, in a valid UTF-8 character, that is not the first byte. 
*/ const char *skip_utf8(struct CSlul *ctx, const char *bp, const char *bend) { unsigned char c; uint32 code; /* If we are at the start of a buffer, there might be some UTF-8 byte from the previous buffer that we need to take into account */ if (UNLIKELY(ctx->utf8state)) { assert(bp != bend); c = ctx->utf8byte; code = ctx->utf8code; switch (ctx->utf8state) { case UTF8ST_BYTE2: goto continue_byte2; case UTF8ST_MB3_BYTE3: goto continue_mb3_byte3; case UTF8ST_MB4_BYTE3: goto continue_mb4_byte3; case UTF8ST_MB4_BYTE4: goto continue_mb4_byte4; case UTF8ST_NONE: ; /* Should never happen */ } } while (bp != bend) { c = (unsigned char)*bp; if (c < 128) break; /* Plain ASCII character */ if (UNLIKELY((c & 0xc0) == 0x80)) { /* Not a valid start byte */ error_char(ctx, bp, CSLUL_E_BADUTF8); bp++; continue; } else if (UNLIKELY(bp+1 == bend)) { ctx->utf8state = UTF8ST_BYTE2; ctx->utf8byte = c; if (ctx->last_buffer) { error_char(ctx, bp, CSLUL_E_BADUTF8); } bp++; goto splitted; } else bp++; continue_byte2: code = 0; if ((c & 0xe0) == 0xc0) { /* 2 byte character */ code |= (c & 0x1fU) << 6U; c = *bp; if (UNLIKELY((c & 0xc0) != 0x80)) { error_char(ctx, bp, CSLUL_E_BADUTF8); continue; } code |= (c & 0x3fU); ctx->mbtrailerbytes++; if (UNLIKELY(code <= 0x7F)) { /* Overlong encoding */ error_char(ctx, bp, CSLUL_E_BADUTF8); ctx->mbtrailerbytes--; continue; } bp++; } else if ((c & 0xf0) == 0xe0) { /* 3 byte character */ code |= (uint32)(c & 0x0fU) << 12U; c = *bp; if (UNLIKELY((c & 0xc0) != 0x80)) { error_char(ctx, bp, CSLUL_E_BADUTF8); continue; } code |= (c & 0x3fU) << 6U; ctx->mbtrailerbytes++; if (UNLIKELY(code <= 0x7FF)) { /* Overlong encoding */ error_char(ctx, bp, CSLUL_E_BADUTF8); ctx->mbtrailerbytes--; continue; } if (UNLIKELY(bp+1 == bend)) { ctx->utf8state = UTF8ST_MB3_BYTE3; ctx->utf8code = code; if (ctx->last_buffer) { error_char(ctx, bp, CSLUL_E_BADUTF8); } bp++; goto splitted; } bp++; continue_mb3_byte3: c = *bp; ctx->mbtrailerbytes++; if (UNLIKELY((c & 0xc0) != 0x80)) { 
error_char(ctx, bp, CSLUL_E_BADUTF8); ctx->mbtrailerbytes -= 2; continue; } code |= c & 0x3f; bp++; } else /*if ((c & 0xf0) == 0xf0)*/ { /* 4 byte character */ code |= (uint32)(c & 0x07U) << 18U; c = *bp; if (UNLIKELY((c & 0xc0) != 0x80)) { error_char(ctx, bp, CSLUL_E_BADUTF8); continue; } code |= (uint32)(c & 0x3fU) << 12U; ctx->mbtrailerbytes++; if (UNLIKELY(code <= 0xFFFF)) { /* Overlong encoding */ error_char(ctx, bp, CSLUL_E_BADUTF8); ctx->mbtrailerbytes--; continue; } if (UNLIKELY(bp+1 == bend)) { ctx->utf8state = UTF8ST_MB4_BYTE3; ctx->utf8code = code; if (ctx->last_buffer) { error_char(ctx, bp, CSLUL_E_BADUTF8); } bp++; goto splitted; } bp++; continue_mb4_byte3: c = *bp; ctx->mbtrailerbytes++; if (UNLIKELY((c & 0xc0) != 0x80)) { error_char(ctx, bp, CSLUL_E_BADUTF8); ctx->mbtrailerbytes -= 2; continue; } code |= (c & 0x3fU) << 6U; if (UNLIKELY(bp+1 == bend)) { ctx->utf8state = UTF8ST_MB4_BYTE4; ctx->utf8code = code; if (ctx->last_buffer) { error_char(ctx, bp, CSLUL_E_BADUTF8); } bp++; goto splitted; } bp++; continue_mb4_byte4: c = *bp; ctx->mbtrailerbytes++; if (UNLIKELY((c & 0xc0) != 0x80)) { error_char(ctx, bp, CSLUL_E_BADUTF8); ctx->mbtrailerbytes -= 3; continue; } code |= c & 0x3f; bp++; } /* Check what these characters actually do. Maybe some of them can be allowed. 
*/ if (UNLIKELY( (code >= 0x80 && code <= 0x9F) || /* control characters */ (code >= 0xD800 && code <= 0xDFFF) || /* surrogate characters */ (code >= 0x2028 && code <= 0x202E) || /* line sep., RLO, etc */ (code >= 0x2060 && code <= 0x206F) || /* digit override etc */ (code >= 0xFDD0 && code <= 0xFDEF) || /* non-characters */ code == 0xFEFF || /* Byte Order Mark / ZWNBSP */ ((code & 0xFFFE) == 0xFFFE) || /* ??FFFE-FFFF are non-characters */ code > 0x10FFFF)) { /* Last Unicode character */ /* Disallowed to prevent deceptive source text */ error_char_offs(ctx, bp, -1, CSLUL_E_DISALLOWEDUNICODE); } else if (ctx->allowed_scripts != (SCRIPT_ALL|SCRIPT_RTL)) { unsigned script = get_unicode_script(code); if (UNLIKELY((script & ~ctx->allowed_scripts) != 0)) { enum CSlulErrorCode err; switch (script) { case SCRIPT_LATIN: err = CSLUL_E_SCRIPTLATIN; break; case SCRIPT_CYRILLIC: err = CSLUL_E_SCRIPTCYRILLIC; break; case SCRIPT_GREEK: err = CSLUL_E_SCRIPTGREEK; break; case SCRIPT_SPECIALS: err = CSLUL_E_SCRIPTSPECIALS; break; case SCRIPT_OTHER: err = CSLUL_E_SCRIPTOTHER; break; case SCRIPT_RTL: err = CSLUL_E_SCRIPTRTL; break; default: internal_error(ctx, INTERR_BADSCRIPT); goto interr; } error_char_offs(ctx, bp, -1, err); ctx->allowed_scripts |= script; /* Silence further errors */ interr: ; } } } ctx->utf8state = UTF8ST_NONE; splitted: return bp; } /** * Like skip_utf8, but allows bounding the string range * \param ctx Compilation context * \param bp Current character pointer * \param stop Bounding end pointer * \param at_end If this the end pointer is at EOF * \return New character pointer */ static const char *skip_utf8_bounded(struct CSlul *ctx, const char *bp, const char *stop, int at_end) { const char *ret; int savedlast = ctx->last_buffer; ctx->last_buffer = at_end; ret = skip_utf8(ctx, bp, stop); ctx->last_buffer = savedlast; return ret; } /** * Reports an error about an unexpected UTF-8 character. * Returns a pointer to the position past the UTF-8 character. 
*/
const char *unexpected_utf8(struct CSlul *ctx, const char *bp, const char *bend)
{
    error_char(ctx, bp, CSLUL_E_INVALIDCHAR);
    /* With all scripts allowed, skip_utf8 performs no script check, so
       only encoding and disallowed-codepoint errors can follow. The
       previous value of allowed_scripts is not restored here. */
    ctx->allowed_scripts = SCRIPT_ALL|SCRIPT_RTL;
    return skip_utf8(ctx, bp, bend);
}

/* Short aliases used only to keep the table below compact.
   X (0) marks bytes that do not start any token. */
#define X 0
#define N ((unsigned char)CSLUL_T_Integer)
#define U ((unsigned char)CSLUL_T_UpperIdent)
#define L ((unsigned char)CSLUL_T_LowerIdent)
/* Maps each ASCII byte (0..127) to the token type that the byte starts.
   Note that underscore is classified as L (lowercase identifier). */
static const unsigned char char2tok[128] = {
    /* -------------------- 0x00 - 0x0F -------------------- */
    X, X, X, X, X, X, X, X,
    X, CSLUL_INT_Whitespace, CSLUL_T_Newline, X,
    X, CSLUL_T_Newline, X, X,
    /* -------------------- 0x10 - 0x1F -------------------- */
    X, X, X, X, X, X, X, X,
    X, X, X, X, X, X, X, X,
    /* -------------------- 0x20 - 0x2F -------------------- */
    CSLUL_INT_Whitespace,
    /* ! */ CSLUL_T_Exclamation,
    /* " */ CSLUL_T_String,
    /* # */ CSLUL_INT_Comment,
    /* $ */ X,
    /* % */ X,
    /* & */ X,
    /* ' */ X,
    /* ( */ CSLUL_T_LParen,
    /* ) */ CSLUL_T_RParen,
    /* * */ CSLUL_T_Asterisk,
    /* + */ CSLUL_T_Plus,
    /* , */ CSLUL_T_Comma,
    /* - */ CSLUL_T_Minus,
    /* . */ CSLUL_T_Dot,
    /* / */ CSLUL_T_Slash,
    /* -------------------- 0x30 - 0x3F -------------------- */
    /* 0-9 */ N, N, N, N, N, N, N, N, N, N,
    /* : */ CSLUL_T_Colon,
    /* ; */ CSLUL_T_Semicolon,
    /* < */ CSLUL_T_Less,
    /* = */ CSLUL_T_Assign,
    /* > */ CSLUL_T_Greater,
    /* ? */ CSLUL_T_Question,
    /* -------------------- 0x40 - 0x4F -------------------- */
    /* @ */ X,
    /* A-O */ U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
    /* -------------------- 0x50 - 0x5F -------------------- */
    /* P-Z */ U, U, U, U, U, U, U, U, U, U, U,
    /* [ */ CSLUL_T_LSquare,
    /* \ */ X,
    /* ] */ CSLUL_T_RSquare,
    /* ^ */ X,
    /* _ */ L,
    /* -------------------- 0x60 - 0x6F -------------------- */
    /* ` */ X,
    /* a-o */ L, L, L, L, L, L, L, L, L, L, L, L, L, L, L,
    /* -------------------- 0x70 - 0x7F -------------------- */
    /* p-z */ L, L, L, L, L, L, L, L, L, L, L,
    /* { */ CSLUL_T_LCurly,
    /* | */ X,
    /* } */ CSLUL_T_RCurly,
    /* ~ */ X,
    /* 0x7F */ X
};
/* X is intentionally left defined */
#undef L
#undef U
#undef N

/** Appends data to the temporary buffer.
The caller is responsible for ensuring that the data is not longer than
    MAX_TOKEN_LEN */
static void copytotmp(struct CSlul *ctx, const char *start, const char *end)
{
    size_t len = end - start;
    assert(start <= end);
    assert(ctx->tmplen+len <= MAX_TOKEN_LEN);
    memcpy(&ctx->toktmp[ctx->tmplen], start, len);
    ctx->tmplen += len;
}

/**
 * Appends the bytes in [start, end) to the temporary buffer, truncating
 * the data so that the buffer does not grow beyond MAX_TOKEN_LEN bytes.
 * CSLUL_E_STRINGTOOLONG is reported at the truncation point.
 *
 * NOTE(review): because the comparisons use ">=", the error is also
 * reported when the data fits exactly (tmplen + len == MAX_TOKEN_LEN),
 * and for zero-length appends once the buffer is full. Confirm whether
 * that is intended.
 */
static void copytotmp_safe(struct CSlul *ctx, const char *start, const char *end)
{
    size_t len = end - start;
    assert(start != NULL);
    assert(start <= end);
    assert(end <= ctx->bufferend);
    if (UNLIKELY(len >= MAX_TOKEN_LEN || len + ctx->tmplen >= MAX_TOKEN_LEN)) {
        /* Keep only as many bytes as there is room for */
        end = start + (MAX_TOKEN_LEN - ctx->tmplen);
        assert(end >= start);
        error_char(ctx, end, CSLUL_E_STRINGTOOLONG);
    }
    copytotmp(ctx, start, end);
}

/**
 * Returns nonzero if ch may be part of an identifier in the given mode:
 * ASCII letters, digits and underscore (as classified by char2tok),
 * plus '.' in every mode except ParseIdent, plus '~' in ParseVersion.
 * Non-ASCII bytes are never valid.
 */
static int valid_ident_char(unsigned char ch, enum IdentParseMode mode)
{
    enum CSlulToken tok;
    if (UNLIKELY(ch > 127)) return 0;
    tok = (enum CSlulToken)char2tok[(unsigned)ch];
    return LIKELY(tok == CSLUL_T_UpperIdent ||
                  tok == CSLUL_T_LowerIdent ||
                  tok == CSLUL_T_Integer) ||
           (mode != ParseIdent && ch == '.') ||
           (mode == ParseVersion && ch == '~');
}

/**
 * Tokenizes an identifier or attribute part. Sets *status to 1 if successful.
 * Identifiers that span across buffers are handled by storing identifier
 * bytes in ctx->toktmp.
 *
 * *status is set to 0 when the end of the buffer was reached and more
 * buffers may follow; the call is then expected to be repeated with the
 * next buffer. For an oversized identifier, CSLUL_E_IDENTTOOLONG is
 * reported, the rest of the identifier is skipped, and *status is set
 * to 1 to allow error recovery.
 *
 * On completion, ctx->tokval, ctx->toklen and ctx->tokhash describe the
 * token. The return value is the new character pointer.
 *
 * The "mode" parameter controls how parsing should be done:
 * - ParseIdent:     Only alphanumeric characters and underscore allowed.
 * - ParseVersion:   Alphanumeric characters and ._~ allowed
 *                   (but _ is forbidden in versions!)
 * - ParseAttrName:  Like ParseIdent, but dots are also allowed.
 * - ParseAttrValue: Parse until whitespace (space/tab/newline).
 *                   Invalid UTF-8 or control characters generate an error.
 */
const char *tokenize_ident(struct CSlul *ctx, const char *bp, const char *bend, int *status, enum IdentParseMode mode)
{
    /* FIXME this function reports a "too long" error for idents
       that are exactly 100 bytes! */
    const char *start = bp;
    size_t len = ctx->tmplen;
    HashCode hash = ctx->tokhash;
    /* OR-ing 0x20 into each byte before hashing makes the hash
       insensitive to ASCII letter case */
    unsigned hash_or = (ctx->case_insens ? 0x20 : 0);
    const char *stop;
    assert(bp <= bend);
    /* Scan no further than the remaining identifier length allows */
    stop = ((size_t)(bend - bp) > (size_t)MAX_IDENT_LEN-len ?
            bp+(MAX_IDENT_LEN-len) : bend);
    if (ctx->toklen == MAX_IDENT_LEN) goto too_long;
    assert(bp <= stop);
    assert(stop <= bend);
    assert(ctx->toklen <= MAX_IDENT_LEN);
    if (mode != ParseAttrValue) {
        for (;;) {
            unsigned char ch;
            if (UNLIKELY(bp == stop)) goto stopped;
            ch = *bp;
            if (!valid_ident_char(ch, mode)) break;
            bp++;
            hash = HASH(hash, ch | hash_or);
        }
    } else {
        /* Parsing value in module header */
        for (;;) {
            unsigned char ch;
            if (UNLIKELY(bp == stop)) goto stopped;
            ch = *bp;
            if (ch <= 0x20 || ch == 127) {
                if (LIKELY(ch == '\n' || ch == '\r' ||
                           ch == ' ' || ch == '\t')) break;
                error_char(ctx, bp, CSLUL_E_INVALIDCHAR);
                bp++;
            } else if (LIKELY(ch < 127)) {
                hash = HASH(hash, ch | hash_or);
                bp++;
            } else {
                /* UTF-8 character */
                hash = HASH(hash, 0);
                bp = skip_utf8_bounded(ctx, bp, stop, (ctx->last_buffer && bend==stop));
            }
        }
    }
  done:
    *status = 1;
    len = ctx->tmplen;
    if (UNLIKELY(len)) {
        /* Earlier chunks are in toktmp; append the final chunk */
        copytotmp(ctx, start, bp);
        ctx->tokval = (const char*)&ctx->toktmp;
    } else {
        /* Whole token is in this buffer; reference it in place */
        ctx->tokval = start;
    }
    ctx->toklen = (bp-start) + len;
    ctx->tokhash = hash;
    return bp;
  stopped:
    /* We either reached the end, or the ident is too long */
    ctx->toklen += bp-start;
    assert(bp >= start);
    if ((size_t)(bp - start) >= MAX_IDENT_LEN - ctx->tmplen) goto too_long;
    if (bp != bend) goto too_long;
    /* End of buffer */
    if (ctx->last_buffer) goto done;
    *status = 0;
    /* Save this chunk so that the token can continue in the next buffer */
    copytotmp(ctx, start, bp);
    ctx->tokhash = hash;
    assert(ctx->toklen > 0);
    return bp;
  too_long:
    assert(ctx->toklen == MAX_IDENT_LEN);
    if (ctx->tmplen) {
        assert(ctx->tmplen + (bp-start) == MAX_IDENT_LEN);
        copytotmp(ctx, start, bp);
        ctx->tokval = (const char*)&ctx->toktmp;
    } else {
        ctx->tokval = start;
    }
    if (UNLIKELY(bp-start)) {
        /* will be 0 if this is a second chunk (= report once) */
        error_char(ctx, bp, CSLUL_E_IDENTTOOLONG);
    }
    ctx->tokhash = hash;
    /* Skip the oversized identifier/value */
    if (mode != ParseAttrValue) {
        for (;;) {
            if (bp == bend) goto buffer_end_toolong;
            if (!valid_ident_char(*bp, mode)) break;
            bp++;
        }
    } else {
        unsigned char ch;
        for (;;) {
            if (bp == bend) goto buffer_end_toolong;
            ch = *bp;
            if (ch <= 0x20 || ch == 127) {
                if (LIKELY(ch == '\n' || ch == '\r' ||
                           ch == ' ' || ch == '\t')) break;
                error_char(ctx, bp, CSLUL_E_INVALIDCHAR);
                bp++;
            } else if (ch < 127) {
                bp++;
            } else {
                /* UTF-8 character */
                bp = skip_utf8(ctx, bp, bend);
                if (bp == bend) goto buffer_end_toolong;
            }
        }
    }
  identend_toolong:
    assert(ctx->toklen == MAX_IDENT_LEN);
    *status = 1; /* pretend it was successful for error recovery */
    return bp;
  buffer_end_toolong:
    if (ctx->last_buffer) goto identend_toolong;
    assert(ctx->toklen == MAX_IDENT_LEN);
    *status = 0;
    return bp;
}

/** Resets identifier state to the starting state. */
void ident_start(struct CSlul *ctx)
{
    ctx->toklen = 0;
    ctx->tmplen = 0;
    ctx->tokhash = 0;
}

/**
 * Marks the start of a new token at bp. The position of the previous
 * token (line, start column, end column) is saved first, and then the
 * current line and column are recorded for the new token.
 */
void token_start(struct CSlul *ctx, const char *bp)
{
    ctx->prev_tok_line = ctx->tokline;
    ctx->prev_tok_col = ctx->tokcolumn;
    ctx->prev_tok_endcol = ctx->tokcolumn+ctx->toklen;
    ctx->tokline = ctx->line;
    /* Trailer bytes of multi-byte characters do not count as columns */
    ctx->tokcolumn = ctx->startcolumn +
                     (bp - ctx->linestart - ctx->mbtrailerbytes);
}

/**
 * Performs the end-of-file checks and updates the token position state.
 * An error is reported if the byte before bp is not a newline (\n or \r),
 * and if a multi-line comment is still open (reported at the line where
 * the comment started, column 1).
 */
void token_eof(struct CSlul *ctx, const char *bp)
{
    /* bp[-1] is only read when bp is not at the start of the buffer */
    if (UNLIKELY(bp != ctx->bufferstart &&
                 bp[-1] != '\n' && bp[-1] != '\r')) {
        error_char(ctx, bp, CSLUL_E_NOEOFNEWLINE);
    }
    if (UNLIKELY(ctx->in_multiline_comment)) {
        error_linecol(ctx, CSLUL_E_MLCOMMENTNOTCLOSED,
                      ctx->multilinecomment_startline, 1);
    }
    /* Place the (empty) EOF token directly after the last token */
    ctx->tokcolumn += ctx->toklen;
    ctx->toklen = 0;
    ctx->prev_tok_line = ctx->tokline;
    ctx->prev_tok_endcol = ctx->tokcolumn;
}

/**
 * Classifies the current token (ctx->tokval, ctx->toklen, ctx->tokhash)
 * as a keyword or an identifier. The hash selects a candidate keyword.
 * TOK_EQ_RETURN is defined outside this file; presumably it compares the
 * token text with the given keyword and returns the keyword token on a
 * match (TODO confirm). If no keyword is returned, the token is
 * classified by its first character as a lower or upper identifier.
 */
static enum CSlulToken match_keyword(struct CSlul *ctx)
{
    /* toklen is not referenced directly in this function; presumably it
       is used by TOK_EQ_RETURN */
    size_t toklen = ctx->toklen;
    const char *tokval = ctx->tokval;
    assert(ctx->toklen >= 1);
    switch (ctx->tokhash) {
    /* TODO change "case" -> "matches" (to have 8-space indentation)
       TODO add "unreachable" in addition to "undef" */
    case H_NOT:
TOK_EQ_RETURN("not", CSLUL_T_KW_Not)
    /* Operators */
    case H_AND: TOK_EQ_RETURN("and", CSLUL_T_KW_And)
    case H_OR: TOK_EQ_RETURN("or", CSLUL_T_KW_Or)
    case H_MOD: TOK_EQ_RETURN("mod", CSLUL_T_KW_Mod)
    case H_DEREF: TOK_EQ_RETURN("deref", CSLUL_T_KW_Deref)
    case H_REFTO: TOK_EQ_RETURN("refto", CSLUL_T_KW_RefTo)
    case H_REF_IS: TOK_EQ_RETURN("ref_is", CSLUL_T_KW_RefIs)
    case H_REF_IS_NOT: TOK_EQ_RETURN("ref_is_not", CSLUL_T_KW_RefIsNot)
    /* Top-level declarations */
    case H_DATA: TOK_EQ_RETURN("data", CSLUL_T_KW_Data)
    case H_FUNC: TOK_EQ_RETURN("func", CSLUL_T_KW_Func)
    case H_TYPE: TOK_EQ_RETURN("type", CSLUL_T_KW_Type)
    /* Types */
    case H_BOOL: TOK_EQ_RETURN("bool", CSLUL_T_KW_Bool)
    case H_USIZE: TOK_EQ_RETURN("usize", CSLUL_T_KW_USize)
    case H_SSIZE: TOK_EQ_RETURN("ssize", CSLUL_T_KW_SSize)
    case H_FILEOFFS: TOK_EQ_RETURN("fileoffs", CSLUL_T_KW_FileOffs)
    case H_STRING: TOK_EQ_RETURN("string", CSLUL_T_KW_String)
    case H_INT8: TOK_EQ_RETURN("int8", CSLUL_T_KW_Int8)
    case H_BYTE: TOK_EQ_RETURN("byte", CSLUL_T_KW_Byte)
    case H_WUINT8: TOK_EQ_RETURN("wuint8", CSLUL_T_KW_WUInt8)
    case H_INT16: TOK_EQ_RETURN("int16", CSLUL_T_KW_Int16)
    case H_UINT16: TOK_EQ_RETURN("uint16", CSLUL_T_KW_UInt16)
    case H_WUINT16: TOK_EQ_RETURN("wuint16", CSLUL_T_KW_WUInt16)
    case H_INT: TOK_EQ_RETURN("int", CSLUL_T_KW_Int)
    case H_UINT: TOK_EQ_RETURN("uint", CSLUL_T_KW_UInt)
    case H_WUINT: TOK_EQ_RETURN("wuint", CSLUL_T_KW_WUInt)
    case H_INT32: TOK_EQ_RETURN("int32", CSLUL_T_KW_Int32)
    case H_UINT32: TOK_EQ_RETURN("uint32", CSLUL_T_KW_UInt32)
    case H_WUINT32: TOK_EQ_RETURN("wuint32", CSLUL_T_KW_WUInt32)
    case H_INT64: TOK_EQ_RETURN("int64", CSLUL_T_KW_Int64)
    case H_UINT64: TOK_EQ_RETURN("uint64", CSLUL_T_KW_UInt64)
    case H_WUINT64: TOK_EQ_RETURN("wuint64", CSLUL_T_KW_WUInt64)
    case H_REF: TOK_EQ_RETURN("ref", CSLUL_T_KW_Ref)
    case H_OWN: TOK_EQ_RETURN("own", CSLUL_T_KW_Own)
    case H_ARENA: TOK_EQ_RETURN("arena", CSLUL_T_KW_Arena)
    case H_SLOT: TOK_EQ_RETURN("slot", CSLUL_T_KW_Slot)
    case H_FUNCREF: TOK_EQ_RETURN("funcref", CSLUL_T_KW_FuncRef)
    case H_NORETURN: TOK_EQ_RETURN("noreturn", CSLUL_T_KW_NoReturn)
    case H_STRUCT: TOK_EQ_RETURN("struct", CSLUL_T_KW_Struct)
    case H_ENUM: TOK_EQ_RETURN("enum", CSLUL_T_KW_Enum)
    /* Qualifiers and declaration modifiers */
    case H_LIFETIME: TOK_EQ_RETURN("lifetime", CSLUL_T_KW_Lifetime)
    case H_SINCE: TOK_EQ_RETURN("since", CSLUL_T_KW_Since)
    case H_VAR: TOK_EQ_RETURN("var", CSLUL_T_KW_Var)
    case H_WRITEONLY: TOK_EQ_RETURN("writeonly", CSLUL_T_KW_WriteOnly)
    case H_ALIASED: TOK_EQ_RETURN("aliased", CSLUL_T_KW_Aliased)
    case H_THREADED: TOK_EQ_RETURN("threaded", CSLUL_T_KW_Threaded)
    case H_CLOSED: TOK_EQ_RETURN("closed", CSLUL_T_KW_Closed)
    /* Values */
    case H_NONE: TOK_EQ_RETURN("none", CSLUL_T_KW_None)
    case H_THIS: TOK_EQ_RETURN("this", CSLUL_T_KW_This)
    case H_UNDEF: TOK_EQ_RETURN("undef", CSLUL_T_KW_Undef)
    case H_FALSE: TOK_EQ_RETURN("false", CSLUL_T_KW_False)
    case H_TRUE: TOK_EQ_RETURN("true", CSLUL_T_KW_True)
    /* Statements */
    case H_IF: TOK_EQ_RETURN("if", CSLUL_T_KW_If)
    case H_ELSE: TOK_EQ_RETURN("else", CSLUL_T_KW_Else)
    case H_WHILE: TOK_EQ_RETURN("while", CSLUL_T_KW_While)
    case H_DO: TOK_EQ_RETURN("do", CSLUL_T_KW_Do)
    case H_FOR: TOK_EQ_RETURN("for", CSLUL_T_KW_For)
    case H_IN: TOK_EQ_RETURN("in", CSLUL_T_KW_In)
    case H_LOOPEND: TOK_EQ_RETURN("loopend", CSLUL_T_KW_LoopEnd)
    case H_LOOPEMPTY: TOK_EQ_RETURN("loopempty", CSLUL_T_KW_LoopEmpty)
    case H_SWITCH: TOK_EQ_RETURN("switch", CSLUL_T_KW_Switch)
    case H_CASE: TOK_EQ_RETURN("case", CSLUL_T_KW_Case)
    case H_WITH: TOK_EQ_RETURN("with", CSLUL_T_KW_With)
    case H_DEFAULT: TOK_EQ_RETURN("default", CSLUL_T_KW_Default)
    case H_SUBCASE: TOK_EQ_RETURN("subcase", CSLUL_T_KW_SubCase)
    case H_ASSERT: TOK_EQ_RETURN("assert", CSLUL_T_KW_Assert)
    case H_BREAK: TOK_EQ_RETURN("break", CSLUL_T_KW_Break)
    case H_CONTINUE: TOK_EQ_RETURN("continue", CSLUL_T_KW_Continue)
    case H_GOTO: TOK_EQ_RETURN("goto", CSLUL_T_KW_Goto)
    case H_RETURN: TOK_EQ_RETURN("return", CSLUL_T_KW_Return)
    }
    /* Not a keyword: classify by the first character. Underscore sorts
       before 'a' in ASCII, so it needs its own test. */
    return tokval[0] >= 'a' || tokval[0] == '_' ?
            CSLUL_T_LowerIdent : CSLUL_T_UpperIdent;
}

/**
 * Returns 1 if the token is not a known keyword, and could syntactically
 * be a module header keyword (i.e. is lowercase).
 *
 * A 4-byte token whose hash equals H_VOID is also rejected (presumably
 * the word "void", which is not in the keyword list above).
 */
int token_could_be_mh_attr(struct CSlul *ctx)
{
    return match_keyword(ctx) == CSLUL_T_LowerIdent &&
        !(ctx->toklen == 4 && ctx->tokhash == H_VOID); /* C programmer?! */
}

enum CSlulToken cslul_ll_next_slul_token(struct CSlul *ctx)
{
    const char *bp = ctx->buffer;
    const char *bend = ctx->bufferend;
    unsigned char ch;
    enum CSlulToken tok;
    enum TokenState tokstate = TDone;
    ctx->linestart = bp;
    ctx->mbtrailerbytes = 0;
    /* A token that was saved for reuse is returned before any new input
       is read */
    if (ctx->reused_token.slul) {
        tok = ctx->reused_token.slul;
        ctx->reused_token.slul = 0;
        return tok;
    }
    /* A saved state other than TDone means that tokenization resumes at
       the label for that state */
    if (UNLIKELY(ctx->tokstate.slul != TDone)) {
        switch (ctx->tokstate.slul) {
        case TInNewline: goto in_newline;
        case TInWhitespace: goto in_whitespace;
        case TInComment: goto in_comment;
        case TInMaybeMLCommentStart1: goto in_maybe_ml_comment_start1;
        case TInMaybeMLCommentStart2: goto in_maybe_ml_comment_start2;
        case TInMaybeMLCommentEnd0: goto in_maybe_ml_comment_end0;
        case TInMaybeMLCommentEnd1: goto in_maybe_ml_comment_end1;
        case TInMaybeMLCommentEnd2: goto in_maybe_ml_comment_end2;
        case TInIdent: goto in_ident;
        case TInVersion: goto in_version;
        case TInOperator: goto in_operator;
        case TInString: goto in_string;
        case TEscapeStart: goto escape_start;
        case TEscapeHex: goto escapehex;
        case TEscapeUnicode: goto escapeunicode;
        case TEscapeScripts: goto escapescripts;
        case TZeroPrefixed: goto zeroprefixed;
        case TNumberHex: goto number_hex;
        case TNumberBin: goto number_bin;
        case TNumberDec: goto number_dec;
        case TNumberExpSign: goto number_expsign;
        case TNumberExp: goto number_exp;
        case TNumberSkip: goto number_skip;
        case TDone:;
        }
    }
    /* Finish a UTF-8 character that was split across buffers */
    if (UNLIKELY(ctx->utf8state)) {
        if (bp == bend) goto buffer_end;
        bp = skip_utf8(ctx, bp, bend);
    }
  nextchar:
    if (UNLIKELY(bp == bend)) goto buffer_end;
    ch = *bp;
  havechar:
    if (UNLIKELY(ch > 127)) {
        bp = unexpected_utf8(ctx, bp, bend);
        goto nextchar;
    }
    tok = (enum
CSlulToken)char2tok[(unsigned)ch]; switch (tok) { case X: error_char(ctx, bp, CSLUL_E_INVALIDCHAR); bp++; goto nextchar; case CSLUL_T_Newline: newline_start: bp++; if (LIKELY(ch == '\n')) { newline_end: ctx->line++; ctx->startcolumn = 1; ctx->mbtrailerbytes = 0; ctx->linestart = bp; ctx->numspaces = 0; if (ctx->in_multiline_comment) goto in_maybe_ml_comment_end0; goto nextchar; } in_newline: if (UNLIKELY(bp == bend)) { /* \r */ if (ctx->last_buffer) { error_char_offs(ctx, bp, -1, CSLUL_E_CRNEWLINE); } tokstate = TInNewline; goto buffer_end; } else { if (LIKELY(*bp == '\n')) bp++; /* \r\n */ else error_char_offs(ctx, bp, -1, CSLUL_E_CRNEWLINE); goto newline_end; } break; /* unreachable */ in_maybe_ml_comment_end0: if (UNLIKELY(bp == bend)) { tokstate = TInMaybeMLCommentEnd0; goto buffer_end; } if (*bp != '#') goto in_comment; bp++; in_maybe_ml_comment_end1: if (UNLIKELY(bp == bend)) { tokstate = TInMaybeMLCommentEnd1; goto buffer_end; } if (*bp != '}') goto in_maybe_ml_comment_start1; bp++; in_maybe_ml_comment_end2: if (UNLIKELY(bp == bend)) { tokstate = TInMaybeMLCommentEnd2; goto buffer_end; } if (*bp == '}') ctx->in_multiline_comment--; bp++; goto in_comment; /* skip to end of line */ case CSLUL_INT_Whitespace: { const char *start; ctx->numspaces = 0; in_whitespace: start = bp; for (;;) { if (UNLIKELY(bp == bend)) goto buffer_end_ws; ch = *bp; if (ch != ' ') break; bp++; } ctx->numspaces += bp-start; if (UNLIKELY(ch == '\t')) { error_char(ctx, bp, CSLUL_E_TAB); /* TODO skip multiple tabs */ bp++; } else if (UNLIKELY(ch == '\n' || ch == '\r')) { int col = ctx->startcolumn + (bp-ctx->linestart); error_char(ctx, bp, (ctx->numspaces == col-1 ? 
CSLUL_E_INDENTEDBLANKLINE : CSLUL_E_TRAILINGSPACE)); } goto havechar; buffer_end_ws: ctx->numspaces += bp-start; tokstate = TInWhitespace; goto buffer_end; } case CSLUL_INT_Comment: bp++; ctx->allowed_scripts = SCRIPT_ALL|SCRIPT_RTL; ctx->in_multiline_comment = 0; in_maybe_ml_comment_start1: if (UNLIKELY(bp == bend)) { tokstate = TInMaybeMLCommentStart1; goto buffer_end; } if (*bp != '{') goto in_comment; bp++; in_maybe_ml_comment_start2: if (UNLIKELY(bp == bend)) { tokstate = TInMaybeMLCommentStart2; goto buffer_end; } if (*bp == '{') { if (ctx->startcolumn + (bp-ctx->linestart) != 3) { error_linecol(ctx, CSLUL_E_MLCOMMENTNOTLINESTART, ctx->line, ctx->startcolumn + (bp-ctx->linestart) - 2); goto in_comment; } if (!ctx->in_multiline_comment) { ctx->multilinecomment_startline = ctx->line; } ctx->in_multiline_comment++; } bp++; in_comment: for (;;) { while (bp != bend && *bp >= 0x20 && *bp < 127) bp++; if (UNLIKELY(bp == bend)) { tokstate = TInComment; goto buffer_end; } ch = *bp; if (ch == '\n' || ch == '\r') break; if (ch >= 128) { bp = skip_utf8(ctx, bp, bend); } else { error_char(ctx, bp, CSLUL_E_INVALIDCHAR); bp++; } } goto newline_start; case CSLUL_T_UpperIdent: case CSLUL_T_LowerIdent: { int status; token_start(ctx, bp); ident_start(ctx); in_ident: /* FIXME disallow _ in UpperIdent */ bp = tokenize_ident(ctx, bp, bend, &status, ParseIdent); if (LIKELY(status)) { tok = match_keyword(ctx); ctx->tmplen = 0; if (UNLIKELY(ctx->toklen >= 2 && !memcmp(ctx->tokval, "__", 2))) { error_tok(ctx, CSLUL_E_DOUBLEUNDERSCORE); } else if (UNLIKELY(tok == CSLUL_T_UpperIdent && memchr(ctx->tokval, '_', ctx->toklen))) { error_tok(ctx, CSLUL_E_TYPEUNDERSCORE); } else if (tok == CSLUL_T_LowerIdent && LIKELY(bp != bend)) { if (*bp == ':') { tok = CSLUL_T_GotoTarget; bp++; } /* XXX how about [] and <> */ } else if (tok == CSLUL_T_KW_Since) { ctx->parser.slul.version_line = ctx->tokline; } goto have_token; } tokstate = TInIdent; goto buffer_end; } case CSLUL_T_Integer: 
token_start(ctx, bp); ctx->toklen = 0; if (ctx->parser.slul.version_line == ctx->tokline) { goto version_start; } ctx->parser.slul.number = 0; ctx->parser.slul.numdigits = 0; ctx->parser.slul.floatnum = 0; if (ch == '0') { ctx->parser.slul.has_digits = 0; bp++; zeroprefixed: if (UNLIKELY(bp == bend)) { tok = CSLUL_T_Integer; if (ctx->last_buffer) goto have_number_nocheck; /* zero */ tokstate = TZeroPrefixed; goto buffer_end; } ch = *bp; if (ch == 'x') { bp++; number_hex: tokstate = TNumberHex; tok = CSLUL_T_Integer; for (;; bp++) { if (UNLIKELY(bp == bend)) goto number_eob; ch = *bp; if (ch >= '0' && ch <= '9') ch -= '0'; else if (ch >= 'a' && ch <= 'f') ch = ch-'a' + 0xA; else if (ch >= 'A' && ch <= 'F') ch = ch-'A' + 0xA; else if (ch == '_') continue; else goto have_number; if (ctx->parser.slul.numdigits >= 16) goto numbertoolarge; if (ctx->parser.slul.number || ch) { ctx->parser.slul.numdigits++; } ctx->parser.slul.has_digits = 1; ctx->parser.slul.number = (ctx->parser.slul.number << 4) | ch; } } else if (ch == 'b') { bp++; number_bin: tokstate = TNumberBin; tok = CSLUL_T_Integer; for (;; bp++) { if (bp == bend) goto number_eob; ch = *bp; if (ch == '0' || ch == '1') ch -= '0'; else if (ch == '_') continue; else goto have_number; if (ctx->parser.slul.numdigits >= 64) goto numbertoolarge; if (ctx->parser.slul.number || ch) { ctx->parser.slul.numdigits++; } ctx->parser.slul.has_digits = 1; ctx->parser.slul.number = (ctx->parser.slul.number << 1) | ch; } } else if (ch == '.') { ctx->parser.slul.numdigits = 1; goto zero_dot_x; } else if ((ch >= '0' && ch <= '9') || ch == '_') { /* This could be confused with an octal number */ error_char(ctx, bp, CSLUL_E_LEADINGZERO); goto number_skip; } else if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) { error_char(ctx, bp, CSLUL_E_BADNUMBERTYPE); goto number_skip; } else { /* Zero */ assert(tokstate == TDone); ctx->parser.slul.numdigits = 1; ctx->parser.slul.has_digits = 1; tok = CSLUL_T_Integer; goto have_token; } } 
else { zero_dot_x: ctx->parser.slul.decpointpos = 0; number_dec: tokstate = TNumberDec; tok = CSLUL_T_Integer; } /* Decimal number */ for (;; bp++) { if (UNLIKELY(bp == bend)) goto number_eob; ch = *bp; if (LIKELY(ch >= '0' && ch <= '9')) { uint64 number = ctx->parser.slul.number; ch -= '0'; /* The 2^64 limit also applies to the integer part of floating point numbers. Use exponents to write larger numbers. Rationale: Large numbers without exponent syntax are hard to read and better avoided. This limitation also simplifies the code somewhat. */ if (UNLIKELY(number >= UINT64_MAX/10)) { /* 2^64-1 = 18446744073709551615 */ if (number > UINT64_MAX/10 || ch > 5) goto numbertoolarge; } if (number || ch) { ctx->parser.slul.numdigits++; } ctx->parser.slul.has_digits = 1; ctx->parser.slul.number = number*10 + ch; if (ctx->parser.slul.floatnum) { ctx->parser.slul.decpointpos++; } } else if (ch == '_') continue; else if (ch == '.') { if (ctx->parser.slul.floatnum) goto double_decpoint; ctx->parser.slul.floatnum = 1; ctx->parser.slul.exponent = 0; ctx->parser.slul.has_digits = 0; continue; } else if (ch == 'e' || ch == 'E') { if (!ctx->parser.slul.has_digits) goto no_frac_digits; ctx->parser.slul.exponent = 0; bp++; goto number_expsign; } else { tok = ctx->parser.slul.floatnum ? 
CSLUL_T_Float : CSLUL_T_Integer; goto have_number; } } number_expsign: tok = CSLUL_T_Float; tokstate = TNumberExpSign; if (UNLIKELY(bp == bend)) goto number_eob; ch = *bp; if (ch == '-') { ctx->parser.slul.floatnum = -1; bp++; } else { ctx->parser.slul.floatnum = +1; if (ch == '+') bp++; } ctx->parser.slul.numdigits = 0; number_exp: tok = CSLUL_T_Float; tokstate = TNumberExp; for (;; bp++) { if (UNLIKELY(bp == bend)) goto number_eob; ch = *bp; if (ch < '0' || ch > '9') break; if (UNLIKELY(++ctx->parser.slul.numdigits > 4)) { error_char(ctx, bp, CSLUL_E_EXPONENTTOOLARGE); goto number_skip; } ch -= '0'; ctx->parser.slul.exponent = ctx->parser.slul.exponent*10 + ch; } if (ctx->parser.slul.floatnum < 0) { ctx->parser.slul.exponent = -ctx->parser.slul.exponent; } if (UNLIKELY(!ctx->parser.slul.numdigits)) { error_char(ctx, bp, CSLUL_E_NOEXPDIGITS); goto number_skip; } have_number: /* End of number OR invalid character in number */ if (UNLIKELY((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch == '.') || (ch >= '0' && ch <= '9'))) { error_char(ctx, bp, CSLUL_E_BADNUMBERCHAR); goto number_skip; } else if (UNLIKELY(!ctx->parser.slul.has_digits)) { no_frac_digits: error_char(ctx, bp, ctx->parser.slul.floatnum ? 
CSLUL_E_NOFRACDIGITS : CSLUL_E_NODIGITS); goto number_skip; } else if (ctx->parser.slul.floatnum) { if (!ctx->parser.slul.number) ctx->parser.slul.exponent = 0; else { ctx->parser.slul.exponent -= ctx->parser.slul.decpointpos; } } /* Fall through */ have_number_nocheck: ctx->toklen = (ctx->startcolumn + (bp - ctx->linestart - ctx->mbtrailerbytes)) - ctx->tokcolumn; tokstate = TDone; goto have_token; /* Error handling */ double_decpoint: error_char(ctx, bp, CSLUL_E_DOUBLEDECPOINT); goto number_skip; numbertoolarge: error_char(ctx, bp, CSLUL_E_NUMBERTOOLARGE); number_skip: tokstate = TNumberSkip; ctx->parser.slul.numdigits = INVALID_NUMBER; for (;; bp++) { if (UNLIKELY(bp == bend)) goto buffer_end; ch = *bp; if (!((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F') || ch == '.' || ch == '_')) break; } tok = CSLUL_T_Integer; ctx->parser.slul.numdigits = INVALID_NUMBER; goto have_number_nocheck; number_eob: if (ctx->last_buffer) { if (ctx->parser.slul.numdigits == 0) { error_char(ctx, bp, CSLUL_E_NODIGITS); } tokstate = TDone; goto have_token; } goto buffer_end; case CSLUL_T_String: { /* Try to parse without temp buffer first */ const char *start; token_start(ctx, bp); ctx->tmplen = 0; /* TODO make this value configurable in the module header */ ctx->allowed_scripts = SCRIPT_LATIN; start = ++bp; /* TODO hash strings for de-duplication? or skip and tail merge? 
*/ for (;;) { if (UNLIKELY(bp == bend)) goto incomplete_string; ch = *bp; if (ch == '\\') { copytotmp_safe(ctx, start, bp); break; } else if (ch == '"') { simple_string_end: if ((size_t)(bp - start) > MAX_TOKEN_LEN) { error_char(ctx, bp, CSLUL_E_STRINGTOOLONG); ctx->toklen = MAX_TOKEN_LEN; } else { ctx->toklen = bp - start; } ctx->tokval = start; if (ch == '"') bp++; goto have_token; } else if (LIKELY(ch >= 0x20 && ch < 127)) bp++; else if (ch > 127) { bp = skip_utf8(ctx, bp, bend); } else if (ch == '\n' || ch == '\r') { error_char(ctx, bp, CSLUL_E_UNTERMINATEDSTRING); goto simple_string_end; } else { error_char(ctx, bp, CSLUL_E_INVALIDCHAR); bp++; } } in_string: start = bp; for (;;) { next_string_char: if (UNLIKELY(bp == bend)) goto incomplete_string; ch = *bp; if (ch == '\\') { short maxlen; copytotmp_safe(ctx, start, bp); bp++; escape_start: if (UNLIKELY(bp == bend)) { tokstate = TEscapeStart; goto buffer_end_noeof; } ch = *(bp++); ctx->parser.slul.escape = 0; ctx->parser.slul.escapelen = 0; switch (ch) { case '0': ch = '\0'; break; case 'b': ch = '\b'; break; case 't': ch = '\t'; break; case 'n': ch = '\n'; break; case 'r': ch = '\r'; break; case '\"': ch = '\"'; break; case '\\': ch = '\\'; break; case 'A': case 'C': case 'G': case 'L': case 'O': case 'S': case ';': ctx->allowed_scripts = 0; for (;;) { unsigned add; switch (ch) { case 'A': add = SCRIPT_ALL; break; case 'C': add = SCRIPT_CYRILLIC; break; case 'G': add = SCRIPT_GREEK; break; case 'L': add = SCRIPT_LATIN; break; case 'O': add = SCRIPT_OTHER; break; case 'S': add = SCRIPT_SPECIALS; break; /* FIXME Add a way of allowing RTL text. RTL text needs to go on a separate line, BUT it also needs some additional (non-LTR) characters to indicate that there is an RTL string. 
- Easiest way might be to have begin_bidi/end_bidi keywords on lines before and after (and only allows string literals in between) */ case ';': start = bp; goto next_string_char; default: bp--; error_char(ctx, bp, (ch >= 'A' && ch <= 'Z' ? CSLUL_E_SCRIPTESCAPEUNKNOWN : CSLUL_E_SCRIPTESCAPEBAD)); start = bp; goto next_string_char; } if ((ctx->allowed_scripts & add) != 0) { error_char_offs(ctx, bp, -1, CSLUL_E_SCRIPTESCAPEDUPL); } else if (add < ctx->allowed_scripts) { error_char_offs(ctx, bp, -1, CSLUL_E_SCRIPTESCAPEORDER); } ctx->allowed_scripts |= add; escapescripts: tokstate = TEscapeScripts; if (bp == bend) goto buffer_end_noeof; ch = *(bp++); } break; case 'x': /* Single byte hex escape */ escapehex: tokstate = TEscapeHex; maxlen = 2; goto long_escape; case 'u': /* Unicode hex escape */ escapeunicode: tokstate = TEscapeUnicode; maxlen = 6; /* 0x10FFFF */ goto long_escape; default: error_char(ctx, bp, CSLUL_E_BADESCAPE); } /* Single char escape like \n */ start = bp; have_escaped_byte: if (LIKELY(ctx->tmplen < MAX_TOKEN_LEN)) { ctx->toktmp[ctx->tmplen] = ch; ctx->tmplen++; } else { error_char_offs(ctx, bp, -1, CSLUL_E_STRINGTOOLONG); } continue; long_escape: for (;;) { if (UNLIKELY(bp == bend)) goto buffer_end_noeof; ch = *bp; if (ch >= '0' && ch <= '9') ch -= '0'; else if (ch >= 'a' && ch <= 'f') ch = ch-'a' + 0xA; else if (ch >= 'A' && ch <= 'F') ch = ch-'A' + 0xA; else break; if (UNLIKELY(++ctx->parser.slul.escapelen > maxlen)) { error_char(ctx, bp, CSLUL_E_ESCAPETOOLONG); break; } ctx->parser.slul.escape = (ctx->parser.slul.escape << 4) | ch; bp++; } start = bp; if (UNLIKELY(ctx->parser.slul.escapelen == 0)) { error_char(ctx, bp, CSLUL_E_MISSINGESCAPE); ch = 0; goto have_escaped_byte; } if (tokstate == TEscapeHex) { /* Single byte */ ch = ctx->parser.slul.escape; goto have_escaped_byte; } else { uint32 code = ctx->parser.slul.escape; unsigned char *res; /* Add UTF-8 bytes */ if (UNLIKELY(code > 0x10FFFF || (code >= 0xD800 && code <= 0xDFFF))) { 
error_char_offs(ctx, bp, -1, CSLUL_E_BADUNICODEESCAPE); continue; } res = (unsigned char*)&ctx->toktmp[ctx->tmplen]; if (code <= 0x7F) { if (ctx->tmplen >= MAX_TOKEN_LEN-1) goto utf8toolarge; *(res++) = code; ctx->tmplen += 1; continue; } else if (code <= 0x7FF) { if (ctx->tmplen >= MAX_TOKEN_LEN-2) goto utf8toolarge; *(res++) = 0xC0U | (code >> 6U); *(res++) = 0x80U | (code & 0x3FU); ctx->tmplen += 2; continue; } else if (code <= 0xFFFF) { if (ctx->tmplen >= MAX_TOKEN_LEN-3) goto utf8toolarge; *(res++) = 0xE0U | (code >> 12U); *(res++) = 0x80U | ((code >> 6U) & 0x3FU); *(res++) = 0x80U | (code & 0x3FU); ctx->tmplen += 3; continue; } else { if (ctx->tmplen >= MAX_TOKEN_LEN-4) goto utf8toolarge; *(res++) = 0xF0U | (code >> 18U); *(res++) = 0x80U | ((code >> 12U) & 0x3FU); *(res++) = 0x80U | ((code >> 6U) & 0x3FU); *(res++) = 0x80U | (code & 0x3FU); ctx->tmplen += 4; continue; } utf8toolarge: error_char(ctx, bp, CSLUL_E_STRINGTOOLONG); } } else if (ch == '"') { string_end: copytotmp_safe(ctx, start, bp); if (ch == '"') bp++; ctx->tokval = &ctx->toktmp[0]; ctx->toklen = ctx->tmplen; tok = CSLUL_T_String; tokstate = TDone; goto have_token; } else if (LIKELY(ch >= 0x20 && ch < 127)) bp++; else if (ch > 127) { bp = skip_utf8(ctx, bp, bend); } else if (ch == '\n' || ch == '\r') { error_char(ctx, bp, CSLUL_E_UNTERMINATEDSTRING); goto string_end; } else { error_char(ctx, bp, CSLUL_E_INVALIDCHAR); bp++; } } incomplete_string: copytotmp_safe(ctx, start, bp); tokstate = TInString; goto buffer_end_noeof; } case CSLUL_T_LParen: case CSLUL_T_RParen: case CSLUL_T_LSquare: case CSLUL_T_RSquare: case CSLUL_T_LCurly: case CSLUL_T_RCurly: case CSLUL_T_Comma: case CSLUL_T_Dot: case CSLUL_T_Question: case CSLUL_T_Colon: case CSLUL_T_Semicolon: token_start(ctx, bp); bp++; ctx->toklen = 1; goto have_token; case CSLUL_T_Plus: case CSLUL_T_Minus: case CSLUL_T_Asterisk: case CSLUL_T_Slash: case CSLUL_T_Less: case CSLUL_T_Assign: case CSLUL_T_Greater: case CSLUL_T_Exclamation: { enum 
CSlulToken two_char_first; token_start(ctx, bp); /* We need to check if it's a two character operator. These can span over two buffers! */ two_char_first = tok; bp++; continue_in_operator: if (UNLIKELY(bp == bend)) { if (ctx->last_buffer) { ctx->toklen = 1; tokstate = TDone; tok = two_char_first; error_char(ctx, bp, CSLUL_E_NOEOFNEWLINE); goto have_token; } ctx->parser.slul.two_char_first = two_char_first; tokstate = TInOperator; goto buffer_end; } ch = *bp; if (ch == '=') { /* += -= etc */ bp++; ctx->toklen = 2; tok = two_char_first + (CSLUL_T_PlusAssign - CSLUL_T_Plus); goto have_token; } else if (ch == '>' && two_char_first == CSLUL_T_Minus) { bp++; ctx->toklen = 2; tok = CSLUL_T_RArrow; goto have_token; } else if (UNLIKELY(ch > 127)) { ctx->toklen = 1; bp = unexpected_utf8(ctx, bp, bend); goto nextchar; } else { /* Single character token. Re-use the second character */ ctx->toklen = 1; tok = two_char_first; goto have_token; } break; in_operator: two_char_first = ctx->parser.slul.two_char_first; goto continue_in_operator; } CASE_ALL_KEYWORDS CASE_MULTICHAR_OPS case CSLUL_T_Float: case CSLUL_T_Version: case CSLUL_T_EOF: default: /* This should never happen */ error_char(ctx, bp, INTERR_BADCHARTYPE); bp++; goto nextchar; } /* Version tokens are special. A "since" keyword is always followed by a version. Without this special handling, it would have been parsed as an integer/a float. 
*/
  version_start:
    {
        int status;
        ident_start(ctx);
        ctx->parser.slul.version_line = 0;
      in_version:
        /* A non-zero status produces a Version token. Otherwise the
           TInVersion state is recorded and the buffer-end exit is taken. */
        bp = tokenize_ident(ctx, bp, bend, &status, ParseVersion);
        if (LIKELY(status)) {
            tok = CSLUL_T_Version;
            ctx->tmplen = 0;
            goto have_token;
        }
        tokstate = TInVersion;
        goto buffer_end;
    }

    /* These are the possible function exit states */
  buffer_end_noeof:
    /* Reports CSLUL_E_UNEXPECTEDEOF if this is the last buffer, and then
       continues into buffer_end. */
    if (ctx->last_buffer) error_char(ctx, bp, CSLUL_E_UNEXPECTEDEOF);
  buffer_end:
    /* On the last buffer, token_eof is called and an EOF token is returned.
       Otherwise CSLUL_T_NEEDDATA is returned (presumably a request for
       more input; confirm against the caller). */
    if (ctx->last_buffer) {
        token_eof(ctx, bp);
        tok = CSLUL_T_EOF;
    } else {
        tok = CSLUL_T_NEEDDATA;
    }
  have_token:
    /* Save the read position, the advanced column (with the multi-byte
       trailer bytes subtracted) and the tokenizer state in ctx before
       returning the token. */
    ctx->buffer = bp;
    ctx->startcolumn += (bp - ctx->linestart - ctx->mbtrailerbytes);
    ctx->tokstate.slul = tokstate;
    return tok;
}

/**
 * Retrieves the value of the current token through two output parameters.
 *
 * The value is described by a pointer and a length. This function does not
 * copy or NUL-terminate anything; it only hands out the pointer stored in
 * the context, so the length must be used to delimit the text.
 *
 * None of the pointers are checked for NULL.
 *
 * \param ctx     Context to read the token value (tokval/toklen) from.
 * \param name    Receives ctx->tokval.
 * \param length  Receives ctx->toklen.
 */
void cslul_ll_current_value(struct CSlul *ctx,
                            const char **name, size_t *length)
{
    *name = ctx->tokval;
    *length = ctx->toklen;
}

#undef X /* undefine for unity builds */