/* tokenizer.c -- Tokenizer/lexer (breaks the source into tokens)

  Copyright © 2011-2016 Samuel Lidén Borell

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to
  deal in the Software without restriction, including without limitation the
  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  sell copies of the Software, and to permit persons to whom the Software is
  furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be included in
  all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  DEALINGS IN THE SOFTWARE.
*/

#include <stddef.h>  /* size_t */
#include <string.h>  /* strncmp, strchr, strlen */

#include "tokenizer.h"
#include "context_private.h"
#include "misc.h"
#include "string.h"

/* Character table */
#define ST LRL_TT_String
#define LP LRL_Sym_LParen
#define RP LRL_Sym_RParen
#define C  LRL_Sym_Comma
#define ME LRL_Op_Member
#define NS LRL_Sym_NamespaceSep
#define AI LRL_Sym_ArrayIndex
#define S  LRL_Sym_Semicolon
#define LT LRL_Op_Less
#define EQ LRL_Op_Assign
#define GT LRL_Op_Greater
#define OV LRL_Op_OptionalValue
#define A  LRL_Op_AddrOf
#define R  LRL_Op_Deref
#define LB LRL_Sym_LSquare
#define RB LRL_Sym_RSquare
#define LC LRL_Sym_LCurly
#define RC LRL_Sym_RCurly
#define o1 LRL_Op_Plus
#define o2 LRL_Op_Minus
#define o3 LRL_Op_Times
#define o4 LRL_Op_Divide
#define w  LRL_Char_Whitespace
#define n  LRL_TT_Integer
#define i  LRL_TT_Ident
static const LRLTokenType character_table[128] = {
/*        0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F */
/* 0 */   0, 0, 0, 0, 0, 0, 0, 0, 0, w, w, w, w, w, 0, 0,
/* 1 */   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 2 */   w, 0,ST,AI, 0, 0, 0, 0,LP,RP,o3,o1, C,o2,ME,o4,
/* 3 */   n, n, n, n, n, n, n, n, n, n,NS, S,LT,EQ,GT,OV,
/* 4 */   A, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i,
/* 5 */   i, i, i, i, i, i, i, i, i, i, i,LB, 0,RB, R, i,
/* 6 */   0, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i,
/* 7 */   i, i, i, i, i, i, i, i, i, i, i,LC, 0,RC, 0, 0,
};
#undef ST
#undef LP
#undef RP
#undef C
#undef ME
#undef NS
#undef AI
#undef S
#undef LT
#undef EQ
#undef GT
#undef OV
#undef A
#undef R
#undef LB
#undef RB
#undef LC
#undef RC
#undef o1
#undef o2
#undef o3
#undef o4
#undef w
#undef n
#undef i
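/*
 * Note on character_table: it maps the first byte of a token either to a
 * final token type or to a character class, for example:
 *
 *     character_table['a'] == LRL_TT_Ident         identifier/keyword start
 *     character_table['0'] == LRL_TT_Integer       number start
 *     character_table['+'] == LRL_Op_Plus          single-character operator
 *     character_table[' '] == LRL_Char_Whitespace  skipped between tokens
 *
 * An entry of 0 marks a byte that may not start a token; bytes >= 128 are
 * treated as identifier characters by tokenize_one below.
 */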
typedef struct {
    size_t length;
    const char word[11+1]; /* longest keyword in the table below */
    LRLTokenType type;
} KeywordInfo;

#define num_keywords 82
static const KeywordInfo keywords[num_keywords] = {
    {  5, "alias",       LRL_KW_Alias },
    {  7, "alignof",     LRL_Op_AlignOf },
    {  7, "alignas",     LRL_KW_AlignAs },
    {  6, "assert",      LRL_KW_Assert },
    {  3, "and",         LRL_Op_LAnd },
    {  3, "any",         LRL_KW_Any },
    {  2, "as",          LRL_KW_As },
    {  2, "at",          LRL_KW_At },
    {  6, "bitand",      LRL_Op_BitAnd },
    {  5, "bitor",       LRL_Op_BitOr },
    {  4, "bits",        LRL_KW_Bits },
    {  6, "bitxor",      LRL_Op_BitXor },
    {  5, "break",       LRL_KW_Break },
    {  4, "case",        LRL_KW_Case },
    {  5, "compl",       LRL_Op_Compl },
    {  4, "cond",        LRL_KW_Cond },
    {  5, "const",       LRL_KW_Const },
    {  8, "continue",    LRL_KW_Continue },
    {  8, "declonly",    LRL_KW_DeclOnly },
    {  7, "default",     LRL_KW_Default },
    { 10, "deprecated",  LRL_KW_Deprecated },
    {  2, "do",          LRL_KW_Do },
    {  4, "elif",        LRL_KW_Elif },
    {  4, "else",        LRL_KW_Else },
    {  4, "enum",        LRL_KW_Enum },
    {  8, "enumbase",    LRL_Op_EnumBase },
    {  6, "export",      LRL_KW_Export },
    {  4, "flow",        LRL_KW_Flow },
    {  3, "for",         LRL_KW_For },
    {  2, "gc",          LRL_KW_Gc },
    {  4, "goto",        LRL_KW_Goto },
    {  4, "here",        LRL_KW_Here },
    {  2, "if",          LRL_KW_If },
    {  6, "import",      LRL_KW_Import },
    {  2, "in",          LRL_KW_In },
    { 10, "incomplete",  LRL_KW_Incomplete },
    {  3, "inf",         LRL_TT_Inf },
    {  7, "interop",     LRL_KW_Interop },
    {  2, "is",          LRL_KW_Is },
    {  5, "label",       LRL_KW_Label },
    {  5, "leave",       LRL_KW_Leave },
    {  8, "linkname",    LRL_KW_Linkname },
    {  5, "local",       LRL_KW_Local },
    {  9, "loopempty",   LRL_KW_LoopEmpty },
    {  7, "loopend",     LRL_KW_LoopEnd },
    {  7, "makeopt",     LRL_Op_MakeOpt },
    {  4, "mine",        LRL_KW_Mine },
    {  9, "minsizeof",   LRL_Op_MinSizeOf },
    {  3, "mod",         LRL_Op_Modulo },
    {  9, "namespace",   LRL_KW_Namespace },
    {  3, "NaN",         LRL_TT_NaN },
    {  4, "none",        LRL_TT_None },
    {  8, "noreturn",    LRL_KW_NoReturn },
    {  3, "not",         LRL_Op_LNot },
    {  2, "of",          LRL_KW_Of },
    {  8, "offsetof",    LRL_Op_OffsetOf },
    {  2, "or",          LRL_Op_LOr },
    {  3, "own",         LRL_KW_Own },
    {  8, "postcond",    LRL_KW_PostCond },
    {  7, "precond",     LRL_KW_PreCond },
    {  7, "private",     LRL_KW_Private },
    { 10, "repeatfrom",  LRL_KW_RepeatFrom },
    {  6, "return",      LRL_KW_Return },
    {  6, "shared",      LRL_KW_Shared },
    {  6, "sizeof",      LRL_Op_SizeOf },
    {  6, "skipto",      LRL_KW_SkipTo },
    {  6, "struct",      LRL_KW_Struct },
    {  6, "switch",      LRL_KW_Switch },
    {  4, "then",        LRL_Op_Then },
    { 10, "typeassert",  LRL_KW_TypeAssert },
    {  7, "typedef",     LRL_KW_Typedef },
    {  9, "undefined",   LRL_TT_Undefined },
    {  5, "union",       LRL_KW_Union },
    { 11, "unreachable", LRL_KW_Unreachable },
    {  6, "unused",      LRL_KW_Unused },
    {  4, "uses",        LRL_KW_Uses },
    {  5, "using",       LRL_KW_Using },
    {  3, "var",         LRL_KW_Var },
    {  5, "while",       LRL_KW_While },
    {  4, "with",        LRL_KW_With },
    {  3, "xor",         LRL_Op_LXor },
    {  5, "yield",       LRL_KW_Yield }
};

static LRLTokenType check_ident_keyword_type(const LRLToken *token)
{
    const char *s = token->loc.start;
    size_t length = token->loc.length;
    unsigned short i;

    /* TODO optimize */
    for (i = 0; i < num_keywords; i++) {
        if (keywords[i].length == length &&
            !strncmp(keywords[i].word, s, length)) {
            /* Match */
            return keywords[i].type;
        }
    }

    /* It's not a keyword, so it must be an identifier */
    return LRL_TT_Ident;
}

#define SKIP_INTEGER(ch, source) do {\
        while (((ch)=*(source), ((ch)<128 && character_table[ch]==LRL_TT_Integer)) || \
               (ch) == '_') { (source)++; } \
    } while (0)

#define SKIP_HEX_INTEGER(ch, source) do {\
        while (((ch)=*(source), ((ch)<128 && character_table[ch]==LRL_TT_Integer)) || \
               ((ch) >= 'A' && (ch) <= 'F') || ((ch) >= 'a' && (ch) <= 'f') || \
               (ch) == '_') { (source)++; } \
    } while (0)

/**
 * Returns the end of a long comment. Handles nested comments.
 */
static const char *skip_long_comment(const char *source)
{
    size_t nestcount = 0;
    for (;;) {
        const char *p = strchr(source, '/');
        if (p == NULL) return NULL;
        if (p != source && p[-1] == '*') {
            /* End of comment */
            source = p+1;
            if (!nestcount) break;
            nestcount--;
        } else if (p[1] == '*') {
            /* Start of nested comment */
            nestcount++;
            source = p+2;
        } else {
            /* Slash inside comment */
            source = p+1;
        }
    }
    return source;
}
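/*
 * Usage note for skip_long_comment: the argument points just past the
 * opening comment marker, nested long comments are balanced before the
 * comment is considered closed, and the returned pointer is one character
 * past the closing "* /" marker (written with a space here so this comment
 * itself stays valid). NULL means the comment was never closed, which
 * tokenize_one reports as LRL_Err_UnclosedComment.
 */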
/**
 * Gets the first token from the string pointed to by sourcePtr, and updates
 * this pointer such that another call to this function will return the next
 * token.
 */
static void tokenize_one(LRLCtx *ctx, LRLToken *token, const char **sourcePtr)
{
    const char *source = *sourcePtr;
    unsigned char ch;
    size_t length;
    LRLTokenType char_type;

    /* Look for start of token */
    do {
        ch = *(source++);

        /* Check for comments */
        if (ch == '/') {
            unsigned char ch2 = *source;
            if (ch2 == '/') {
                /* Skip line comment */
                unsigned char prevch;
                while (prevch = ch, ch = *++source,
                       ch && ch != '\n' && ch != '\r') {
                    if ((prevch == '/' && ch == '*') ||
                        (prevch == '*' && ch == '/')) {
                        lrl_err_char(ctx, LRL_Err_CommentInsideLineComment,
                                     source-1);
                    }
                }
                source++;
            } else if (ch2 == '*') {
                /* Skip long comment */
                const char *end = skip_long_comment(source+1);
                if (end) {
                    source = end+1;
                    ch = source[-1];
                } else {
                    /* No closing * / found */
                    lrl_err_char(ctx, LRL_Err_UnclosedComment, source-1);
                    source += strlen(source);
                    ch = '\0';
                }
            }
        }

        /* Look for non-whitespace characters */
        char_type = (ch < 128 ? character_table[ch] : LRL_TT_Ident);
    } while (char_type == LRL_Char_Whitespace);

    token->loc.start = source-1;

    /* Check for end of file */
    if (ch == '\0') {
        /* Return an EOF token */
        token->loc.length = 0;
        token->type = LRL_TT_EOF;
        *sourcePtr = source;
        return;
    }

    /* Check character class of first character */
    switch (char_type) {
        case LRL_TT_Ident:
            /* Read identifier or keyword */
            /* Read until end of identifier */
            for (;;) {
                ch = *source;
                if (ch < 128 && character_table[ch] != LRL_TT_Ident &&
                    character_table[ch] != LRL_TT_Integer) break;
                source++;
            }

            /* Store in token and sourcePtr */
            length = source - token->loc.start;
            token->loc.length = length;
            token->type = check_ident_keyword_type(token);
            *sourcePtr = source;
            return;
        case LRL_TT_Integer:
            /* Read number */
            token->type = LRL_TT_Integer;
            /* TODO add support for binary (maybe also octal?) */
            if (ch == '0' && *source == 'x') {
                /* Hexadecimal number */
                const char *const start = ++source;
                SKIP_HEX_INTEGER(ch, source);
                if (source == start || *start == '_') {
                    lrl_err_char(ctx, LRL_Err_ExpectedHexDigit, start);
                }
            } else {
                /* Decimal number */
                /* Read integer part */
                SKIP_INTEGER(ch, source);

                /* Read fractional part (if any) */
                if (ch == '.' && ((unsigned char)source[1] < 128) &&
                    character_table[(unsigned char)source[1]] == LRL_TT_Integer) {
                    source++;
                    SKIP_INTEGER(ch, source);
                    token->type = LRL_TT_Real;
                }

                /* Read exponent part (if any) */
                if (ch == 'e' || ch == 'E') {
                    source++;
                    ch = (unsigned char)*source;
                    if (ch >= 128 || character_table[ch] != LRL_TT_Integer) {
                        lrl_err_char(ctx, LRL_Err_ExpectedExponentPart, source);
                    }
                    SKIP_INTEGER(ch, source);
                    token->type = LRL_TT_Real;
                }
            }

            /* Check that we reached the end */
            if (ch >= 128 || character_table[ch] == LRL_TT_Ident || ch == '.') {
                lrl_err_char(ctx, LRL_Err_UnexpectedCharInNumber, source);
            }

            /* Store in token and sourcePtr */
            token->loc.length = (source - token->loc.start);
            *sourcePtr = source;
            return;
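        /*
         * Examples of forms accepted by the case above (shown for clarity):
         *   12_345      -> LRL_TT_Integer ('_' acts as a digit separator)
         *   0xFF_0      -> LRL_TT_Integer (hexadecimal)
         *   3.14, 2e10  -> LRL_TT_Real (fractional part and/or exponent)
         * A signed exponent such as 2e-10 is not consumed as one token; the
         * non-digit after 'e' is reported as LRL_Err_ExpectedExponentPart.
         */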
        case LRL_TT_String:
            /* Read string */
            lrl_string_parse(ctx, &source, NULL);

            token->type = LRL_TT_String;
            token->loc.length = (source - token->loc.start);
            *sourcePtr = source;
            return;
        case LRL_TT_Real:
        case LRL_TT_Undefined:
        case LRL_TT_None:
        case LRL_TT_NaN:
        case LRL_TT_Inf:
        case LRL_KW_Here:
        LRL_case_except_tt_values
        default:
            /* Check for two-character tokens: == >= <= != += -> etc. */
            if (*source == '=') {
                switch (ch) {
                    case '<': char_type = LRL_Op_LessEqual; break;
                    case '>': char_type = LRL_Op_GreaterEqual; break;
                    case '!': char_type = LRL_Op_NotEqual; break;
                    case '=': char_type = LRL_Op_Equal; break;
                    case '+': char_type = LRL_Op_PlusAssign; break;
                    case '-': char_type = LRL_Op_MinusAssign; break;
                    case '*': char_type = LRL_Op_TimesAssign; break;
                    case '/': char_type = LRL_Op_DivideAssign; break;
                    default: goto know_token_type;
                }
                source++;
            } else if (ch == '-' && *source == '>') {
                char_type = LRL_Op_FunctionMember;
                source++;
            } else if (ch == '<' && *source == '<') {
                char_type = LRL_Op_ShiftL;
                source++;
                if (*source == '=') {
                    char_type = LRL_Op_ShiftLAssign;
                    source++;
                }
            } else if (ch == '>' && *source == '>') {
                char_type = LRL_Op_ShiftR;
                source++;
                if (*source == '=') {
                    char_type = LRL_Op_ShiftRAssign;
                    source++;
                }
            } else if (ch == '+' && *source == '>') {
                char_type = LRL_Sym_FlexiPointer;
                source++;
            } else if (ch == '*' && *source == '>') {
                char_type = LRL_Sym_RawPointer;
                source++;
            } else if (ch == '+' && *source == '*' && source[1] == '>') {
                char_type = LRL_Sym_RawFlexiPointer;
                source += 2;
            } else if (ch == '.' && *source == '.' && source[1] == '.' &&
                       source[2] == '*') {
                char_type = LRL_Sym_CVarArg;
                source += 3;
                if (*source == '/') {
                    lrl_err_char(ctx, LRL_Err_LineCommentAfterAsterisk, source);
                }
            } else if ((ch == '+' || ch == '-') &&
                       (*source == '+' || *source == '-')) {
                lrl_err_char(ctx, LRL_Err_DoublePlusMinus, source);
                source++;
            }

            /* This is a single-character token */
          know_token_type:
            token->type = char_type;
            token->loc.length = source - token->loc.start;
            *sourcePtr = source;

            /* Check for illegal characters */
            if (char_type == 0) {
                lrl_err_token(ctx, LRL_Err_IllegalCharacter, token);
            }
            if (ch == '*' && *source == '/') {
                lrl_err_char(ctx, LRL_Err_LineCommentAfterAsterisk, source);
            }
            return;
    }
    /* not reached */
}

size_t lrl_tokenize(LRLCtx *ctx, LRLToken **tokenList, const char *source)
{
    size_t size, capacity;
    LRLToken *tokens;

    init_list(&tokens, &size, &capacity, 1024);

    for (;;) {
        LRLTokenType type;
        LRLToken *current_token = &tokens[size];

        /* Get next token */
        tokenize_one(ctx, current_token, &source);

        /* Check if we reached the end */
        type = current_token->type;
        if (type <= 0) break;

        /* Resize the list if needed */
        if (!grow_list(&tokens, &size, &capacity)) goto error;
    }

    *tokenList = tokens;
    return size;

  error:
    *tokenList = NULL;
    return 0;
}
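/*
 * Usage sketch (illustrative only): counting identifier tokens in a source
 * string with lrl_tokenize. The function name below is hypothetical; it
 * assumes an already initialized LRLCtx and does not show how the returned
 * token list is released, since that is handled elsewhere in the project.
 *
 *     static size_t count_identifiers(LRLCtx *ctx, const char *source)
 *     {
 *         LRLToken *tokens;
 *         size_t count = lrl_tokenize(ctx, &tokens, source);
 *         size_t i, idents = 0;
 *
 *         if (tokens == NULL) return 0;    (tokenization failed)
 *         for (i = 0; i < count; i++) {
 *             if (tokens[i].type == LRL_TT_Ident) idents++;
 *         }
 *         return idents;
 *     }
 */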