// SPDX-License-Identifier: LGPL-3.0-or-later /** * \file lexer.h * * C-compliant non-allocating UTF-8 text lexer. * * \author Lorenzo Cogotti * \copyright The DoubleFourteen Code Forge (C) All Rights Reserved */ #ifndef DF_LEXER_H_ #define DF_LEXER_H_ #include "utf/utfdef.h" /// Maximum allowed token length inside text parsed by `Lex`. #define MAXTOKLEN 256 /// String token type #define TT_STRING U16_C(1) /// Literal token type #define TT_LITERAL U16_C(2) /// Numeric token type #define TT_NUMBER U16_C(3) /// Token type for names or identifiers #define TT_NAME U16_C(4) /// Punctuation token type #define TT_PUNCT U16_C(5) /** * Token subtype flags for `TT_NUMBER` * * @{ */ #define TT_INT BIT(0) ///< integer #define TT_DEC BIT(1) ///< decimal number #define TT_HEX BIT(2) ///< hexadecimal number #define TT_OCT BIT(3) ///< octal number #define TT_BIN BIT(4) ///< binary number #define TT_LONG BIT(5) ///< long int #define TT_LLONG BIT(6) ///< long long int #define TT_UNSIGNED BIT(7) ///< unsigned int #define TT_FLOAT BIT(8) ///< floating point number #define TT_SINGLE_PREC BIT(9) ///< float #define TT_DOUBLE_PREC BIT(10) ///< double #define TT_EXT_PREC BIT(11) ///< long double #define TT_INF BIT(12) ///< infinite 1.#INF #define TT_INDEF BIT(13) ///< indefinite 1.#IND #define TT_NAN BIT(14) ///< NaN #define TT_IPADDR BIT(15) ///< ip address (address may still be ill-formed, e.g. `102948.22.999.1`) #define TT_IPV4 BIT(16) ///< ipv4 address format #define TT_IPV6 BIT(17) ///< ipv6 address format #define TT_IPV6LIT BIT(18) ///< ipv6 address is expressed as literal (e.g. `[2001:db8:a::123]`) #define TT_IPV6ZONE BIT(19) ///< ipv6 address contains a zone index/string (e.g. `fe80::1ff:fe23:4567:890a%3`) #define TT_IPPORT BIT(20) ///< ip address includes a port /** @} */ /** * Token flags * * @{ */ /// Indicates `Tok` originally exceeded `MAXTOKLEN` and was consequently truncated. #define TT_TRUNC BIT(15) /** @} */ /// Lexer punctuation token descriptor (text -> token `subtype`). typedef struct Punctuation Punctuation; struct Punctuation { const char *p; ///< NULL for last element in punctuation list. Uint32 id; ///< Puntuation identifier (returned in `Tok->subtype`) }; // punctuation ids #define P_RSHIFT_ASSIGN 1 #define P_LSHIFT_ASSIGN 2 #define P_PARMS 3 #define P_PRECOMPMERGE 4 #define P_LOGIC_AND 5 #define P_LOGIC_OR 6 #define P_LOGIC_GEQ 7 #define P_LOGIC_LEQ 8 #define P_LOGIC_EQ 9 #define P_LOGIC_UNEQ 10 #define P_MUL_ASSIGN 11 #define P_DIV_ASSIGN 12 #define P_MOD_ASSIGN 13 #define P_ADD_ASSIGN 14 #define P_SUB_ASSIGN 15 #define P_INC 16 #define P_DEC 17 #define P_BIN_AND_ASSIGN 18 #define P_BIN_OR_ASSIGN 19 #define P_BIN_XOR_ASSIGN 20 #define P_RSHIFT 21 #define P_LSHIFT 22 #define P_POINTERREF 23 #define P_MUL 24 #define P_DIV 25 #define P_MOD 26 #define P_ADD 27 #define P_SUB 28 #define P_ASSIGN 29 #define P_BIN_AND 30 #define P_BIN_OR 31 #define P_BIN_XOR 32 #define P_BIN_NOT 33 #define P_LOGIC_NOT 34 #define P_LOGIC_GREATER 35 #define P_LOGIC_LESS 36 #define P_REF 37 #define P_COMMA 38 #define P_SEMICOLON 39 #define P_COLON 40 #define P_QUESTIONMARK 41 #define P_PARENOPEN 42 #define P_PARENCLOSE 43 #define P_BRACEOPEN 44 #define P_BRACECLOSE 45 #define P_SQBRACKETOPEN 46 #define P_SQBRACKETCLOSE 47 #define P_BACKSLASH 48 #define P_PRECOMP 49 #define P_DOLLAR 50 /** * \brief Token returned by `Lex`. * * Contains token text and information. */ typedef struct Tok Tok; struct Tok { Uint16 type; Uint16 flags; Uint32 subtype; unsigned linesCrossed; unsigned spacesBeforeToken; unsigned line; long long intvalue; double floatvalue; Tok *nextToken; char text[MAXTOKLEN]; // NOTE: last element to allow partial allocation }; /// Disregard lexer errors #define L_NOERR BIT(0) /// Disregard lexer warnings #define L_NOWARN BIT(1) /// Disregard both errors and warnings #define L_QUIET (L_NOERR | L_NOWARN) /// Use console colors when reporting errors and warnings #define L_COLORED BIT(2) /// Parse all tokens as strings, instead of breaking them using full-fledged C rules #define L_STRONLY BIT(3) /// Allow file paths within tokens #define L_ALLOWPATHS BIT(4) /// Do not allow escapes within strings #define L_NOSTRESC BIT(5) /// Do not concatenate consecutive strings #define L_NOSTRCAT BIT(6) /// Concatenate strings separated by a backslash+newline #define L_ALLOWBACKSLASHSTRCAT BIT(7) /// Allow multichar literals #define L_ALLOWMULTICHARLIT BIT(8) /// Accepts IP addresses (parsed as `TT_NUMBER`) #define L_ALLOWIPADDR BIT(9) /// IP addresses with port numbers, IPv6 literals or zone ids won't be accepted, /// only meaningful if used with `L_ALLOWIPADDR`. #define L_PLAINIPADDRONLY BIT(10) /// Allow special floating point exception tokens (0.#INF, 0.#IND). #define L_ALLOWFLOATEXC BIT(10) /// Allow truncating tokens exceeding `MAXTOKLEN`. #define L_ALLOWTRUNC BIT(11) /// Do not search base `#include` paths (used by PC library). #define L_NOBASEINCLUDES BIT(12) /// Special callback, invokes immediate program termination after reporting a lexer message #define LEX_QUIT ((void (*)(Lex *, const char *, void *)) -1) /// Special callback, makes the lexer ignore the the warning or error /// (same behavior as `L_NOERR` and `L_NOWARN`, but as an explicit callback). #define LEX_IGN ((void (*)(Lex *, const char *, void *)) 0) /// Special callback, makes the lexer print an error or warning message to `stderr`, /// doesn't terminate execution. #define LEX_WARN ((void (*)(Lex *, const char *, void *)) 1) /** * \brief A lexer, breaks text into single tokens, keeping track of the current position. * * \note This struct should be considered opaque. */ typedef struct Lex Lex; struct Lex { char *pos, *lim; unsigned line; Uint16 flags; Boolean8 hasError; Boolean8 hasBufferedToken; Rune nr; const Punctuation *puncts; void *obj; void (*Error)(Lex *, const char *, void *); void (*Warn)(Lex *, const char *, void *); Lex *nextLexer; Tok buf; char name[MAXTOKLEN]; }; /// Register callbacks for lexer warning and error triggers. FORCE_INLINE void SetLexerErrorFunc(Lex *p, void (*errf)(Lex *, const char *, void *), void (*warnf)(Lex *, const char *, void *), void *obj) { p->Error = errf; p->Warn = warnf; p->obj = obj; } /** * \brief Set parsing session name and initial line number. * * \param [out] p A lexer, must not be `NULL` * \param [in] name Name for this parsing session * \param [in] line Initial line number, 0 is implicitly changed to 1 */ void BeginLexerSession(Lex *p, const char *name, unsigned line); /** * \brief Setup lexer to parse text, sized. * * \param [out] p A lexer, must not be `NULL` * \param [in] text Text to be parsed, must have at least `n` chars * \param [in] n Number of chars in `text` */ void SetLexerTextn(Lex *p, const char *text, size_t n); /** * \brief Setup lexer to parse text. * * \param [out] p A lexer, must not be `NULL` * \param [in] text `NUL` terminated text to be parsed */ FORCE_INLINE void SetLexerText(Lex *p, const char *text) { EXTERNC size_t strlen(const char *); SetLexerTextn(p, text, strlen(text)); } /** * \brief Change lexer flags. * * \param [out] p A lexer, must not be `NULL` * \param [in] flags New flags for the lexer */ FORCE_INLINE void SetLexerFlags(Lex *p, unsigned flags) { p->flags = flags; } /// Retrieve current lexer flags. FORCE_INLINE unsigned GetLexerFlags(Lex *p) { return p->flags; } /// Trigger an error over a lexer. CHECK_PRINTF(2, 3) void LexerError(Lex *p, const char *fmt, ...); /** * Trigger a warning over a lexer. */ CHECK_PRINTF(2, 3) void LexerWarning(Lex *p, const char *fmt, ...); /// Test whether a lexer reached the end. FORCE_INLINE Boolean IsLexerEndOfFile(Lex *p) { return (p->pos >= p->lim || *p->pos == '\0') && !p->hasBufferedToken; } /// Test whether a lexer encountered an error. FORCE_INLINE Boolean HasLexerError(Lex *p) { return p->hasError; } /** * \brief Read and return next token. * * \param [in,out] p A lexer, must not be `NULL` * \param [out] dest Storage for the returned token, must not be `NULL` * * \return If a new token has been read, then `tok->text` is returned, * `NULL` is returned if a parsing error has been encountered, * or no more tokens are available. */ char *Lex_ReadToken(Lex *p, Tok *dest); /** * \brief Read and return next token in the same line. * * This is a variant of `Lex_ReadToken()` useful to implement * a C Preprocessor, it avoids parsing spanning more than one line. * `\` followed by a newline is recognized and treated as a regular * space. */ char *Lex_ReadTokenOnLine(Lex *p, Tok *dest); /** * \brief Expects an integral token, reading and returning its value. * * \param [in,out] p A lexer, must not be `NULL` * \param [in] optionalSign Allow an optional `+` or `-` sign before the * token, if set to `FALSE` only unsigned * integers are allowed. * * \return The token value, 0 on error, use `HasLexerError()` to distinguish * between actual 0 and error value. */ long long Lex_ParseInt(Lex *p, Boolean optionalSign); /** * \brief Expects a boolean token, reading and returning its value. * * \param [in,out] p A lexer, must not be `NULL` * \param [in] allowNumeric Convert numeric values to booleans, 0 for * `FALSE`, any other numeric value for `TRUE` * * \return The boolean value, `FALSE` on error or end of file, * use `HasLexerError()` or `IsLexerEndOfFile()` to distinguish * between actual `FALSE` and error value. */ Boolean Lex_ParseBool(Lex *p, Boolean allowNumeric); /** * \brief Expects a floating point token, reading and returning its value. * * \param [in,out] p A lexer, must not be `NULL` * \param [in] optionalSign Allow an optional `+` or `-` sign before the * token, if set to `FALSE` only non-negative * values are allowed. * * \return The float value, 0 on error or end of file, * use `HasLexerError()` or `IsLexerEndOfFile()` to distinguish * between actual 0 and error value. */ double Lex_ParseFloat(Lex *p, Boolean optionalSign); /** * \brief Read a one dimensional matrix (vector of length `n`) from `p` into `dest`. * * Matrix format is: * ``` * (x y z w ...) * ``` * * \return `TRUE` on success, `FALSE` on error. */ Boolean Lex_ParseMatrix1(Lex *p, float *dest, size_t n); /** * \brief `Lex_ParseMatrix1()` variant for two dimensional matrixes. * * Matrix format is: * ``` * ((x0 y0 z0 w0 ...) (x1 y1 z1 w1 ...) ...) * ``` */ Boolean Lex_ParseMatrix2(Lex *p, float *dest, size_t n, size_t m); /// `Lex_ParseMatrix1()` variant for tridimensional matrixes. Boolean Lex_ParseMatrix3(Lex *p, float *dest, size_t n, size_t m, size_t u); /// Discard any buffered token and any in text token up to a new line. void Lex_SkipLine(Lex *p); /** * \brief Skip every token until `tok` is encountered. * * \param [in,out] p A lexer, must not be `NULL` * \param [in] tok Token to look for, must not be `NULL` * * \return `tok` on success, `NULL` on error or end of file. */ char *Lex_SkipUntil(Lex *p, const char *tok); /** * \brief Expect and skip section enclosed within braces. * * Braced sections are enclosed by punctuation tokens of id `P_BRACEOPEN` and * `P_BRACECLOSE`. * * \param [in,out] p A lexer, must not be `NULL` * \param [in] parseFirstBrace Whether the function should expect the next * token to be the first brace of the section * (`TRUE`) or it should assume the first brace * has already been parsed (`FALSE`). * * \return `TRUE` if section was skipped successfully, `FALSE` on error * (either unbalanced braces or unexpected token). */ Boolean Lex_SkipBracedSection(Lex *p, Boolean parseFirstBrace); /** * \brief Expect a token, matching and returning it, raises error on mismatch. * * \param [in,out] p A lexer, must not be `NULL` * \param [in] tok Token to be expected, must not be `NULL` * * \return On success `tok` is returned, on error `NULL`. */ char *Lex_MatchToken(Lex *p, const char *tok); /** * \brief Expect any token, raises an error if none is found. * * \param [in,out] p A lexer, must not be `NULL` * \param [out] dest Storage for returned token, must not be `NULL` * * \return On success `tok->text` is returned, on error `NULL`. */ char *Lex_MatchAnyToken(Lex *p, Tok *dest); /** * \brief Expect a token of a specific `type` and `subtype`, raise error on mismatch. * * \param [in,out] p A lexer, must not be `NULL` * \param [out] dest Storage for returned token, must not be `NULL` * \param [in] type Token type to be expected * \param [in] subtype Subtype mask for the expected token * * \return On success `tok->text`, `NULL` otherwise. */ char *Lex_MatchTokenType(Lex *p, Tok *dest, int type, unsigned subtype); /** * Check whether next token matches `tok`. * * If token matches it is read from `p` and returned, as in `Lex_ReadToken()`, * otherwise `p` is left unaltered (except for parsing errors). */ char *Lex_CheckToken(Lex *p, const char *tok); /// Similar to `Lex_CheckToken()`, but matches by token `type` and `subtype`. char *Lex_CheckTokenType(Lex *, Tok *dest, int type, unsigned subtype); /** * \brief Peek next token from `p` and test whether it matches with `tok`. * * In no case next token is consumed from `p`, lexer is left unaltered * (except for parsing errors). */ char *Lex_PeekToken(Lex *p, const char *tok); /// Similar to `Lex_PeekToken()`, but matches by token `type` and `subtype`. char *Lex_PeekTokenType(Lex *p, Tok *dest, int type, unsigned subtype); /** * \brief Place a token back into the lexer. * * Only one token may be placed back into the lexer at a time, * it will be returned back on the next call to `Lex_ReadToken()`. */ void Lex_UngetToken(Lex *p, const Tok *tok); #endif