You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

476 lines
14 KiB
C

// SPDX-License-Identifier: LGPL-3.0-or-later
/**
* \file lexer.h
*
* C-compliant non-allocating UTF-8 text lexer.
*
* \author Lorenzo Cogotti
* \copyright The DoubleFourteen Code Forge (C) All Rights Reserved
*/
#ifndef DF_LEXER_H_
#define DF_LEXER_H_
#include "utf/utfdef.h"
/// Maximum allowed token length inside text parsed by `Lex`.
/// Also sizes the `text` buffer of `Tok` and the `name` buffer of `Lex`.
#define MAXTOKLEN 256
// Token types follow. Each value is wrapped in `U16_C()`, presumably to make
// it a 16-bit constant matching the `Uint16 type` field of `Tok`
// (TODO confirm against the `U16_C` definition).
/// String token type
#define TT_STRING U16_C(1)
/// Literal token type
#define TT_LITERAL U16_C(2)
/// Numeric token type
#define TT_NUMBER U16_C(3)
/// Token type for names or identifiers
#define TT_NAME U16_C(4)
/// Punctuation token type
#define TT_PUNCT U16_C(5)
/**
 * Token subtype flags for `TT_NUMBER`
 *
 * Every flag is defined on its own bit. The highest one is `BIT(20)`, so,
 * assuming `BIT(n)` is the usual `1 << n`, the set needs a field wider than
 * 16 bits; `Tok` declares `subtype` as `Uint32`.
 *
 * @{
 */
#define TT_INT BIT(0) ///< integer
#define TT_DEC BIT(1) ///< decimal number
#define TT_HEX BIT(2) ///< hexadecimal number
#define TT_OCT BIT(3) ///< octal number
#define TT_BIN BIT(4) ///< binary number
#define TT_LONG BIT(5) ///< long int
#define TT_LLONG BIT(6) ///< long long int
#define TT_UNSIGNED BIT(7) ///< unsigned int
#define TT_FLOAT BIT(8) ///< floating point number
#define TT_SINGLE_PREC BIT(9) ///< float
#define TT_DOUBLE_PREC BIT(10) ///< double
#define TT_EXT_PREC BIT(11) ///< long double
#define TT_INF BIT(12) ///< infinite 1.#INF
#define TT_INDEF BIT(13) ///< indefinite 1.#IND
#define TT_NAN BIT(14) ///< NaN
#define TT_IPADDR BIT(15) ///< ip address (address may still be ill-formed, e.g. `102948.22.999.1`)
#define TT_IPV4 BIT(16) ///< ipv4 address format
#define TT_IPV6 BIT(17) ///< ipv6 address format
#define TT_IPV6LIT BIT(18) ///< ipv6 address is expressed as literal (e.g. `[2001:db8:a::123]`)
#define TT_IPV6ZONE BIT(19) ///< ipv6 address contains a zone index/string (e.g. `fe80::1ff:fe23:4567:890a%3`)
#define TT_IPPORT BIT(20) ///< ip address includes a port
/** @} */
/**
 * Token flags
 *
 * NOTE(review): presumably stored in the 16-bit `flags` field of `Tok` —
 * confirm against the implementation.
 *
 * @{
 */
/// Indicates `Tok` originally exceeded `MAXTOKLEN` and was consequently truncated.
// NOTE(review): same bit position as the `TT_IPADDR` subtype flag; this is
// harmless only if the two are never stored in the same field — confirm.
#define TT_TRUNC BIT(15)
/** @} */
/// Lexer punctuation token descriptor (text -> token `subtype`).
/// Descriptors are laid out as a list whose last element has `p` set to `NULL`.
typedef struct Punctuation Punctuation;
struct Punctuation {
const char *p; ///< Punctuation text, `NULL` for last element in punctuation list.
Uint32 id; ///< Punctuation identifier (returned in `Tok->subtype`)
};
// Punctuation ids.
//
// Values meant for `Punctuation::id`, which is reported back in
// `Tok->subtype`. The ids are contiguous, from 1 to 50. The punctuation text
// each id stands for is defined by the `Punctuation` list in use, which is
// not part of this header.
#define P_RSHIFT_ASSIGN 1
#define P_LSHIFT_ASSIGN 2
#define P_PARMS 3
#define P_PRECOMPMERGE 4
#define P_LOGIC_AND 5
#define P_LOGIC_OR 6
#define P_LOGIC_GEQ 7
#define P_LOGIC_LEQ 8
#define P_LOGIC_EQ 9
#define P_LOGIC_UNEQ 10
#define P_MUL_ASSIGN 11
#define P_DIV_ASSIGN 12
#define P_MOD_ASSIGN 13
#define P_ADD_ASSIGN 14
#define P_SUB_ASSIGN 15
#define P_INC 16
#define P_DEC 17
#define P_BIN_AND_ASSIGN 18
#define P_BIN_OR_ASSIGN 19
#define P_BIN_XOR_ASSIGN 20
#define P_RSHIFT 21
#define P_LSHIFT 22
#define P_POINTERREF 23
#define P_MUL 24
#define P_DIV 25
#define P_MOD 26
#define P_ADD 27
#define P_SUB 28
#define P_ASSIGN 29
#define P_BIN_AND 30
#define P_BIN_OR 31
#define P_BIN_XOR 32
#define P_BIN_NOT 33
#define P_LOGIC_NOT 34
#define P_LOGIC_GREATER 35
#define P_LOGIC_LESS 36
#define P_REF 37
#define P_COMMA 38
#define P_SEMICOLON 39
#define P_COLON 40
#define P_QUESTIONMARK 41
#define P_PARENOPEN 42
#define P_PARENCLOSE 43
#define P_BRACEOPEN 44
#define P_BRACECLOSE 45
#define P_SQBRACKETOPEN 46
#define P_SQBRACKETCLOSE 47
#define P_BACKSLASH 48
#define P_PRECOMP 49
#define P_DOLLAR 50
/**
 * \brief Token returned by `Lex`.
 *
 * Contains token text and information.
 *
 * The `text` buffer is deliberately the last member, so that a token may be
 * allocated with less than `MAXTOKLEN` chars of text storage.
 */
typedef struct Tok Tok;
struct Tok {
Uint16 type; ///< Token type, presumably one of the `TT_*` type constants — confirm
Uint16 flags; ///< Token flags, presumably `TT_TRUNC` — confirm
Uint32 subtype; ///< Punctuation id (`Punctuation::id`) or `TT_NUMBER` subtype flags
unsigned linesCrossed; ///< NOTE(review): presumably lines skipped before this token — confirm
unsigned spacesBeforeToken; ///< NOTE(review): presumably whitespace found before this token — confirm
unsigned line; ///< NOTE(review): presumably the line the token was read at — confirm
long long intvalue; ///< NOTE(review): presumably the integer value of a `TT_NUMBER` token — confirm
double floatvalue; ///< NOTE(review): presumably the floating point value of a `TT_NUMBER` token — confirm
Tok *nextToken; ///< Link to another token; usage is not visible in this header
char text[MAXTOKLEN]; // NOTE: last element to allow partial allocation
};
/**
 * Lexer flags, see `SetLexerFlags()`.
 *
 * Every flag owns a distinct bit, so any combination can be requested.
 * Assuming `BIT(n)` is the usual `1 << n`, all of them fit in the 16-bit
 * `flags` field of `Lex` (highest bit in use: 13).
 *
 * @{
 */
/// Disregard lexer errors
#define L_NOERR                BIT(0)
/// Disregard lexer warnings
#define L_NOWARN               BIT(1)
/// Disregard both errors and warnings
#define L_QUIET                (L_NOERR | L_NOWARN)
/// Use console colors when reporting errors and warnings
#define L_COLORED              BIT(2)
/// Parse all tokens as strings, instead of breaking them using full-fledged C rules
#define L_STRONLY              BIT(3)
/// Allow file paths within tokens
#define L_ALLOWPATHS           BIT(4)
/// Do not allow escapes within strings
#define L_NOSTRESC             BIT(5)
/// Do not concatenate consecutive strings
#define L_NOSTRCAT             BIT(6)
/// Concatenate strings separated by a backslash+newline
#define L_ALLOWBACKSLASHSTRCAT BIT(7)
/// Allow multichar literals
#define L_ALLOWMULTICHARLIT    BIT(8)
/// Accepts IP addresses (parsed as `TT_NUMBER`)
#define L_ALLOWIPADDR          BIT(9)
/// IP addresses with port numbers, IPv6 literals or zone ids won't be accepted,
/// only meaningful if used with `L_ALLOWIPADDR`.
#define L_PLAINIPADDRONLY      BIT(10)
/// Allow truncating tokens exceeding `MAXTOKLEN`.
#define L_ALLOWTRUNC           BIT(11)
/// Do not search base `#include` paths (used by PC library).
#define L_NOBASEINCLUDES       BIT(12)
/// Allow special floating point exception tokens (0.#INF, 0.#IND).
// This flag used to be BIT(10), the same bit as `L_PLAINIPADDRONLY`, so
// neither option could be set or tested without the other. It now takes the
// first free bit; every other flag keeps its previous value.
#define L_ALLOWFLOATEXC        BIT(13)
/** @} */
// NOTE: the special callbacks below are integer constants (-1, 0, 1) cast to
// the callback pointer type, not addresses of real functions; presumably the
// lexer compares a handler against them instead of calling it — confirm in
// the implementation. They name `Lex` before its typedef, which is fine
// because macros are only expanded where they are used.
/// Special callback, invokes immediate program termination after reporting a lexer message
#define LEX_QUIT ((void (*)(Lex *, const char *, void *)) -1)
/// Special callback, makes the lexer ignore the warning or error
/// (same behavior as `L_NOERR` and `L_NOWARN`, but as an explicit callback).
#define LEX_IGN ((void (*)(Lex *, const char *, void *)) 0)
/// Special callback, makes the lexer print an error or warning message to `stderr`,
/// doesn't terminate execution.
#define LEX_WARN ((void (*)(Lex *, const char *, void *)) 1)
/**
 * \brief A lexer, breaks text into single tokens, keeping track of the current position.
 *
 * \note This struct should be considered opaque. Member notes below are for
 *       maintainers; client code should go through the functions declared
 *       in this header.
 */
typedef struct Lex Lex;
struct Lex {
char *pos, *lim; ///< Current position and limit of the text; `pos >= lim` counts as end of text
unsigned line; ///< Line counter, presumably the current line (see `BeginLexerSession()`) — confirm
Uint16 flags; ///< Lexer flags (`L_*`), see `SetLexerFlags()`
Boolean8 hasError; ///< Error state, reported by `HasLexerError()`
Boolean8 hasBufferedToken; ///< While set, `IsLexerEndOfFile()` never reports end of file
Rune nr; ///< NOTE(review): purpose is not evident from this header
const Punctuation *puncts; ///< Punctuation list, presumably ended by an entry whose `p` is `NULL` — confirm
void *obj; ///< User pointer registered through `SetLexerErrorFunc()`
void (*Error)(Lex *, const char *, void *); ///< Error callback, possibly `LEX_QUIT`, `LEX_IGN` or `LEX_WARN`
void (*Warn)(Lex *, const char *, void *); ///< Warning callback, same conventions as `Error`
Lex *nextLexer; ///< Link to another lexer; usage is not visible in this header
Tok buf; ///< Token storage, presumably the token placed back by `Lex_UngetToken()` — confirm
char name[MAXTOKLEN]; ///< Presumably the session name given to `BeginLexerSession()` — confirm
};
/**
 * \brief Install the error and warning handlers of a lexer.
 *
 * \param [out] p     A lexer, it is dereferenced unconditionally
 * \param [in]  errf  Handler stored as the lexer `Error` callback,
 *                    `LEX_QUIT`, `LEX_IGN` and `LEX_WARN` share its type
 * \param [in]  warnf Handler stored as the lexer `Warn` callback,
 *                    same conventions as `errf`
 * \param [in]  obj   User pointer stored alongside the handlers, presumably
 *                    handed back as their `void *` argument (TODO confirm)
 */
FORCE_INLINE void SetLexerErrorFunc(Lex *p,
                                    void (*errf)(Lex *, const char *, void *),
                                    void (*warnf)(Lex *, const char *, void *),
                                    void *obj)
{
    p->obj   = obj;
    p->Warn  = warnf;
    p->Error = errf;
}
/**
 * \brief Set parsing session name and initial line number.
 *
 * \param [out] p    A lexer, must not be `NULL`
 * \param [in]  name Name for this parsing session
 * \param [in]  line Initial line number, 0 is implicitly changed to 1
 *
 * \note NOTE(review): `Lex` reserves `MAXTOKLEN` chars for its name, so
 *       longer session names presumably get truncated — confirm.
 */
void BeginLexerSession(Lex *p, const char *name, unsigned line);
/**
 * \brief Setup lexer to parse text, sized.
 *
 * \param [out] p    A lexer, must not be `NULL`
 * \param [in]  text Text to be parsed, must have at least `n` chars
 * \param [in]  n    Number of chars in `text`
 *
 * \note NOTE(review): the lexer is described as non-allocating, so `text` is
 *       presumably referenced rather than copied and has to remain valid
 *       for as long as it is being parsed — confirm.
 */
void SetLexerTextn(Lex *p, const char *text, size_t n);
/**
 * \brief Setup lexer to parse a `NUL` terminated text.
 *
 * Measures `text` with `strlen()` and forwards to `SetLexerTextn()`.
 *
 * \param [out] p    A lexer, must not be `NULL`
 * \param [in]  text `NUL` terminated text to be parsed, must not be `NULL`
 *                   since its length is taken unconditionally
 */
FORCE_INLINE void SetLexerText(Lex *p, const char *text)
{
    // Local declaration, keeps this header from pulling in <string.h>
    EXTERNC size_t strlen(const char *);

    size_t len = strlen(text);

    SetLexerTextn(p, text, len);
}
/**
 * \brief Replace the flags of a lexer.
 *
 * \param [out] p     A lexer, must not be `NULL`
 * \param [in]  flags New flags for the lexer (`L_*`), they overwrite the
 *                    previous ones; only the bits fitting the 16-bit
 *                    `flags` field of `Lex` are retained
 */
FORCE_INLINE void SetLexerFlags(Lex *p, unsigned flags)
{
    p->flags = (Uint16) flags;
}
/**
 * \brief Retrieve current lexer flags.
 *
 * \param [in] p A lexer, it is dereferenced unconditionally
 *
 * \return The `flags` field of `p`, widened to `unsigned`.
 */
FORCE_INLINE unsigned GetLexerFlags(Lex *p)
{
    unsigned current = p->flags;

    return current;
}
/**
 * \brief Trigger an error over a lexer.
 *
 * \param [in,out] p   A lexer
 * \param [in]     fmt Message format followed by its arguments; the
 *                     `CHECK_PRINTF(2, 3)` annotation suggests `printf()`
 *                     conventions (TODO confirm against its definition)
 */
CHECK_PRINTF(2, 3) void LexerError(Lex *p, const char *fmt, ...);
/**
 * \brief Trigger a warning over a lexer.
 *
 * Takes the same arguments as `LexerError()`.
 */
CHECK_PRINTF(2, 3) void LexerWarning(Lex *p, const char *fmt, ...);
/**
 * \brief Test whether a lexer reached the end.
 *
 * The text is considered consumed once the current position reaches the
 * limit, or sits on a `NUL` char. A lexer still holding a buffered token is
 * never at end of file, even when its text is consumed.
 *
 * \param [in] p A lexer, it is dereferenced unconditionally
 */
FORCE_INLINE Boolean IsLexerEndOfFile(Lex *p)
{
    // The position is only dereferenced when it lies before the limit
    Boolean textConsumed = (p->pos >= p->lim) || (*p->pos == '\0');

    return textConsumed && !p->hasBufferedToken;
}
/**
 * \brief Test whether a lexer encountered an error.
 *
 * \param [in] p A lexer, it is dereferenced unconditionally
 *
 * \return The `hasError` field of `p`.
 */
FORCE_INLINE Boolean HasLexerError(Lex *p)
{
    Boolean failed = p->hasError;

    return failed;
}
/**
 * \brief Read and return next token.
 *
 * \param [in,out] p    A lexer, must not be `NULL`
 * \param [out]    dest Storage for the returned token, must not be `NULL`
 *
 * \return If a new token has been read, then `dest->text` is returned,
 *         `NULL` is returned if a parsing error has been encountered,
 *         or no more tokens are available.
 */
char *Lex_ReadToken(Lex *p, Tok *dest);
/**
 * \brief Read and return next token in the same line.
 *
 * This is a variant of `Lex_ReadToken()` useful to implement
 * a C Preprocessor, it avoids parsing spanning more than one line.
 * `\` followed by a newline is recognized and treated as a regular
 * space.
 *
 * \param [in,out] p    A lexer
 * \param [out]    dest Storage for the returned token
 *
 * \return NOTE(review): presumably the same convention as `Lex_ReadToken()`,
 *         with `NULL` also returned when the line holds no more tokens —
 *         confirm.
 */
char *Lex_ReadTokenOnLine(Lex *p, Tok *dest);
/**
 * \brief Expects an integral token, reading and returning its value.
 *
 * \param [in,out] p            A lexer, must not be `NULL`
 * \param [in]     optionalSign Allow an optional `+` or `-` sign before the
 *                              token, if set to `FALSE` only unsigned
 *                              integers are allowed.
 *
 * \return The token value, 0 on error. Since 0 is also a legal value,
 *         use `HasLexerError()` to distinguish between actual 0 and
 *         error value.
 */
long long Lex_ParseInt(Lex *p, Boolean optionalSign);
/**
 * \brief Expects a boolean token, reading and returning its value.
 *
 * \param [in,out] p            A lexer, must not be `NULL`
 * \param [in]     allowNumeric Convert numeric values to booleans, 0 for
 *                              `FALSE`, any other numeric value for `TRUE`
 *
 * \return The boolean value, `FALSE` on error or end of file. Since `FALSE`
 *         is also a legal value, use `HasLexerError()` or
 *         `IsLexerEndOfFile()` to distinguish between actual `FALSE` and
 *         error value.
 */
Boolean Lex_ParseBool(Lex *p, Boolean allowNumeric);
/**
 * \brief Expects a floating point token, reading and returning its value.
 *
 * \param [in,out] p            A lexer, must not be `NULL`
 * \param [in]     optionalSign Allow an optional `+` or `-` sign before the
 *                              token, if set to `FALSE` only non-negative
 *                              values are allowed.
 *
 * \return The float value, 0 on error or end of file. Since 0 is also a
 *         legal value, use `HasLexerError()` or `IsLexerEndOfFile()` to
 *         distinguish between actual 0 and error value.
 */
double Lex_ParseFloat(Lex *p, Boolean optionalSign);
/**
 * \brief Read a one dimensional matrix (vector of length `n`) from `p` into `dest`.
 *
 * Matrix format is:
 * ```
 * (x y z w ...)
 * ```
 *
 * \param [in,out] p    A lexer
 * \param [out]    dest Storage for the parsed values, presumably it must
 *                      hold at least `n` floats (TODO confirm)
 * \param [in]     n    Vector length
 *
 * \return `TRUE` on success, `FALSE` on error.
 */
Boolean Lex_ParseMatrix1(Lex *p, float *dest, size_t n);
/**
 * \brief `Lex_ParseMatrix1()` variant for two dimensional matrixes.
 *
 * Matrix format is:
 * ```
 * ((x0 y0 z0 w0 ...) (x1 y1 z1 w1 ...) ...)
 * ```
 *
 * NOTE(review): this header does not state which of `n` and `m` counts the
 * inner vectors and which their length, nor the layout of `dest`; presumably
 * `dest` must hold `n * m` floats and the return value follows
 * `Lex_ParseMatrix1()` — confirm against the implementation.
 */
Boolean Lex_ParseMatrix2(Lex *p, float *dest, size_t n, size_t m);
/// `Lex_ParseMatrix1()` variant for tridimensional matrixes.
/// NOTE(review): the role of `n`, `m` and `u` and the layout of `dest` are not
/// stated in this header; presumably `dest` must hold `n * m * u` floats — confirm.
Boolean Lex_ParseMatrix3(Lex *p, float *dest, size_t n, size_t m, size_t u);
/// Discard any buffered token, as well as any token still in the text, up to a new line.
void Lex_SkipLine(Lex *p);
/**
 * \brief Skip every token until `tok` is encountered.
 *
 * \param [in,out] p   A lexer, must not be `NULL`
 * \param [in]     tok Token to look for, must not be `NULL`
 *
 * \return `tok` on success, `NULL` on error or end of file.
 *
 * \note The result is a non-`const` `char *` although `tok` is taken as
 *       `const char *`; NOTE(review): presumably it is only meant as a
 *       success indicator and must not be written through — confirm.
 */
char *Lex_SkipUntil(Lex *p, const char *tok);
/**
 * \brief Expect and skip section enclosed within braces.
 *
 * Braced sections are enclosed by punctuation tokens of id `P_BRACEOPEN` and
 * `P_BRACECLOSE`.
 *
 * \param [in,out] p               A lexer, must not be `NULL`
 * \param [in]     parseFirstBrace Whether the function should expect the next
 *                                 token to be the first brace of the section
 *                                 (`TRUE`) or it should assume the first brace
 *                                 has already been parsed (`FALSE`).
 *
 * \return `TRUE` if section was skipped successfully, `FALSE` on error
 *         (either unbalanced braces or unexpected token).
 */
Boolean Lex_SkipBracedSection(Lex *p, Boolean parseFirstBrace);
/**
 * \brief Expect a token, matching and returning it, raises error on mismatch.
 *
 * \param [in,out] p   A lexer, must not be `NULL`
 * \param [in]     tok Token to be expected, must not be `NULL`
 *
 * \return On success `tok` is returned, on error `NULL`.
 *
 * \note The result is a non-`const` `char *` although `tok` is taken as
 *       `const char *`; NOTE(review): presumably it must not be written
 *       through — confirm.
 */
char *Lex_MatchToken(Lex *p, const char *tok);
/**
 * \brief Expect any token, raises an error if none is found.
 *
 * \param [in,out] p    A lexer, must not be `NULL`
 * \param [out]    dest Storage for returned token, must not be `NULL`
 *
 * \return On success `dest->text` is returned, on error `NULL`.
 */
char *Lex_MatchAnyToken(Lex *p, Tok *dest);
/**
 * \brief Expect a token of a specific `type` and `subtype`, raise error on mismatch.
 *
 * \param [in,out] p       A lexer, must not be `NULL`
 * \param [out]    dest    Storage for returned token, must not be `NULL`
 * \param [in]     type    Token type to be expected
 * \param [in]     subtype Subtype mask for the expected token
 *
 * \return On success `dest->text`, `NULL` otherwise.
 */
char *Lex_MatchTokenType(Lex *p, Tok *dest, int type, unsigned subtype);
/**
 * \brief Check whether next token matches `tok`.
 *
 * If token matches it is read from `p` and returned, as in `Lex_ReadToken()`,
 * otherwise `p` is left unaltered (except for parsing errors).
 *
 * \param [in,out] p   A lexer
 * \param [in]     tok Token text to compare the next token against
 *
 * \return NOTE(review): the value returned on mismatch is not stated here,
 *         presumably `NULL` — confirm.
 */
char *Lex_CheckToken(Lex *p, const char *tok);
/**
 * \brief Similar to `Lex_CheckToken()`, but matches by token `type` and `subtype`.
 *
 * \param [in,out] p       A lexer
 * \param [out]    dest    Storage for the token
 * \param [in]     type    Token type to be matched
 * \param [in]     subtype Subtype to be matched, presumably a mask as in
 *                         `Lex_MatchTokenType()` (TODO confirm)
 *
 * \return NOTE(review): not stated by this header, presumably `dest->text`
 *         on match and `NULL` otherwise — confirm.
 */
char *Lex_CheckTokenType(Lex *p, Tok *dest, int type, unsigned subtype);
/**
 * \brief Peek next token from `p` and test whether it matches with `tok`.
 *
 * The next token is never consumed from `p`, the lexer is left unaltered
 * (except for parsing errors).
 *
 * \param [in,out] p   A lexer
 * \param [in]     tok Token text to compare the next token against
 *
 * \return NOTE(review): not stated by this header, presumably non-`NULL`
 *         on match and `NULL` otherwise — confirm.
 */
char *Lex_PeekToken(Lex *p, const char *tok);
/// Similar to `Lex_PeekToken()`, but matches by token `type` and `subtype`.
/// NOTE(review): `dest` presumably receives the peeked token on match — confirm.
char *Lex_PeekTokenType(Lex *p, Tok *dest, int type, unsigned subtype);
/**
 * \brief Place a token back into the lexer.
 *
 * Only one token may be placed back into the lexer at a time,
 * it will be returned back on the next call to `Lex_ReadToken()`.
 *
 * \param [in,out] p   A lexer
 * \param [in]     tok Token to be placed back; NOTE(review): presumably it
 *                     is copied into the lexer, so it does not need to
 *                     outlive the call — confirm.
 */
void Lex_UngetToken(Lex *p, const Tok *tok);
#endif