/* Parser combinators for binary formats. * Copyright (C) 2012 Meredith L. Patterson, Dan "TQ" Hirsch * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, version 2. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef HAMMER_HAMMER__H #define HAMMER_HAMMER__H #include #include #include "allocator.h" #define BYTE_BIG_ENDIAN 0x1 #define BIT_BIG_ENDIAN 0x2 #define BIT_LITTLE_ENDIAN 0x0 #define BYTE_LITTLE_ENDIAN 0x0 typedef int bool; typedef struct parse_state parse_state_t; typedef enum token_type { TT_NONE, TT_BYTES, TT_SINT, TT_UINT, TT_SEQUENCE, TT_USER = 64, TT_ERR, TT_MAX } token_type_t; typedef struct counted_array { size_t capacity; size_t used; arena_t arena; void **elements; } counted_array_t; typedef struct parsed_token { token_type_t token_type; union { struct { const uint8_t *token; size_t len; } bytes; int64_t sint; uint64_t uint; double dbl; float flt; counted_array_t *seq; // a sequence of parsed_token_t's void *user; }; size_t index; char bit_offset; } parsed_token_t; /** * The result of a successful parse. * If a parse fails, the parse result will be NULL. * If a parse is successful but there's nothing there (i.e., if end_p * succeeds) then there's a parse result but its ast is NULL. */ typedef struct parse_result { const parsed_token_t *ast; arena_t arena; } parse_result_t; /** * Type of an action to apply to an AST, used in the action() parser. * It can be any (user-defined) function that takes a parse_result_t* * and returns a parsed_token_t*. (This is so that the user doesn't * have to worry about memory allocation; action() does that for you.) * Note that the tagged union in parsed_token_t* supports user-defined * types, so you can create your own token types (corresponding to, * say, structs) and stuff values for them into the void* in the * tagged union in parsed_token_t. */ typedef parsed_token_t* (*action_t)(parse_result_t *p); /** * Type of a boolean attribute-checking function, used in the * attr_bool() parser. It can be any (user-defined) function that takes * a parse_result_t* and returns true or false. */ typedef bool (*predicate_t)(parse_result_t *p); typedef struct parser { parse_result_t* (*fn)(void *env, parse_state_t *state); void *env; } parser_t; /** * Top-level function to call a parser that has been built over some * piece of input (of known size). */ parse_result_t* parse(const parser_t* parser, const uint8_t* input, size_t length); /** * Given a string, returns a parser that parses that string value. */ const parser_t* token(const uint8_t *str, const size_t len); /** * Given a single character, returns a parser that parses that * character. */ const parser_t* ch(const uint8_t c); /** * Given two single-character bounds, lower and upper, returns a parser * that parses a single character within the range [lower, upper] * (inclusive). */ const parser_t* range(const uint8_t lower, const uint8_t upper); /** * Returns a parser that parses the specified number of bits. sign == * true if signed, false if unsigned. */ const parser_t* bits(size_t len, bool sign); /** * Returns a parser that parses a signed 8-byte integer value. */ const parser_t* int64(); /** * Returns a parser that parses a signed 4-byte integer value. */ const parser_t* int32(); /** * Returns a parser that parses a signed 2-byte integer value. */ const parser_t* int16(); /** * Returns a parser that parses a signed 1-byte integer value. */ const parser_t* int8(); /** * Returns a parser that parses an unsigned 8-byte integer value. */ const parser_t* uint64(); /** * Returns a parser that parses an unsigned 4-byte integer value. */ const parser_t* uint32(); /** * Returns a parser that parses an unsigned 2-byte integer value. */ const parser_t* uint16(); /** * Returns a parser that parses an unsigned 1-byte integer value. */ const parser_t* uint8(); /** * Returns a parser that parses a double-precision floating-point * value. */ const parser_t* float64(); /** * Returns a parser that parses a single-precision floating-point * value. */ const parser_t* float32(); /** * Given another parser, p, returns a parser that skips any whitespace * and then applies p. */ const parser_t* whitespace(const parser_t* p); /** * Given another parser, p, and a function f, returns a parser that * applies p, then applies f to everything in the AST of p's result. */ const parser_t* action(const parser_t* p, const action_t a); /** * Parse a single character *NOT* in the given charset. */ const parser_t* not_in(const uint8_t *charset, int length); /** * A no-argument parser that succeeds if there is no more input to * parse. */ const parser_t* end_p(); /** * This parser always fails. */ const parser_t* nothing_p(); /** * Given a null-terminated list of parsers, apply each parser in order. * The parse succeeds only if all parsers succeed. */ const parser_t* sequence(const parser_t* p, ...) __attribute__((sentinel)); /** * Given an array of parsers, p_array, apply each parser in order. The * first parser to succeed is the result; if no parsers succeed, the * parse fails. */ const parser_t* choice(const parser_t* p, ...) __attribute__((sentinel)); /** * Given two parsers, p1 and p2, this parser succeeds in the following * cases: * - if p1 succeeds and p2 fails * - if both succeed but p1's result is as long as or shorter than p2's */ const parser_t* butnot(const parser_t* p1, const parser_t* p2); /** * Given two parsers, p1 and p2, this parser succeeds in the following * cases: * - if p1 succeeds and p2 fails * - if both succeed but p2's result is shorter than p1's */ const parser_t* difference(const parser_t* p1, const parser_t* p2); /** * Given two parsers, p1 and p2, this parser succeeds if *either* p1 or * p2 succeed, but not if they both do. */ const parser_t* xor(const parser_t* p1, const parser_t* p2); /** * Given a parser, p, this parser succeeds for zero or more repetitions * of p. */ const parser_t* many(const parser_t* p); /** * Given a parser, p, this parser succeeds for one or more repetitions * of p. */ const parser_t* many1(const parser_t* p); /** * Given a parser, p, this parser succeeds for exactly N repetitions * of p. */ const parser_t* repeat_n(const parser_t* p, const size_t n); /** * Given a parser, p, this parser succeeds with the value p parsed or * with an empty result. */ const parser_t* optional(const parser_t* p); /** * Given a parser, p, this parser succeeds if p succeeds, but doesn't * include p's result in the result. */ const parser_t* ignore(const parser_t* p); /** * Given a parser, p, and a parser for a separator, sep, this parser * matches a (possibly empty) list of things that p can parse, * separated by sep. * For example, if p is repeat1(range('0','9')) and sep is ch(','), * sepBy(p, sep) will match a comma-separated list of integers. */ const parser_t* sepBy(const parser_t* p, const parser_t* sep); /** * Given a parser, p, and a parser for a separator, sep, this parser matches a list of things that p can parse, separated by sep. Unlike sepBy, this ensures that the result has at least one element. * For example, if p is repeat1(range('0','9')) and sep is ch(','), sepBy1(p, sep) will match a comma-separated list of integers. */ const parser_t* sepBy1(const parser_t* p, const parser_t* sep); /** * This parser always returns a zero length match, i.e., empty string. */ const parser_t* epsilon_p(); /** * This parser attaches a predicate function, which returns true or * false, to a parser. The function is evaluated over the parser's * result. * The parse only succeeds if the attribute function returns true. */ const parser_t* attr_bool(const parser_t* p, predicate_t pred); /** * The 'and' parser is a predicate. It asserts that a conditional * syntax is satisfied, but consumes no input. * This is useful for lookahead. As an example: * * Suppose you already have a parser, hex_p, that parses numbers in * hexadecimal format (including the leading '0x'). Then * sequence(and(token((const uint8_t*)"0x", 2)), hex_p) * checks to see whether there is a leading "0x", *does not* consume * the "0x", and then applies hex_p to parse the hex-formatted number. * * 'and' succeeds if p succeeds, and fails if p fails. Like 'ignore', * 'and' does not attach a result to the AST. */ const parser_t* and(const parser_t* p); /** * The 'not' parser is a predicate. It asserts that a conditional * syntax is *not* satisfied, and consumes no input. * As a somewhat contrived example: * * Since 'choice' applies its arguments in order, the following parser: * sequence(ch('a'), choice(ch('+'), token((const uint8_t*)"++"), NULL), ch('b'), NULL) * will not parse "a++b", because once choice() has succeeded, it will * not backtrack and try other alternatives if a later parser in the * sequence fails. * Instead, you can force the use of the second alternative by turning * the ch('+') alternative into a sequence with not: * sequence(ch('a'), choice(sequence(ch('+'), not(ch('+')), NULL), token((const uint8_t*)"++")), ch('b'), NULL) * If the input string is "a+b", the first alternative is applied; if * the input string is "a++b", the second alternative is applied. */ const parser_t* not(const parser_t* p); /** * Create a parser that just calls out to another, as yet unknown, * parser. * Note that the inner parser gets bound later, with bind_indirect. * This can be used to create recursive parsers. */ parser_t *indirect(); /** * Set the inner parser of an indirect. See comments on indirect for * details. */ void bind_indirect(parser_t* indirect, parser_t* inner); #endif // #ifndef HAMMER_HAMMER__H