2012-04-23 00:02:42 +01:00
/* Parser combinators for binary formats.
* Copyright ( C ) 2012 Meredith L . Patterson , Dan " TQ " Hirsch
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation , version 2.
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
*
* You should have received a copy of the GNU General Public License
* along with this program ; if not , write to the Free Software
* Foundation , Inc . , 51 Franklin Street , Fifth Floor , Boston , MA 02110 - 1301 , USA .
*/
2012-04-23 19:39:44 +01:00
# ifndef HAMMER_HAMMER__H
# define HAMMER_HAMMER__H
2012-04-22 04:47:08 +01:00
# include <glib.h>
# include <stdint.h>
/* The state of the parser.
 *
 * Members:
 *   input  - the entire string being parsed
 *   index  - current position in input
 *   length - size of input
 *   cache  - a hash table describing the state of the parse, including
 *            partial parse_results. It's a hash table from
 *            parser_cache_key_t to parse_state_t.
 *
 * Note: input, index and length are fields of input_stream_t, which
 * parse_state_t embeds by value as `input_stream`; cache is a field of
 * parse_state_t itself.
 */
2012-04-23 19:39:44 +01:00
/* Endianness flags.
 *
 * The byte-order and bit-order "big endian" flags occupy different bits, so
 * one of each can be OR-ed together. Both "little endian" flags are zero,
 * i.e. little endian is what you get when the matching big-endian bit is
 * not set.
 * NOTE(review): presumably these are the values stored in
 * input_stream_t.endianness - confirm against the implementation.
 */
#define BYTE_BIG_ENDIAN 0x1
#define BIT_BIG_ENDIAN 0x2
#define BIT_LITTLE_ENDIAN 0x0
#define BYTE_LITTLE_ENDIAN 0x0
2012-04-23 19:39:44 +01:00
2012-05-11 23:38:45 +01:00
/* Project-local boolean type (used, for example, by the `sign` flag of
 * bits()).
 *
 * It is only defined when nothing else already provides `bool`:
 *   - C++ and C23 make `bool` a keyword, so the typedef would not compile;
 *   - <stdbool.h> defines `bool` as a macro for _Bool, which would turn this
 *     line into the invalid `typedef int _Bool;`.
 * NOTE(review): when the guard skips the typedef, `bool` is no longer an
 * `int`; make sure the library and its callers are built the same way.
 */
#if !defined(__cplusplus) && !defined(bool) && \
    !(defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L)
typedef int bool;
#endif
2012-04-29 01:45:52 +01:00
/* The input being parsed together with the current read position in it.
 *
 * This should be considered to be a really big value type.
 */
typedef struct input_stream {
  const uint8_t *input; /* the entire string being parsed */
  size_t index;         /* current position in input */
  size_t length;        /* size of input */
  char bit_offset;      /* NOTE(review): presumably the bit position inside
                           input[index] - confirm */
  char endianness;      /* NOTE(review): presumably an OR of the BYTE_ and
                           BIT_ endianness flags - confirm */
  char overrun;         /* NOTE(review): presumably set once a read runs past
                           the end of input - confirm */
} input_stream_t;
/* The state of the parser: the parse cache plus the input stream. */
typedef struct parse_state {
  GHashTable *cache;           /* hash table describing the state of the
                                  parse, including partial parse results */
  input_stream_t input_stream; /* held by value, not through a pointer */
} parse_state_t;
2012-04-22 04:47:08 +01:00
2012-05-01 03:21:14 +01:00
/* Tag stored in parsed_token_t.token_type.
 *
 * The enumerators are numbered consecutively from zero; TT_MAX is the last
 * one (NOTE(review): presumably a count/upper bound rather than a real token
 * kind - confirm).
 */
typedef enum token_type {
  TT_NONE = 0,
  TT_BYTES = 1,
  TT_SINT = 2,
  TT_UINT = 3,
  TT_SEQUENCE = 4,
  TT_MAX = 5
} token_type_t;
2012-05-01 00:33:47 +01:00
typedef struct parsed_token {
2012-05-01 03:21:14 +01:00
token_type_t token_type ;
union {
struct {
const uint8_t * token ;
size_t len ;
} bytes ;
int64_t sint ;
uint64_t uint ;
2012-05-11 23:38:45 +01:00
double dbl ;
float flt ;
2012-05-01 03:21:14 +01:00
GSequence * seq ;
} ;
2012-05-01 00:33:47 +01:00
} parsed_token_t ;
2012-05-03 01:40:23 +01:00
/* If a parse fails, the parse result will be NULL.
* If a parse is successful but there ' s nothing there ( i . e . , if end_p succeeds ) then there ' s a parse result but its ast is NULL .
*/
2012-04-22 23:40:25 +01:00
typedef struct parse_result {
2012-05-01 03:21:14 +01:00
const parsed_token_t * ast ;
2012-04-22 23:40:25 +01:00
} parse_result_t ;
2012-05-12 15:49:46 +01:00
/* Callback type taken by the action() parser: an action to apply to an AST.
 * It receives a parse result and hands back a parse result. */
typedef parse_result_t *(*action_t)(parse_result_t *result);
/* Callback type taken by the attr_bool() parser: a boolean
 * attribute-checking function. It receives an opaque pointer and returns an
 * int that is used as a truth value. */
typedef int (*attr_bool_t)(void *context);
2012-04-22 23:40:25 +01:00
typedef struct parser {
2012-05-01 00:33:47 +01:00
parse_result_t * ( * fn ) ( void * env , parse_state_t * state ) ;
void * env ;
2012-04-22 23:40:25 +01:00
} parser_t ;
2012-05-03 01:58:09 +01:00
/* Applies parser to input, a buffer of length bytes, and returns the parse
 * result.
 * NOTE(review): presumably returns NULL when the parse fails - confirm
 * against the implementation. */
parse_result_t *parse(const parser_t *parser, const uint8_t *input, size_t length);

/* Given a string, returns a parser that parses that string value.
 * str is not required to be NUL-terminated as far as this prototype shows;
 * len gives the number of bytes. */
const parser_t *token(const uint8_t *str, size_t len);

/* Given a single character, returns a parser that parses that character. */
const parser_t *ch(uint8_t c);

/* Given two single-character bounds, lower and upper, returns a parser that
 * parses a single character within the range [lower, upper] (inclusive). */
const parser_t *range(uint8_t lower, uint8_t upper);
2012-05-01 00:33:47 +01:00
2012-05-11 23:38:45 +01:00
/* Returns a parser that parses the specified number of bits.
 * sign is nonzero if the value is signed, zero if it is unsigned. */
const parser_t *bits(size_t len, bool sign);

/* The parsers below take no arguments; they are declared with (void) so that
 * the compiler rejects calls that pass any. */

/* Returns a parser that parses a signed 8-byte integer value. */
const parser_t *int64(void);

/* Returns a parser that parses a signed 4-byte integer value. */
const parser_t *int32(void);

/* Returns a parser that parses a signed 2-byte integer value. */
const parser_t *int16(void);

/* Returns a parser that parses a signed 1-byte integer value. */
const parser_t *int8(void);

/* Returns a parser that parses an unsigned 8-byte integer value. */
const parser_t *uint64(void);

/* Returns a parser that parses an unsigned 4-byte integer value. */
const parser_t *uint32(void);

/* Returns a parser that parses an unsigned 2-byte integer value. */
const parser_t *uint16(void);

/* Returns a parser that parses an unsigned 1-byte integer value. */
const parser_t *uint8(void);

/* Returns a parser that parses a double-precision floating-point value. */
const parser_t *float64(void);

/* Returns a parser that parses a single-precision floating-point value. */
const parser_t *float32(void);
2012-05-01 00:33:47 +01:00
/* Given another parser, p, returns a parser that skips any whitespace and then applies p. */
2012-04-30 03:44:10 +01:00
const parser_t * whitespace ( const parser_t * p ) ;
2012-05-01 00:33:47 +01:00
/* Given another parser, p, and a function f, returns a parser that applies p, then applies f to everything in the AST of p's result. */
2012-05-12 15:49:46 +01:00
const parser_t * action ( const parser_t * p , const action_t a ) ;
2012-05-01 00:33:47 +01:00
2012-05-04 21:25:26 +01:00
/* Parse a single character *NOT* in charset */
2012-05-11 23:42:21 +01:00
const parser_t * not_in ( const uint8_t * charset , int length ) ;
2012-05-01 00:33:47 +01:00
/* A no-argument parser that succeeds if there is no more input to parse. */
2012-04-30 03:44:10 +01:00
const parser_t * end_p ( ) ;
2012-05-01 00:33:47 +01:00
/* This parser always fails. */
2012-04-30 03:44:10 +01:00
const parser_t * nothing_p ( ) ;
2012-05-01 00:45:46 +01:00
2012-05-12 00:40:54 +01:00
/* Given a NULL-terminated list of parsers, apply each parser in order. The
 * parse succeeds only if all parsers succeed.
 * The argument list must end with NULL; the sentinel attribute lets the
 * compiler warn when it is missing. */
const parser_t *sequence(const parser_t *p, ...) __attribute__((sentinel));

/* Given a NULL-terminated list of parsers, apply each parser in order. The
 * first parser to succeed is the result; if no parsers succeed, the parse
 * fails.
 * The argument list must end with NULL; the sentinel attribute lets the
 * compiler warn when it is missing. */
const parser_t *choice(const parser_t *p, ...) __attribute__((sentinel));
2012-05-01 03:59:49 +01:00
2012-05-03 02:31:22 +01:00
/* Given two parsers, p1 and p2, this parser succeeds in the following cases:
* - if p1 succeeds and p2 fails
2012-05-12 00:24:56 +01:00
* - if both succeed but p1 ' s result is as long as or shorter than p2 ' s
2012-05-03 02:31:22 +01:00
*/
2012-04-30 03:44:10 +01:00
const parser_t * butnot ( const parser_t * p1 , const parser_t * p2 ) ;
2012-05-03 02:31:22 +01:00
/* Given two parsers, p1 and p2, this parser succeeds in the following cases:
* - if p1 succeeds and p2 fails
* - if both succeed but p2 ' s result is shorter than p1 ' s
*/
2012-04-30 03:44:10 +01:00
const parser_t * difference ( const parser_t * p1 , const parser_t * p2 ) ;
2012-05-03 02:31:22 +01:00
/* Given two parsers, p1 and p2, this parser succeeds if *either* p1 or p2 succeed, but not if they both do.
*/
2012-04-30 03:44:10 +01:00
const parser_t * xor ( const parser_t * p1 , const parser_t * p2 ) ;
2012-05-03 02:31:22 +01:00
2012-05-11 23:38:45 +01:00
/* Given a parser, p, this parser succeeds for zero or more repetitions of p. */
2012-04-30 03:44:10 +01:00
const parser_t * repeat0 ( const parser_t * p ) ;
2012-05-11 23:38:45 +01:00
/* Given a parser, p, this parser succeeds for one or more repetitions of p. */
2012-04-30 03:44:10 +01:00
const parser_t * repeat1 ( const parser_t * p ) ;
2012-05-11 23:38:45 +01:00
/* Given a parser, p, this parser succeeds for exactly N repetitions of p. */
2012-04-30 03:44:10 +01:00
const parser_t * repeat_n ( const parser_t * p , const size_t n ) ;
2012-05-11 23:38:45 +01:00
/* Given a parser, p, this parser succeeds with the value p parsed or with an empty result. */
2012-04-30 03:44:10 +01:00
const parser_t * optional ( const parser_t * p ) ;
2012-05-11 23:38:45 +01:00
/* Given a parser, p, this parser succeeds if p succeeds, but doesn't include p's result in the result. */
const parser_t * ignore ( const parser_t * p ) ;
2012-05-12 15:49:46 +01:00
/* Given a parser, p, and a parser for a separator, sep, this parser matches a list of things that p can parse, separated by sep.
* For example , if p is repeat1 ( range ( ' 0 ' , ' 9 ' ) ) and sep is ch ( ' , ' ) , list ( p , sep ) will match a comma - separated list of integers .
*/
const parser_t * list ( const parser_t * p , const parser_t * sep ) ;
/* This parser always returns a zero length match, i.e., empty string. */
2012-04-30 03:44:10 +01:00
const parser_t * epsilon_p ( ) ;
2012-05-12 15:49:46 +01:00
/* This parser attaches an attribute function, which returns true or false,
 * to a parser. The function is evaluated over the parser's result AST.
 * The parse only succeeds if the attribute function returns true.
 * (The attribute function has type attr_bool_t: it takes a void * and
 * returns an int.)
 */
const parser_t *attr_bool(const parser_t *p, attr_bool_t a);

/* The 'and' parser is a predicate. It asserts that a conditional syntax is
 * satisfied, but consumes no input.
 * This is useful for lookahead. As an example:
 *
 * Suppose you already have a parser, hex_p, that parses numbers in
 * hexadecimal format (including the leading '0x'). Then
 *   sequence(and(token((const uint8_t *)"0x", 2)), hex_p, NULL)
 * checks to see whether there is a leading "0x", *does not* consume the
 * "0x", and then applies hex_p to parse the hex-formatted number.
 *
 * 'and' succeeds if p succeeds, and fails if p fails. Like 'ignore', 'and'
 * does not attach a result to the AST.
 */
const parser_t *and(const parser_t *p);

/* The 'not' parser is a predicate. It asserts that a conditional syntax is
 * *not* satisfied, and consumes no input.
 * As a somewhat contrived example:
 *
 * Since 'choice' applies its arguments in order, the following parser:
 *   sequence(ch('a'),
 *            choice(ch('+'), token((const uint8_t *)"++", 2), NULL),
 *            ch('b'), NULL)
 * will not parse "a++b", because once choice() has succeeded, it will not
 * backtrack and try other alternatives if a later parser in the sequence
 * fails.
 * Instead, you can force the use of the second alternative by turning the
 * ch('+') alternative into a sequence with not:
 *   sequence(ch('a'),
 *            choice(sequence(ch('+'), not(ch('+')), NULL),
 *                   token((const uint8_t *)"++", 2), NULL),
 *            ch('b'), NULL)
 * If the input string is "a+b", the first alternative is applied; if the
 * input string is "a++b", the second alternative is applied.
 */
const parser_t *not(const parser_t *p);
2012-04-22 15:30:49 +01:00
2012-04-23 19:39:44 +01:00
# endif // #ifndef HAMMER_HAMMER__H