From 3d5e9399c412a0a9191847b949403da0921bd3d7 Mon Sep 17 00:00:00 2001 From: "Meredith L. Patterson" Date: Fri, 18 May 2012 11:44:38 +0200 Subject: [PATCH 01/12] Merged TQ's changes. Started on attr_bool and action. --- src/hammer.c | 11 ++++++++++- src/hammer.h | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/hammer.c b/src/hammer.c index fb3d62a..6f60c95 100644 --- a/src/hammer.c +++ b/src/hammer.c @@ -744,7 +744,6 @@ const parser_t* epsilon_p() { return res; } - static parse_result_t* parse_indirect(void* env, parse_state_t* state) { return do_parse(env, state); } @@ -759,7 +758,17 @@ parser_t* indirect() { return res; } +typedef struct { + predicate_t pred; +} attr_bool_t; + +static parse_result_t* parse_attr_bool(void *env, parse_state_t *state) { + + +} + const parser_t* attr_bool(const parser_t* p, attr_bool_t a) { return &unimplemented; } + const parser_t* and(const parser_t* p) { return &unimplemented; } static parse_result_t* parse_not(void* env, parse_state_t* state) { diff --git a/src/hammer.h b/src/hammer.h index c3c2cf7..915f7ac 100644 --- a/src/hammer.h +++ b/src/hammer.h @@ -78,7 +78,7 @@ typedef struct parse_result { typedef parse_result_t* (*action_t)(parse_result_t *p); /* Type of a boolean attribute-checking function, used in the attr_bool() parser. */ -typedef int (*attr_bool_t)(void *env); +typedef int (*predicate_t)(parse_result_t *p); typedef struct parser { parse_result_t* (*fn)(void *env, parse_state_t *state); From b10a3d8ae9d14140c59f7bd9baa73b250b5d29a0 Mon Sep 17 00:00:00 2001 From: "Meredith L. Patterson" Date: Fri, 18 May 2012 12:18:19 +0200 Subject: [PATCH 02/12] Finished attr_bool, cleaned up header a little. 
--- src/hammer.c | 22 +++++- src/hammer.h | 205 +++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 175 insertions(+), 52 deletions(-) diff --git a/src/hammer.c b/src/hammer.c index 6f60c95..178eeb7 100644 --- a/src/hammer.c +++ b/src/hammer.c @@ -759,15 +759,31 @@ parser_t* indirect() { } typedef struct { + const parser_t *p; predicate_t pred; } attr_bool_t; static parse_result_t* parse_attr_bool(void *env, parse_state_t *state) { - - + attr_bool_t *a = (attr_bool_t*)env; + parse_result_t *res = do_parse(a->p, state); + if (res) { + if (a->pred(res)) + return res; + else + return NULL; + } else + return NULL; } -const parser_t* attr_bool(const parser_t* p, attr_bool_t a) { return &unimplemented; } +const parser_t* attr_bool(const parser_t* p, predicate_t pred) { + parser_t *res = g_new(parser_t, 1); + res->fn = parse_attr_bool; + attr_bool_t *env = g_new(attr_bool_t, 1); + env->p = p; + env->pred = pred; + res->env = (void*)env; + return res; +} const parser_t* and(const parser_t* p) { return &unimplemented; } diff --git a/src/hammer.h b/src/hammer.h index 915f7ac..c16dd49 100644 --- a/src/hammer.h +++ b/src/hammer.h @@ -74,164 +74,271 @@ typedef struct parse_result { arena_t arena; } parse_result_t; -/* Type of an action to apply to an AST, used in the action() parser. */ +/** + * Type of an action to apply to an AST, used in the action() parser. + */ typedef parse_result_t* (*action_t)(parse_result_t *p); -/* Type of a boolean attribute-checking function, used in the attr_bool() parser. */ -typedef int (*predicate_t)(parse_result_t *p); +/** + * Type of a boolean attribute-checking function, used in the + * attr_bool() parser. It can be any (user-defined) function that takes + * a parse_result_t and returns true or false. 
+ */ +typedef bool (*predicate_t)(parse_result_t *p); typedef struct parser { parse_result_t* (*fn)(void *env, parse_state_t *state); void *env; } parser_t; +/** + * Top-level function to call a parser that has been built over some + * piece of input (of known size). + */ parse_result_t* parse(const parser_t* parser, const uint8_t* input, size_t length); -/* Given a string, returns a parser that parses that string value. */ +/** + * Given a string, returns a parser that parses that string value. + */ const parser_t* token(const uint8_t *str, const size_t len); -/* Given a single character, returns a parser that parses that character. */ +/** + * Given a single character, returns a parser that parses that + * character. + */ const parser_t* ch(const uint8_t c); -/* Given two single-character bounds, lower and upper, returns a parser that parses a single character within the range [lower, upper] (inclusive). */ +/** + * Given two single-character bounds, lower and upper, returns a parser + * that parses a single character within the range [lower, upper] + * (inclusive). + */ const parser_t* range(const uint8_t lower, const uint8_t upper); -/* Returns a parser that parses the specified number of bits. sign == true if signed, false if unsigned. */ +/** + * Returns a parser that parses the specified number of bits. sign == + * true if signed, false if unsigned. + */ const parser_t* bits(size_t len, bool sign); -/* Returns a parser that parses a signed 8-byte integer value. */ +/** + * Returns a parser that parses a signed 8-byte integer value. + */ const parser_t* int64(); -/* Returns a parser that parses a signed 4-byte integer value. */ +/** + * Returns a parser that parses a signed 4-byte integer value. + */ const parser_t* int32(); -/* Returns a parser that parses a signed 2-byte integer value. */ +/** + * Returns a parser that parses a signed 2-byte integer value. + */ const parser_t* int16(); -/* Returns a parser that parses a signed 1-byte integer value. 
*/ +/** + * Returns a parser that parses a signed 1-byte integer value. + */ const parser_t* int8(); -/* Returns a parser that parses an unsigned 8-byte integer value. */ +/** + * Returns a parser that parses an unsigned 8-byte integer value. + */ const parser_t* uint64(); -/* Returns a parser that parses an unsigned 4-byte integer value. */ +/** + * Returns a parser that parses an unsigned 4-byte integer value. + */ const parser_t* uint32(); -/* Returns a parser that parses an unsigned 2-byte integer value. */ +/** + * Returns a parser that parses an unsigned 2-byte integer value. + */ const parser_t* uint16(); -/* Returns a parser that parses an unsigned 1-byte integer value. */ +/** + * Returns a parser that parses an unsigned 1-byte integer value. + */ const parser_t* uint8(); -/* Returns a parser that parses a double-precision floating-point value. */ +/** + * Returns a parser that parses a double-precision floating-point + * value. + */ const parser_t* float64(); -/* Returns a parser that parses a single-precision floating-point value. */ +/** + * Returns a parser that parses a single-precision floating-point + * value. + */ const parser_t* float32(); -/* Given another parser, p, returns a parser that skips any whitespace and then applies p. */ +/** + * Given another parser, p, returns a parser that skips any whitespace + * and then applies p. + */ const parser_t* whitespace(const parser_t* p); -/* Given another parser, p, and a function f, returns a parser that applies p, then applies f to everything in the AST of p's result. */ +/** + * Given another parser, p, and a function f, returns a parser that + * applies p, then applies f to everything in the AST of p's result. + */ const parser_t* action(const parser_t* p, const action_t a); -/* Parse a single character *NOT* in charset */ +/** + * Parse a single character *NOT* in the given charset. 
+ */ const parser_t* not_in(const uint8_t *charset, int length); -/* A no-argument parser that succeeds if there is no more input to parse. */ +/** + * A no-argument parser that succeeds if there is no more input to + * parse. + */ const parser_t* end_p(); -/* This parser always fails. */ +/** + * This parser always fails. + */ const parser_t* nothing_p(); -/* Given an null-terminated list of parsers, apply each parser in order. The parse succeeds only if all parsers succeed. */ +/** + * Given a null-terminated list of parsers, apply each parser in order. + * The parse succeeds only if all parsers succeed. + */ const parser_t* sequence(const parser_t* p, ...) __attribute__((sentinel)); -/* Given an array of parsers, p_array, apply each parser in order. The first parser to succeed is the result; if no parsers succeed, the parse fails. */ +/** + * Given an array of parsers, p_array, apply each parser in order. The + * first parser to succeed is the result; if no parsers succeed, the + * parse fails. + */ const parser_t* choice(const parser_t* p, ...) __attribute__((sentinel)); -/* Given two parsers, p1 and p2, this parser succeeds in the following cases: +/** + * Given two parsers, p1 and p2, this parser succeeds in the following + * cases: * - if p1 succeeds and p2 fails * - if both succeed but p1's result is as long as or shorter than p2's */ const parser_t* butnot(const parser_t* p1, const parser_t* p2); -/* Given two parsers, p1 and p2, this parser succeeds in the following cases: +/** + * Given two parsers, p1 and p2, this parser succeeds in the following + * cases: * - if p1 succeeds and p2 fails * - if both succeed but p2's result is shorter than p1's */ const parser_t* difference(const parser_t* p1, const parser_t* p2); -/* Given two parsers, p1 and p2, this parser succeeds if *either* p1 or p2 succeed, but not if they both do. +/** + * Given two parsers, p1 and p2, this parser succeeds if *either* p1 or + * p2 succeed, but not if they both do. 
*/ const parser_t* xor(const parser_t* p1, const parser_t* p2); -/* Given a parser, p, this parser succeeds for zero or more repetitions of p. */ +/** + * Given a parser, p, this parser succeeds for zero or more repetitions + * of p. + */ const parser_t* many(const parser_t* p); -/* Given a parser, p, this parser succeeds for one or more repetitions of p. */ +/** + * Given a parser, p, this parser succeeds for one or more repetitions + * of p. + */ const parser_t* many1(const parser_t* p); -/* Given a parser, p, this parser succeeds for exactly N repetitions of p. */ +/** + * Given a parser, p, this parser succeeds for exactly N repetitions + * of p. + */ const parser_t* repeat_n(const parser_t* p, const size_t n); -/* Given a parser, p, this parser succeeds with the value p parsed or with an empty result. */ +/** + * Given a parser, p, this parser succeeds with the value p parsed or + * with an empty result. + */ const parser_t* optional(const parser_t* p); -/* Given a parser, p, this parser succeeds if p succeeds, but doesn't include p's result in the result. */ +/** + * Given a parser, p, this parser succeeds if p succeeds, but doesn't + * include p's result in the result. + */ const parser_t* ignore(const parser_t* p); -/* Given a parser, p, and a parser for a separator, sep, this parser matches a (possibly empty) list of things that p can parse, separated by sep. - * For example, if p is repeat1(range('0','9')) and sep is ch(','), sepBy(p, sep) will match a comma-separated list of integers. +/** + * Given a parser, p, and a parser for a separator, sep, this parser + * matches a (possibly empty) list of things that p can parse, + * separated by sep. + * For example, if p is repeat1(range('0','9')) and sep is ch(','), + * sepBy(p, sep) will match a comma-separated list of integers. 
*/ const parser_t* sepBy(const parser_t* p, const parser_t* sep); -/* Given a parser, p, and a parser for a separator, sep, this parser matches a list of things that p can parse, separated by sep. Unlike sepBy, this ensures that the result has at least one element. +/** + * Given a parser, p, and a parser for a separator, sep, this parser matches a list of things that p can parse, separated by sep. Unlike sepBy, this ensures that the result has at least one element. * For example, if p is repeat1(range('0','9')) and sep is ch(','), sepBy1(p, sep) will match a comma-separated list of integers. */ const parser_t* sepBy1(const parser_t* p, const parser_t* sep); -/* This parser always returns a zero length match, i.e., empty string. */ +/** + * This parser always returns a zero length match, i.e., empty string. + */ const parser_t* epsilon_p(); -/* This parser attaches an attribute function, which returns true or false, to a parser. The function is evaluated over the parser's result AST. +/** + * This parser attaches a predicate function, which returns true or + * false, to a parser. The function is evaluated over the parser's + * result. * The parse only succeeds if the attribute function returns true. */ -const parser_t* attr_bool(const parser_t* p, const attr_bool_t a); +const parser_t* attr_bool(const parser_t* p, predicate_t pred); -/* The 'and' parser is a predicate. It asserts that a conditional syntax is satisfied, but consumes no input. +/** + * The 'and' parser is a predicate. It asserts that a conditional + * syntax is satisfied, but consumes no input. * This is useful for lookahead. As an example: * - * Suppose you already have a parser, hex_p, that parses numbers in hexadecimal format (including the leading '0x'). Then + * Suppose you already have a parser, hex_p, that parses numbers in + * hexadecimal format (including the leading '0x'). 
Then * sequence(and(token((const uint8_t*)"0x", 2)), hex_p) - * checks to see whether there is a leading "0x", *does not* consume the "0x", and then applies hex_p to parse the hex-formatted number. + * checks to see whether there is a leading "0x", *does not* consume + * the "0x", and then applies hex_p to parse the hex-formatted number. * - * 'and' succeeds if p succeeds, and fails if p fails. Like 'ignore', 'and' does not attach a result to the AST. + * 'and' succeeds if p succeeds, and fails if p fails. Like 'ignore', + * 'and' does not attach a result to the AST. */ const parser_t* and(const parser_t* p); -/* The 'not' parser is a predicate. It asserts that a conditional syntax is *not* satisfied, and consumes no input. +/** + * The 'not' parser is a predicate. It asserts that a conditional + * syntax is *not* satisfied, and consumes no input. * As a somewhat contrived example: * * Since 'choice' applies its arguments in order, the following parser: * sequence(ch('a'), choice(ch('+'), token((const uint8_t*)"++"), NULL), ch('b'), NULL) - * will not parse "a++b", because once choice() has succeeded, it will not backtrack and try other alternatives if a later parser in the sequence - * fails. - * Instead, you can force the use of the second alternative by turning the ch('+') alternative into a sequence with not: + * will not parse "a++b", because once choice() has succeeded, it will + * not backtrack and try other alternatives if a later parser in the + * sequence fails. + * Instead, you can force the use of the second alternative by turning + * the ch('+') alternative into a sequence with not: * sequence(ch('a'), choice(sequence(ch('+'), not(ch('+')), NULL), token((const uint8_t*)"++")), ch('b'), NULL) - * If the input string is "a+b", the first alternative is applied; if the input string is "a++b", the second alternative is applied. 
+ * If the input string is "a+b", the first alternative is applied; if + * the input string is "a++b", the second alternative is applied. */ const parser_t* not(const parser_t* p); /** - * Create a parser that just calls out to another, as yet unknown, parser. + * Create a parser that just calls out to another, as yet unknown, + * parser. * Note that the inner parser gets bound later, with bind_indirect. * This can be used to create recursive parsers. */ parser_t *indirect(); /** - * Set the inner parser of an indirect. See comments on indirect for details. + * Set the inner parser of an indirect. See comments on indirect for + * details. */ void bind_indirect(parser_t* indirect, parser_t* inner); From 642df1f2384565961072614c36d529552d117c35 Mon Sep 17 00:00:00 2001 From: "Meredith L. Patterson" Date: Fri, 18 May 2012 12:35:40 +0200 Subject: [PATCH 03/12] Finished action. On to testing. --- src/hammer.c | 24 +++++++++++++++++++++++- src/hammer.h | 2 ++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/hammer.c b/src/hammer.c index 178eeb7..6ce17e1 100644 --- a/src/hammer.c +++ b/src/hammer.c @@ -297,7 +297,29 @@ const parser_t* whitespace(const parser_t* p) { return ret; } -const parser_t* action(const parser_t* p, const action_t a) { return &unimplemented; } +typedef struct { + const parser_t *p; + action_t action; +} parse_action_t; + +static parse_result_t* parse_action(void *env, parse_state_t *state) { + parse_action_t *a = (parse_action_t*)env; + if (a->p && a->action) { + parse_result_t *ret = a->action(do_parse(a->p, state)); + return ret; + } else // either the parser's missing or the action's missing + return NULL; +} + +const parser_t* action(const parser_t* p, const action_t a) { + parser_t *res = g_new(parser_t, 1); + res->fn = parse_action; + parse_action_t *env = g_new(parse_action_t, 1); + env->p = p; + env->action = a; + res->env = (void*)env; + return res; +} static parse_result_t* parse_charset(void *env, parse_state_t *state) { 
uint8_t in = read_bits(&state->input_stream, 8, false); diff --git a/src/hammer.h b/src/hammer.h index c16dd49..b1cbc19 100644 --- a/src/hammer.h +++ b/src/hammer.h @@ -36,6 +36,7 @@ typedef enum token_type { TT_SINT, TT_UINT, TT_SEQUENCE, + TT_USER = 64, TT_ERR, TT_MAX } token_type_t; @@ -59,6 +60,7 @@ typedef struct parsed_token { double dbl; float flt; counted_array_t *seq; // a sequence of parsed_token_t's + void *user; }; size_t index; char bit_offset; From 348e22dcfa41e68ce7f502398b4198f3d2cf96c7 Mon Sep 17 00:00:00 2001 From: "Meredith L. Patterson" Date: Fri, 18 May 2012 12:37:36 +0200 Subject: [PATCH 04/12] Spelling nazi strikes again --- src/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/internal.h b/src/internal.h index 4cd5bf6..09970e0 100644 --- a/src/internal.h +++ b/src/internal.h @@ -26,7 +26,7 @@ #else #define assert_message(check, message) do { \ if (!(check)) \ - errx(1, "Assertation failed (programmer error): %s", message); \ + errx(1, "Assertion failed (programmer error): %s", message); \ } while(0) #endif #define false 0 From 3a0068d92b1c54dfdaff5788910b4f7e00b47e5f Mon Sep 17 00:00:00 2001 From: "Meredith L. Patterson" Date: Fri, 18 May 2012 12:49:40 +0200 Subject: [PATCH 05/12] Changed resulttype of action_t to parsed_token_t; users shouldn't have to assign arenas in results. 
--- src/hammer.c | 6 +++--- src/hammer.h | 20 ++++++++++++++------ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/hammer.c b/src/hammer.c index 6ce17e1..3631a04 100644 --- a/src/hammer.c +++ b/src/hammer.c @@ -305,8 +305,8 @@ typedef struct { static parse_result_t* parse_action(void *env, parse_state_t *state) { parse_action_t *a = (parse_action_t*)env; if (a->p && a->action) { - parse_result_t *ret = a->action(do_parse(a->p, state)); - return ret; + parsed_token_t *tok = a->action(do_parse(a->p, state)); + return make_result(state, tok); } else // either the parser's missing or the action's missing return NULL; } @@ -971,7 +971,7 @@ static void test_whitespace(void) { g_check_parse_failed(whitespace_, "_a", 2); } -parse_result_t* upcase(parse_result_t *p) { +parsed_token_t* upcase(parse_result_t *p) { return NULL; // shut compiler up } diff --git a/src/hammer.h b/src/hammer.h index b1cbc19..f09af09 100644 --- a/src/hammer.h +++ b/src/hammer.h @@ -66,10 +66,11 @@ typedef struct parsed_token { char bit_offset; } parsed_token_t; - - -/* If a parse fails, the parse result will be NULL. - * If a parse is successful but there's nothing there (i.e., if end_p succeeds) then there's a parse result but its ast is NULL. +/** + * The result of a successful parse. + * If a parse fails, the parse result will be NULL. + * If a parse is successful but there's nothing there (i.e., if end_p + * succeeds) then there's a parse result but its ast is NULL. */ typedef struct parse_result { const parsed_token_t *ast; @@ -78,13 +79,20 @@ typedef struct parse_result { /** * Type of an action to apply to an AST, used in the action() parser. + * It can be any (user-defined) function that takes a parse_result_t* + * and returns a parsed_token_t*. (This is so that the user doesn't + * have to worry about memory allocation; action() does that for you.) 
+ * Note that the tagged union in parsed_token_t* supports user-defined + * types, so you can create your own token types (corresponding to, + * say, structs) and stuff values for them into the void* in the + * tagged union in parsed_token_t. */ -typedef parse_result_t* (*action_t)(parse_result_t *p); +typedef parsed_token_t* (*action_t)(parse_result_t *p); /** * Type of a boolean attribute-checking function, used in the * attr_bool() parser. It can be any (user-defined) function that takes - * a parse_result_t and returns true or false. + * a parse_result_t* and returns true or false. */ typedef bool (*predicate_t)(parse_result_t *p); From d7582e62af06526a5aeacc1b6ea323535ef403a9 Mon Sep 17 00:00:00 2001 From: "Meredith L. Patterson" Date: Tue, 22 May 2012 00:57:33 +0200 Subject: [PATCH 06/12] Started on DNS --- examples/dns.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 examples/dns.c diff --git a/examples/dns.c b/examples/dns.c new file mode 100644 index 0000000..93b056c --- /dev/null +++ b/examples/dns.c @@ -0,0 +1,36 @@ +#include "../hammer.h" + +bool is_zero(parse_result_t *p) { + +} + +int main(int argc, char **argv) { + + const parser_t dns_header = sequence(bits(16), // ID + bits(1), // QR + bits(4), // opcode + bits(1), // AA + bits(1), // TC + bits(1), // RD + bits(1), // RA + ignore(attr_bool(bits(3), is_zero)), // Z + bits(4), // RCODE + uint16(), // QDCOUNT + uint16(), // ANCOUNT + uint16(), // NSCOUNT + uint16(), // ARCOUNT + NULL); + + const parser_t *dns_question = sequence(; + + bool validate_dns(parse_result_t *p) { + + } + + const parser_t *dns_message = attr_bool(sequence(dns_header, + many(dns_question), + many(dns_answer), + many(dns_authority), + many(dns_additional), + NULL), + validate_dns); From dd3852fdb1d8c4169b5a0de405dd95d8442dcbba Mon Sep 17 00:00:00 2001 From: "Meredith L. 
Patterson" Date: Tue, 22 May 2012 02:40:48 +0200 Subject: [PATCH 07/12] Added length_value parser to use with DNS --- src/hammer.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/src/hammer.c b/src/hammer.c index 0d9cf8c..136a07c 100644 --- a/src/hammer.c +++ b/src/hammer.c @@ -846,6 +846,41 @@ const parser_t* attr_bool(const parser_t* p, predicate_t pred) { return res; } +typedef struct { + parser_t *length; + parser_t *value; +} lv_t; + +static parse_result_t* parse_length_value(void *env, parse_state_t *state) { + lv_t *lv = (lv_t*)env; + parse_result_t *len = do_parse(lv->length, state); + if (!len) + return NULL; + if (len->ast->token_type != TT_UINT) + errx(1, "Length parser must return an unsigned integer"); + parser_t epsilon_local = { + .fn = parse_epsilon, + .env = NULL + }; + repeat_t repeat = { + .p = lv->value, + .sep = &epsilon_local, + .count = len->ast->uint, + .min_p = false + } + return parse_many((void*)repeat, state); +} + +const parser_t* length_value(const parser_t* length, const parser_t* value) { + parser_t *res = g_new(parser_t, 1); + res->fn = parse_length_value; + lv_t *env = g_new(lv_t, 1); + env->length = length; + env->value = value; + res->env = (void*)env; + return res; +} + const parser_t* and(const parser_t* p) { return &unimplemented; } static parse_result_t* parse_not(void* env, parse_state_t* state) { @@ -1010,7 +1045,17 @@ static void test_whitespace(void) { } parsed_token_t* upcase(parse_result_t *p) { - return NULL; // shut compiler up + switch(p->ast->token_type) { + case TT_SEQUENCE: + for (size_t i=0; i<p->ast->seq->used; ++i) { + upcase((parse_result_t*)p->ast->seq->elements[i]); + return p->ast; + } + case TT_UINT: + // if i'm a char, upcase me + default: + return p->ast; + } } static void test_action(void) { From 6eb93fb655fbff2890d640646909423723936e22 Mon Sep 17 00:00:00 2001 From: "Meredith L. 
Patterson" Date: Tue, 22 May 2012 02:40:59 +0200 Subject: [PATCH 08/12] ALL THE DOCSTRINGS --- src/hammer.h | 103 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 85 insertions(+), 18 deletions(-) diff --git a/src/hammer.h b/src/hammer.h index f09af09..95da5e2 100644 --- a/src/hammer.h +++ b/src/hammer.h @@ -109,12 +109,16 @@ parse_result_t* parse(const parser_t* parser, const uint8_t* input, size_t lengt /** * Given a string, returns a parser that parses that string value. + * + * Result token type: TT_BYTES */ const parser_t* token(const uint8_t *str, const size_t len); /** * Given a single character, returns a parser that parses that * character. + * + * Result token type: TT_UINT */ const parser_t* ch(const uint8_t c); @@ -122,98 +126,118 @@ const parser_t* ch(const uint8_t c); * Given two single-character bounds, lower and upper, returns a parser * that parses a single character within the range [lower, upper] * (inclusive). + * + * Result token type: TT_UINT */ const parser_t* range(const uint8_t lower, const uint8_t upper); /** * Returns a parser that parses the specified number of bits. sign == * true if signed, false if unsigned. + * + * Result token type: TT_SINT if sign == true, TT_UINT if sign == false */ const parser_t* bits(size_t len, bool sign); /** * Returns a parser that parses a signed 8-byte integer value. + * + * Result token type: TT_SINT */ const parser_t* int64(); /** * Returns a parser that parses a signed 4-byte integer value. + * + * Result token type: TT_SINT */ const parser_t* int32(); /** * Returns a parser that parses a signed 2-byte integer value. + * + * Result token type: TT_SINT */ const parser_t* int16(); /** * Returns a parser that parses a signed 1-byte integer value. + * + * Result token type: TT_SINT */ const parser_t* int8(); /** * Returns a parser that parses an unsigned 8-byte integer value. 
+ * + * Result token type: TT_UINT */ const parser_t* uint64(); /** * Returns a parser that parses an unsigned 4-byte integer value. + * + * Result token type: TT_UINT */ const parser_t* uint32(); /** * Returns a parser that parses an unsigned 2-byte integer value. + * + * Result token type: TT_UINT */ const parser_t* uint16(); /** * Returns a parser that parses an unsigned 1-byte integer value. + * + * Result token type: TT_UINT */ const parser_t* uint8(); -/** - * Returns a parser that parses a double-precision floating-point - * value. - */ -const parser_t* float64(); - -/** - * Returns a parser that parses a single-precision floating-point - * value. - */ -const parser_t* float32(); - /** * Given another parser, p, returns a parser that skips any whitespace * and then applies p. + * + * Result token type: p's result type */ const parser_t* whitespace(const parser_t* p); /** * Given another parser, p, and a function f, returns a parser that * applies p, then applies f to everything in the AST of p's result. + * + * Result token type: any */ const parser_t* action(const parser_t* p, const action_t a); /** * Parse a single character *NOT* in the given charset. + * + * Result token type: TT_UINT */ const parser_t* not_in(const uint8_t *charset, int length); /** * A no-argument parser that succeeds if there is no more input to * parse. + * + * Result token type: None. The parse_result_t exists but its AST is NULL. */ const parser_t* end_p(); /** * This parser always fails. + * + * Result token type: NULL. Always. */ const parser_t* nothing_p(); /** * Given a null-terminated list of parsers, apply each parser in order. * The parse succeeds only if all parsers succeed. + * + * Result token type: TT_SEQUENCE */ const parser_t* sequence(const parser_t* p, ...) __attribute__((sentinel)); @@ -221,6 +245,8 @@ const parser_t* sequence(const parser_t* p, ...) __attribute__((sentinel)); * Given an array of parsers, p_array, apply each parser in order. 
The * first parser to succeed is the result; if no parsers succeed, the * parse fails. + * + * Result token type: The type of the first successful parser's result. */ const parser_t* choice(const parser_t* p, ...) __attribute__((sentinel)); @@ -229,6 +255,8 @@ const parser_t* choice(const parser_t* p, ...) __attribute__((sentinel)); * cases: * - if p1 succeeds and p2 fails * - if both succeed but p1's result is as long as or shorter than p2's + * + * Result token type: p1's result type. */ const parser_t* butnot(const parser_t* p1, const parser_t* p2); @@ -237,42 +265,56 @@ const parser_t* butnot(const parser_t* p1, const parser_t* p2); * cases: * - if p1 succeeds and p2 fails * - if both succeed but p2's result is shorter than p1's + * + * Result token type: p1's result type. */ const parser_t* difference(const parser_t* p1, const parser_t* p2); /** * Given two parsers, p1 and p2, this parser succeeds if *either* p1 or * p2 succeed, but not if they both do. + * + * Result token type: The type of the result of whichever parser succeeded. */ const parser_t* xor(const parser_t* p1, const parser_t* p2); /** * Given a parser, p, this parser succeeds for zero or more repetitions * of p. + * + * Result token type: TT_SEQUENCE */ const parser_t* many(const parser_t* p); /** * Given a parser, p, this parser succeeds for one or more repetitions * of p. + * + * Result token type: TT_SEQUENCE */ const parser_t* many1(const parser_t* p); /** * Given a parser, p, this parser succeeds for exactly N repetitions * of p. + * + * Result token type: TT_SEQUENCE */ const parser_t* repeat_n(const parser_t* p, const size_t n); /** * Given a parser, p, this parser succeeds with the value p parsed or * with an empty result. + * + * Result token type: If p succeeded, the type of its result; if not, TT_NONE. */ const parser_t* optional(const parser_t* p); /** * Given a parser, p, this parser succeeds if p succeeds, but doesn't * include p's result in the result. 
+ * + * Result token type: None. The parse_result_t exists but its AST is NULL. */ const parser_t* ignore(const parser_t* p); @@ -282,31 +324,50 @@ const parser_t* ignore(const parser_t* p); * separated by sep. * For example, if p is repeat1(range('0','9')) and sep is ch(','), * sepBy(p, sep) will match a comma-separated list of integers. + * + * Result token type: TT_SEQUENCE */ const parser_t* sepBy(const parser_t* p, const parser_t* sep); /** * Given a parser, p, and a parser for a separator, sep, this parser matches a list of things that p can parse, separated by sep. Unlike sepBy, this ensures that the result has at least one element. * For example, if p is repeat1(range('0','9')) and sep is ch(','), sepBy1(p, sep) will match a comma-separated list of integers. + * + * Result token type: TT_SEQUENCE */ const parser_t* sepBy1(const parser_t* p, const parser_t* sep); /** * This parser always returns a zero length match, i.e., empty string. + * + * Result token type: None. The parse_result_t exists but its AST is NULL. */ const parser_t* epsilon_p(); +/** + * This parser applies its first argument to read an unsigned integer + * value, then applies its second argument that many times. length + * should parse an unsigned integer value; this is checked at runtime. + * Specifically, the token_type of the returned token must be TT_UINT. + * In future we might relax this to include TT_USER but don't count on it. + * + * Result token type: TT_SEQUENCE + */ +const parser_t* length_value(const parser_t* length, const parser_t* value); + /** * This parser attaches a predicate function, which returns true or * false, to a parser. The function is evaluated over the parser's * result. * The parse only succeeds if the attribute function returns true. + * + * Result token type: p's result type if pred succeeded, NULL otherwise. */ const parser_t* attr_bool(const parser_t* p, predicate_t pred); /** - * The 'and' parser is a predicate. 
It asserts that a conditional - * syntax is satisfied, but consumes no input. + * The 'and' parser asserts that a conditional syntax is satisfied, + * but doesn't consume that conditional syntax. * This is useful for lookahead. As an example: * * Suppose you already have a parser, hex_p, that parses numbers in @@ -315,14 +376,15 @@ const parser_t* attr_bool(const parser_t* p, predicate_t pred); * checks to see whether there is a leading "0x", *does not* consume * the "0x", and then applies hex_p to parse the hex-formatted number. * - * 'and' succeeds if p succeeds, and fails if p fails. Like 'ignore', - * 'and' does not attach a result to the AST. + * 'and' succeeds if p succeeds, and fails if p fails. + * + * Result token type: None. The parse_result_t exists but its AST is NULL. */ const parser_t* and(const parser_t* p); /** - * The 'not' parser is a predicate. It asserts that a conditional - * syntax is *not* satisfied, and consumes no input. + * The 'not' parser asserts that a conditional syntax is *not* + * satisfied, but doesn't consume that conditional syntax. * As a somewhat contrived example: * * Since 'choice' applies its arguments in order, the following parser: @@ -335,6 +397,8 @@ const parser_t* and(const parser_t* p); * sequence(ch('a'), choice(sequence(ch('+'), not(ch('+')), NULL), token((const uint8_t*)"++")), ch('b'), NULL) * If the input string is "a+b", the first alternative is applied; if * the input string is "a++b", the second alternative is applied. + * + * Result token type: None. The parse_result_t exists but its AST is NULL. */ const parser_t* not(const parser_t* p); @@ -343,6 +407,9 @@ const parser_t* not(const parser_t* p); * parser. * Note that the inner parser gets bound later, with bind_indirect. * This can be used to create recursive parsers. + * + * Result token type: the type of whatever parser is bound to it with + * bind_indirect(). 
*/ parser_t *indirect(); From 2ccb9d4a363e3d3f094c3ba221c0cb2345eaec93 Mon Sep 17 00:00:00 2001 From: "Meredith L. Patterson" Date: Tue, 22 May 2012 02:41:33 +0200 Subject: [PATCH 09/12] DNS first draft mostly done, just need to write validator and action for full msg --- examples/dns.c | 81 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 65 insertions(+), 16 deletions(-) diff --git a/examples/dns.c b/examples/dns.c index 93b056c..027d675 100644 --- a/examples/dns.c +++ b/examples/dns.c @@ -1,36 +1,85 @@ #include "../hammer.h" bool is_zero(parse_result_t *p) { + return (0 == p->ast->uint); +} +bool validate_dns(parse_result_t *p) { + } int main(int argc, char **argv) { - const parser_t dns_header = sequence(bits(16), // ID - bits(1), // QR - bits(4), // opcode - bits(1), // AA - bits(1), // TC - bits(1), // RD - bits(1), // RA - ignore(attr_bool(bits(3), is_zero)), // Z - bits(4), // RCODE + const parser_t dns_header = sequence(bits(16, false), // ID + bits(1, false), // QR + bits(4, false), // opcode + bits(1, false), // AA + bits(1, false), // TC + bits(1, false), // RD + bits(1, false), // RA + ignore(attr_bool(bits(3, false), is_zero)), // Z + bits(4, false), // RCODE uint16(), // QDCOUNT uint16(), // ANCOUNT uint16(), // NSCOUNT uint16(), // ARCOUNT NULL); - const parser_t *dns_question = sequence(; + const parser_t *dns_question = sequence(length_value(uint8(), uint8()), // QNAME + uint16(), // QTYPE + uint16(), // QCLASS + NULL); - bool validate_dns(parse_result_t *p) { + const parser_t *letter = choice(range('a', 'z'), + range('A', 'Z'), + NULL); + + const parser_t *let_dig = choice(letter, + range('0', '9'), + NULL); + + const parser_t *ldh_str = many1(choice(let_dig, + ch('-'), + NULL)); + + const parser_t *label = sequence(letter, + optional(sequence(optional(ldh_str), + let_dig, + NULL)), + NULL); + + /** + * You could write it like this ... 
+ * parser_t *indirect_subdomain = indirect(); + * const parser_t *subdomain = choice(label, + * sequence(indirect_subdomain, + * ch('.'), + * label, + * NULL), + * NULL); + * bind_indirect(indirect_subdomain, subdomain); + * + * ... but this is easier and equivalent + */ + + parser_t *subdomain = sepBy1(label, ch('.')); + + const parser_t *domain = choice(subdomain, + ch(' '), + NULL); + + const parser_t *dns_rr = sequence(domain, // NAME + uint16(), // TYPE + uint16(), // CLASS + uint32(), // TTL + length_value(uint16(), uint8()) // RDLENGTH+RDATA + NULL); - } const parser_t *dns_message = attr_bool(sequence(dns_header, - many(dns_question), - many(dns_answer), - many(dns_authority), - many(dns_additional), + dns_question, + many(dns_rr), + end_p(), NULL), validate_dns); +} From e2af24fe809f090c5f03689539df96a7498fa2ec Mon Sep 17 00:00:00 2001 From: "Meredith L. Patterson" Date: Tue, 22 May 2012 02:55:00 +0200 Subject: [PATCH 10/12] action() test fully written. But it segfaults. Debugging. 
--- src/hammer.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/hammer.c b/src/hammer.c index 136a07c..4b177a8 100644 --- a/src/hammer.c +++ b/src/hammer.c @@ -847,8 +847,8 @@ const parser_t* attr_bool(const parser_t* p, predicate_t pred) { } typedef struct { - parser_t *length; - parser_t *value; + const parser_t *length; + const parser_t *value; } lv_t; static parse_result_t* parse_length_value(void *env, parse_state_t *state) { @@ -867,8 +867,8 @@ static parse_result_t* parse_length_value(void *env, parse_state_t *state) { .sep = &epsilon_local, .count = len->ast->uint, .min_p = false - } - return parse_many((void*)repeat, state); + }; + return parse_many(&repeat, state); } const parser_t* length_value(const parser_t* length, const parser_t* value) { @@ -1044,22 +1044,35 @@ static void test_whitespace(void) { g_check_parse_failed(whitespace_, "_a", 2); } +#include <ctype.h> + parsed_token_t* upcase(parse_result_t *p) { switch(p->ast->token_type) { case TT_SEQUENCE: for (size_t i=0; i<p->ast->seq->used; ++i) { upcase((parse_result_t*)p->ast->seq->elements[i]); - return p->ast; + return (parsed_token_t*)p->ast; } case TT_UINT: - // if i'm a char, upcase me + { + parsed_token_t *ret = (parsed_token_t*)p->ast; + ret->uint = toupper(ret->uint); + return ret; + } default: - return p->ast; + return (parsed_token_t*)p->ast; } } static void test_action(void) { - const parser_t *action_ = action(sequence(choice(ch('a'), ch('A'), NULL), choice(ch('b'), ch('B'), NULL), NULL), upcase); + const parser_t *action_ = action(sequence(choice(ch('a'), + ch('A'), + NULL), + choice(ch('b'), + ch('B'), + NULL), + NULL), + upcase); g_check_parse_ok(action_, "ab", 2, "(u0x41, u0x42)"); g_check_parse_ok(action_, "AB", 2, "(u0x41, u0x42)"); From f921ece53fa6dfa96aab0dd2a7ff203032f29a98 Mon Sep 17 00:00:00 2001 From: "Meredith L. Patterson" Date: Tue, 22 May 2012 03:57:27 +0200 Subject: [PATCH 11/12] action() works. Will finish DNS tomorrow. 
--- src/hammer.c | 40 ++++++++++++++++++++++++++++------------ src/hammer.h | 2 +- src/pprint.c | 4 ++++ 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/src/hammer.c b/src/hammer.c index 4b177a8..1114acb 100644 --- a/src/hammer.c +++ b/src/hammer.c @@ -344,8 +344,10 @@ typedef struct { static parse_result_t* parse_action(void *env, parse_state_t *state) { parse_action_t *a = (parse_action_t*)env; if (a->p && a->action) { - parsed_token_t *tok = a->action(do_parse(a->p, state)); - return make_result(state, tok); + parse_result_t *tmp = do_parse(a->p, state); + //parsed_token_t *tok = a->action(do_parse(a->p, state)); + const parsed_token_t *tok = a->action(tmp); + return make_result(state, (parsed_token_t*)tok); } else // either the parser's missing or the action's missing return NULL; } @@ -1046,21 +1048,35 @@ static void test_whitespace(void) { #include <ctype.h> -parsed_token_t* upcase(parse_result_t *p) { +const parsed_token_t* upcase(parse_result_t *p) { switch(p->ast->token_type) { case TT_SEQUENCE: - for (size_t i=0; i<p->ast->seq->used; ++i) { - upcase((parse_result_t*)p->ast->seq->elements[i]); - return (parsed_token_t*)p->ast; + { + parsed_token_t *ret = a_new_(p->arena, parsed_token_t, 1); + counted_array_t *seq = carray_new_sized(p->arena, p->ast->seq->used); + ret->token_type = TT_SEQUENCE; + for (size_t i=0; i<p->ast->seq->used; ++i) { + if (TT_UINT == ((parsed_token_t*)p->ast->seq->elements[i])->token_type) { + parsed_token_t *tmp = a_new_(p->arena, parsed_token_t, 1); + tmp->token_type = TT_UINT; + tmp->uint = toupper(((parsed_token_t*)p->ast->seq->elements[i])->uint); + carray_append(seq, tmp); + } else { + carray_append(seq, p->ast->seq->elements[i]); + } + } + ret->seq = seq; + return (const parsed_token_t*)ret; } case TT_UINT: { - parsed_token_t *ret = (parsed_token_t*)p->ast; - ret->uint = toupper(ret->uint); - return ret; + parsed_token_t *ret = a_new_(p->arena, parsed_token_t, 1); + ret->token_type = TT_UINT; + ret->uint = toupper(p->ast->uint); 
+ return (const parsed_token_t*)ret; } default: - return (parsed_token_t*)p->ast; + return p->ast; } } @@ -1074,8 +1090,8 @@ static void test_action(void) { NULL), upcase); - g_check_parse_ok(action_, "ab", 2, "(u0x41, u0x42)"); - g_check_parse_ok(action_, "AB", 2, "(u0x41, u0x42)"); + g_check_parse_ok(action_, "ab", 2, "(u0x41 u0x42)"); + g_check_parse_ok(action_, "AB", 2, "(u0x41 u0x42)"); } static void test_not_in(void) { diff --git a/src/hammer.h b/src/hammer.h index 95da5e2..a0b93e5 100644 --- a/src/hammer.h +++ b/src/hammer.h @@ -87,7 +87,7 @@ typedef struct parse_result { * say, structs) and stuff values for them into the void* in the * tagged union in parsed_token_t. */ -typedef parsed_token_t* (*action_t)(parse_result_t *p); +typedef const parsed_token_t* (*action_t)(parse_result_t *p); /** * Type of a boolean attribute-checking function, used in the diff --git a/src/pprint.c b/src/pprint.c index 250ecfb..ac0d02d 100644 --- a/src/pprint.c +++ b/src/pprint.c @@ -94,6 +94,10 @@ static inline void append_buf_c(struct result_buf *buf, char v) { static void unamb_sub(const parsed_token_t* tok, struct result_buf *buf) { char* tmpbuf; int len; + if (!tok) { + append_buf(buf, "NULL", 4); + return; + } switch (tok->token_type) { case TT_NONE: append_buf(buf, "null", 4); From 3bb11afeac069ee706eb904a6f6d2fd3fd489cb5 Mon Sep 17 00:00:00 2001 From: "Meredith L. 
Patterson" Date: Tue, 22 May 2012 16:10:40 +0200 Subject: [PATCH 12/12] int64 test was wrong, fixed it --- src/hammer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hammer.c b/src/hammer.c index 1114acb..d36a744 100644 --- a/src/hammer.c +++ b/src/hammer.c @@ -965,7 +965,7 @@ static void test_range(void) { static void test_int64(void) { const parser_t *int64_ = int64(); - g_check_parse_ok(int64_, "\xff\xff\xff\xfe\x00\x00\x00\x00", 8, "s-0x200000000"); + g_check_parse_ok(int64_, "\xff\xff\xff\xfe\x00\x00\x00\x00", 8, "s0x200000000"); g_check_parse_failed(int64_, "\xff\xff\xff\xfe\x00\x00\x00", 7); }