diff --git a/src/hammer.c b/src/hammer.c index 0aed6a8..fb3d62a 100644 --- a/src/hammer.c +++ b/src/hammer.c @@ -22,6 +22,7 @@ #include #include #include +#include #define a_new_(arena, typ, count) ((typ*)arena_malloc((arena), sizeof(typ)*(count))) #define a_new(typ, count) a_new_(state->arena, typ, count) @@ -35,39 +36,170 @@ guint djbhash(const uint8_t *buf, size_t len) { return hash; } -parse_result_t* do_parse(const parser_t* parser, parse_state_t *state) { - // TODO(thequux): add caching here. - parser_cache_key_t *key; - key = a_new(parser_cache_key_t, 1); - memset(key, 0, sizeof(*key)); - key->input_pos = state->input_stream; - key->parser = parser; +parser_cache_value_t* recall(parser_cache_key_t *k, parse_state_t *state) { + parser_cache_value_t *cached = g_hash_table_lookup(state->cache, k); + head_t *head = g_hash_table_lookup(state->recursion_heads, k); + if (!head) { // No heads found + return cached; + } else { // Some heads found + if (!cached && head->head_parser != k->parser && !g_slist_find(head->involved_set, k->parser)) { + // Nothing in the cache, and the key parser is not involved + parse_result_t *tmp = g_new(parse_result_t, 1); + tmp->ast = NULL; tmp->arena = state->arena; + parser_cache_value_t *ret = g_new(parser_cache_value_t, 1); + ret->value_type = PC_RIGHT; ret->right = tmp; + return ret; + } + if (g_slist_find(head->eval_set, k->parser)) { + // Something is in the cache, and the key parser is in the eval set. Remove the key parser from the eval set of the head. + head->eval_set = g_slist_remove_all(head->eval_set, k->parser); + parse_result_t *tmp_res = k->parser->fn(k->parser->env, state); + if (tmp_res) + tmp_res->arena = state->arena; + // we know that cached has an entry here, modify it + cached->value_type = PC_RIGHT; + cached->right = tmp_res; + } + return cached; + } +} + +/* Setting up the left recursion. We have the LR for the rule head; + * we modify the involved_sets of all LRs in the stack, until we + * see the current parser again. + */ + +void setupLR(const parser_t *p, GQueue *stack, LR_t *rec_detect) { + if (!rec_detect->head) { + head_t *some = g_new(head_t, 1); + some->head_parser = p; some->involved_set = NULL; some->eval_set = NULL; + rec_detect->head = some; + } + size_t i = 0; + LR_t *lr = g_queue_peek_nth(stack, i); + while (lr && lr->rule != p) { + lr->head = rec_detect->head; + lr->head->involved_set = g_slist_prepend(lr->head->involved_set, (gpointer)lr->rule); + } +} + +/* If recall() returns NULL, we need to store a dummy failure in the cache and compute the + * future parse. + */ + +parse_result_t* grow(parser_cache_key_t *k, parse_state_t *state, head_t *head) { + // Store the head into the recursion_heads + g_hash_table_replace(state->recursion_heads, k, head); + parser_cache_value_t *old_cached = g_hash_table_lookup(state->cache, k); + if (!old_cached || PC_LEFT == old_cached->value_type) + errx(1, "impossible match"); + parse_result_t *old_res = old_cached->right; - // check to see if there is already a result for this object... - if (g_hash_table_contains(state->cache, &key)) { - // it exists! - // TODO(thequux): handle left recursion case - return g_hash_table_lookup(state->cache, &key); + // reset the eval_set of the head of the recursion at each beginning of growth + head->eval_set = head->involved_set; + parse_result_t *tmp_res; + if (k->parser) { + tmp_res = k->parser->fn(k->parser->env, state); + if (tmp_res) + tmp_res->arena = state->arena; + } else + tmp_res = NULL; + if (tmp_res) { + if ((old_res->ast->index < tmp_res->ast->index) || + (old_res->ast->index == tmp_res->ast->index && old_res->ast->bit_offset < tmp_res->ast->bit_offset)) { + parser_cache_value_t *v = g_new(parser_cache_value_t, 1); + v->value_type = PC_RIGHT; v->right = tmp_res; + g_hash_table_replace(state->cache, k, v); + return grow(k, state, head); + } else { + // we're done with growing, we can remove data from the recursion head + g_hash_table_remove(state->recursion_heads, k); + parser_cache_value_t *cached = g_hash_table_lookup(state->cache, k); + if (cached && PC_RIGHT == cached->value_type) { + return cached->right; + } else { + errx(1, "impossible match"); + } + } } else { - // It doesn't exist... run the - parse_result_t *res; + g_hash_table_remove(state->recursion_heads, k); + return old_res; + } +} + +parse_result_t* lr_answer(parser_cache_key_t *k, parse_state_t *state, LR_t *growable) { + if (growable->head) { + if (growable->head->head_parser != k->parser) { + // not the head rule, so not growing + return growable->seed; + } + else { + // update cache + parser_cache_value_t *v = g_new(parser_cache_value_t, 1); + v->value_type = PC_RIGHT; v->right = growable->seed; + g_hash_table_replace(state->cache, k, v); + if (!growable->seed) + return NULL; + else + return grow(k, state, growable->head); + } + } else { + errx(1, "lrAnswer with no head"); + } +} + +/* Warth's recursion. Hi Alessandro! */ +parse_result_t* do_parse(const parser_t* parser, parse_state_t *state) { + parser_cache_key_t *key = a_new(parser_cache_key_t, 1); + key->input_pos = state->input_stream; key->parser = parser; + parser_cache_value_t *m = recall(key, state); + // check to see if there is already a result for this object... + if (!m) { + // It doesn't exist, so create a dummy result to cache + LR_t *base = a_new(LR_t, 1); + base->seed = NULL; base->rule = parser; base->head = NULL; + g_queue_push_head(state->lr_stack, base); + // cache it + parser_cache_value_t *dummy = a_new(parser_cache_value_t, 1); + dummy->value_type = PC_LEFT; dummy->left = base; + g_hash_table_replace(state->cache, key, dummy); + // parse the input + parse_result_t *tmp_res; if (parser) { - res = parser->fn(parser->env, state); - if (res) - res->arena = state->arena; + tmp_res = parser->fn(parser->env, state); + if (tmp_res) + tmp_res->arena = state->arena; } else - res = NULL; + tmp_res = NULL; if (state->input_stream.overrun) - res = NULL; // overrun is always failure. - // update the cache - g_hash_table_replace(state->cache, &key, res); + return NULL; // overrun is always failure. #ifdef CONSISTENCY_CHECK - if (!res) { + if (!tmp_res) { state->input_stream = INVALID; - state->input_stream.input = key.input_pos.input; + state->input_stream.input = key->input_pos.input; } #endif - return res; + // the base variable has passed equality tests with the cache + g_queue_pop_head(state->lr_stack); + // setupLR, used below, mutates the LR to have a head if appropriate, so we check to see if we have one + if (NULL == base->head) { + parser_cache_value_t *right = a_new(parser_cache_value_t, 1); + right->value_type = PC_RIGHT; right->right = tmp_res; + g_hash_table_replace(state->cache, key, right); + return tmp_res; + } else { + base->seed = tmp_res; + parse_result_t *res = lr_answer(key, state, base); + return res; + } + } else { + // it exists! + if (PC_LEFT == m->value_type) { + setupLR(parser, state->lr_stack, m->left); + return m->left->seed; // BUG: this might not be correct + } else { + return m->right; + } } } @@ -611,6 +743,22 @@ const parser_t* epsilon_p() { res->env = NULL; return res; } + + +static parse_result_t* parse_indirect(void* env, parse_state_t* state) { + return do_parse(env, state); +} +void bind_indirect(parser_t* indirect, parser_t* inner) { + indirect->env = inner; +} + +parser_t* indirect() { + parser_t *res = g_new(parser_t, 1); + res->fn = parse_indirect; + res->env = NULL; + return res; +} + const parser_t* attr_bool(const parser_t* p, attr_bool_t a) { return &unimplemented; } const parser_t* and(const parser_t* p) { return &unimplemented; } @@ -651,8 +799,13 @@ parse_result_t* parse(const parser_t* parser, const uint8_t* input, size_t lengt parse_state->input_stream.overrun = 0; parse_state->input_stream.endianness = BIT_BIG_ENDIAN | BYTE_BIG_ENDIAN; parse_state->input_stream.length = length; + parse_state->lr_stack = g_queue_new(); + parse_state->recursion_heads = g_hash_table_new(cache_key_hash, + cache_key_equal); parse_state->arena = arena; parse_result_t *res = do_parse(parser, parse_state); + g_queue_free(parse_state->lr_stack); + g_hash_table_destroy(parse_state->recursion_heads); // tear down the parse state g_hash_table_destroy(parse_state->cache); if (!res) @@ -851,7 +1004,7 @@ static void test_xor(void) { static void test_many(void) { const parser_t *many_ = many(choice(ch('a'), ch('b'), NULL)); - for (int i = 0; i < 100; i++) { + for (int i = 0; i < 10000; i++) { g_check_parse_ok(many_, "adef", 4, "(s0x61)"); g_check_parse_ok(many_, "bdef", 4, "(s0x62)"); g_check_parse_ok(many_, "aabbabadef", 10, "(s0x61 s0x61 s0x62 s0x62 s0x61 s0x62 s0x61)"); diff --git a/src/hammer.h b/src/hammer.h index adbfcc7..c3c2cf7 100644 --- a/src/hammer.h +++ b/src/hammer.h @@ -20,36 +20,15 @@ #include #include #include "allocator.h" -/* The state of the parser. - * - * Members: - * input - the entire string being parsed - * index - current position in input - * length - size of input - * cache - a hash table describing the state of the parse, including partial parse_results. It's a hash table from parser_cache_key_t to parse_state_t. - * - */ + #define BYTE_BIG_ENDIAN 0x1 #define BIT_BIG_ENDIAN 0x2 #define BIT_LITTLE_ENDIAN 0x0 #define BYTE_LITTLE_ENDIAN 0x0 typedef int bool; -typedef struct input_stream { - // This should be considered to be a really big value type. - const uint8_t *input; - size_t index; - size_t length; - char bit_offset; - char endianness; - char overrun; -} input_stream_t; - -typedef struct parse_state { - GHashTable *cache; - input_stream_t input_stream; - arena_t arena; -} parse_state_t; + +typedef struct parse_state parse_state_t; typedef enum token_type { TT_NONE, @@ -81,6 +60,8 @@ typedef struct parsed_token { float flt; counted_array_t *seq; // a sequence of parsed_token_t's }; + size_t index; + char bit_offset; } parsed_token_t; @@ -242,4 +223,16 @@ const parser_t* and(const parser_t* p); */ const parser_t* not(const parser_t* p); +/** + * Create a parser that just calls out to another, as yet unknown, parser. + * Note that the inner parser gets bound later, with bind_indirect. + * This can be used to create recursive parsers. + */ +parser_t *indirect(); + +/** + * Set the inner parser of an indirect. See comments on indirect for details. + */ +void bind_indirect(parser_t* indirect, parser_t* inner); + #endif // #ifndef HAMMER_HAMMER__H diff --git a/src/internal.h b/src/internal.h index 6f50079..4cd5bf6 100644 --- a/src/internal.h +++ b/src/internal.h @@ -32,26 +32,90 @@ #define false 0 #define true 1 +typedef struct input_stream { + // This should be considered to be a really big value type. + const uint8_t *input; + size_t index; + size_t length; + char bit_offset; + char endianness; + char overrun; +} input_stream_t; + +/* The state of the parser. + * + * Members: + * cache - a hash table describing the state of the parse, including partial parse_results. It's a hash table from parser_cache_key_t to parser_cache_value_t. + * input_stream - the input stream at this state. + * arena - the arena that has been allocated for the parse this state is in. + * lr_stack - a stack of LRs, used in Warth's recursion + * recursion_heads - table of recursion heads. Keys are parse_cache_key_t's with only an input_state_t (parser can be NULL), values are head_t. + * + */ + +struct parse_state { + GHashTable *cache; + input_stream_t input_stream; + arena_t arena; + GQueue *lr_stack; + GHashTable *recursion_heads; +}; + +/* The (location, parser) tuple used to key the cache. + */ typedef struct parser_cache_key { input_stream_t input_pos; const parser_t *parser; } parser_cache_key_t; +/* A value in the cache is either of value Left or Right (this is a + * holdover from Scala, which used Either here). Left corresponds to + * LR_t, which is for left recursion; Right corresponds to + * parse_result_t. + */ + typedef enum parser_cache_value_type { - PC_BASE, - PC_IN_RECURSION, - PC_LRESULT, - PC_RESULT + PC_LEFT, + PC_RIGHT } parser_cache_value_type_t; + +/* A recursion head. + * + * Members: + * head_parser - the parse rule that started this recursion + * involved_set - A list of rules (parser_t's) involved in the recursion + * eval_set - + */ +typedef struct head { + const parser_t *head_parser; + GSList *involved_set; + GSList *eval_set; +} head_t; + + +/* A left recursion. + * + * Members: + * seed - + * rule - + * head - + */ +typedef struct LR { + parse_result_t *seed; + const parser_t *rule; + head_t *head; +} LR_t; + +/* Tagged union for values in the cache: either LR's (Left) or + * parse_result_t's (Right). + */ typedef struct parser_cache_value { parser_cache_value_type_t value_type; union { - int base; - parse_result_t *in_recursion; - parse_result_t *lresult; - parse_result_t *result; + LR_t *left; + parse_result_t *right; }; } parser_cache_value_t; diff --git a/src/pprint.c b/src/pprint.c index c7f58c9..32f7b99 100644 --- a/src/pprint.c +++ b/src/pprint.c @@ -20,6 +20,7 @@ #include #include #include "hammer.h" +#include typedef struct pp_state { int delta; @@ -110,10 +111,12 @@ static void unamb_sub(const parsed_token_t* tok, struct result_buf *buf) { case TT_SINT: len = asprintf(&tmpbuf, "u%#lx", tok->sint); append_buf(buf, tmpbuf, len); + free(tmpbuf); break; case TT_UINT: len = asprintf(&tmpbuf, "s%#lx", tok->uint); append_buf(buf, tmpbuf, len); + free(tmpbuf); break; case TT_ERR: append_buf(buf, "ERR", 3);