From 2845a9391e90fab720d7bb62dfdfdf92f291759e Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Thu, 3 Sep 2015 15:03:01 +0200 Subject: [PATCH 01/21] add API and backend hooks for iterative (chunked) parsing --- src/hammer.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++- src/hammer.h | 23 ++++++++++++++++++++ src/internal.h | 18 +++++++++++++++ 3 files changed, 99 insertions(+), 1 deletion(-) diff --git a/src/hammer.c b/src/hammer.c index 443c77b..991008d 100644 --- a/src/hammer.c +++ b/src/hammer.c @@ -43,6 +43,7 @@ typedef struct { +#define DEFAULT_ENDIANNESS (BIT_BIG_ENDIAN | BYTE_BIG_ENDIAN) HParseResult* h_parse(const HParser* parser, const uint8_t* input, size_t length) { return h_parse__m(&system_allocator, parser, input, length); @@ -53,7 +54,7 @@ HParseResult* h_parse__m(HAllocator* mm__, const HParser* parser, const uint8_t* .index = 0, .bit_offset = 0, .overrun = 0, - .endianness = BIT_BIG_ENDIAN | BYTE_BIG_ENDIAN, + .endianness = DEFAULT_ENDIANNESS, .length = length, .input = input }; @@ -96,3 +97,59 @@ int h_compile__m(HAllocator* mm__, HParser* parser, HParserBackend backend, cons parser->backend = backend; return ret; } + + +HSuspendedParser* h_parse_start(const HParser* parser) { + return h_parse_start__m(&system_allocator, parser); +} +HSuspendedParser* h_parse_start__m(HAllocator* mm__, const HParser* parser) { + if(!backends[parser->backend]->parse_start) + return NULL; + + // allocate and init suspended state + HSuspendedParser *s = h_new(HSuspendedParser, 1); + if(!s) + return NULL; + s->mm__ = mm__; + s->parser = parser; + s->backend_state = NULL; + s->endianness = DEFAULT_ENDIANNESS; + + // backend-specific initialization + // should allocate s->backend_state + backends[parser->backend]->parse_start(s); + + return s; +} + +bool h_parse_chunk(HSuspendedParser* s, const uint8_t* input, size_t length) { + assert(backends[s->parser->backend]->parse_chunk != NULL); + + // input + HInputStream input_stream = { + .index = 0, + .bit_offset = 0, + .overrun = 0, + .endianness = s->endianness, + .length = length, + .input = input + }; + + // process chunk + backends[s->parser->backend]->parse_chunk(s, &input_stream); + s->endianness = input_stream.endianness; + + return !input_stream.overrun; // parser wants no more input? done. +} + +HParseResult* h_parse_finish(HSuspendedParser* s) { + assert(backends[s->parser->backend]->parse_finish != NULL); + + HAllocator *mm__ = s->mm__; + + HParseResult *r = backends[s->parser->backend]->parse_finish(s); + // NB: backend should have freed backend_state + h_free(s); + + return r; +} diff --git a/src/hammer.h b/src/hammer.h index 42c7345..1be297c 100644 --- a/src/hammer.h +++ b/src/hammer.h @@ -140,6 +140,8 @@ typedef struct HParser_ { HCFChoice *desugared; /* if the parser can be desugared, its desugared form */ } HParser; +typedef struct HSuspendedParser_ HSuspendedParser; + /** * Type of an action to apply to an AST, used in the action() parser. * It can be any (user-defined) function that takes a HParseResult* @@ -265,6 +267,27 @@ typedef struct HBenchmarkResults_ { */ HAMMER_FN_DECL(HParseResult*, h_parse, const HParser* parser, const uint8_t* input, size_t length); +/** + * Initialize a parser for iteratively consuming an input stream in chunks. + * This is only supported by some backends. + * + * Result is NULL if not supported by the backend. + */ +HAMMER_FN_DECL(HSuspendedParser*, h_parse_start, const HParser* parser); + +/** + * Run a suspended parser (as returned by h_parse_start) on a chunk of input. + * + * Returns true if the parser is done (needs no more input). + */ +bool h_parse_chunk(HSuspendedParser* s, const uint8_t* input, size_t length); + +/** + * Finish an iterative parse. Signals the end of input to the backend and + * returns the parse result. + */ +HParseResult* h_parse_finish(HSuspendedParser* s); + /** * Given a string, returns a parser that parses that string value. * diff --git a/src/internal.h b/src/internal.h index 9aac4ee..8c79976 100644 --- a/src/internal.h +++ b/src/internal.h @@ -210,10 +210,28 @@ struct HParseState_ { HSlist *symbol_table; // its contents are HHashTables }; +struct HSuspendedParser_ { + HAllocator *mm__; + const HParser *parser; + void *backend_state; + + // the only part of HInputStream that carries across chunks + uint8_t endianness; +}; + typedef struct HParserBackendVTable_ { int (*compile)(HAllocator *mm__, HParser* parser, const void* params); HParseResult* (*parse)(HAllocator *mm__, const HParser* parser, HInputStream* stream); void (*free)(HParser* parser); + + void (*parse_start)(HSuspendedParser *s); + // parse_start should allocate backend_state. + void (*parse_chunk)(HSuspendedParser *s, HInputStream *input); + // when parse_chunk leaves input.overrun unset, parse is done. else: + // parse_chunk MUST consume all input, integrating it into backend_state. + // calling parse_chunk again after parse is done should have no effect. + HParseResult *(*parse_finish)(HSuspendedParser *s); + // parse_finish must free backend_state. } HParserBackendVTable; From f1d6d0bc5efeb247b60af69200b905a733028d84 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Thu, 3 Sep 2015 16:24:47 +0200 Subject: [PATCH 02/21] split h_llk_parse into start/chunk/finish internally --- src/backends/llk.c | 117 ++++++++++++++++++++++++++++++++++----------- src/internal.h | 6 +-- 2 files changed, 92 insertions(+), 31 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index afccb74..af75594 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -259,44 +259,74 @@ void h_llk_free(HParser *parser) /* LL(k) driver */ -HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream) +typedef struct { + HArena *arena; // will hold the results + HArena *tarena; // tmp, deleted after parse + HSlist *stack; + HCountedArray *seq; // accumulates current parse result +} HLLkState; + +// in order to construct the parse tree, we delimit the symbol stack into +// frames corresponding to production right-hand sides. since only left-most +// derivations are produced this linearization is unique. +// the 'mark' allocated below simply reserves a memory address to use as the +// frame delimiter. +// nonterminals, instead of being popped and forgotten, are put back onto the +// stack below the mark to tell us which validations and semantic actions to +// execute on their corresponding result. +// also on the stack below the mark, we store the previously accumulated +// value for the surrounding production. +static int dummy; +static void *MARK = &dummy; // stack frame delimiter + +static HLLkState *llk_parse_start_(HAllocator* mm__, const HParser* parser) { const HLLkTable *table = parser->backend_data; assert(table != NULL); - HArena *arena = h_new_arena(mm__, 0); // will hold the results - HArena *tarena = h_new_arena(mm__, 0); // tmp, deleted after parse - HSlist *stack = h_slist_new(tarena); - HCountedArray *seq = h_carray_new(arena); // accumulates current parse result - - // in order to construct the parse tree, we delimit the symbol stack into - // frames corresponding to production right-hand sides. since only left-most - // derivations are produced this linearization is unique. - // the 'mark' allocated below simply reserves a memory address to use as the - // frame delimiter. - // nonterminals, instead of being popped and forgotten, are put back onto the - // stack below the mark to tell us which validations and semantic actions to - // execute on their corresponding result. - // also on the stack below the mark, we store the previously accumulated - // value for the surrounding production. - void *mark = h_arena_malloc(tarena, 1); + HLLkState *s = h_new(HLLkState, 1); + s->arena = h_new_arena(mm__, 0); + s->tarena = h_new_arena(mm__, 0); + s->stack = h_slist_new(s->tarena); + s->seq = h_carray_new(s->arena); // initialize with the start symbol on the stack. - h_slist_push(stack, table->start); + h_slist_push(s->stack, table->start); + + return s; +} + +// returns partial result or NULL +static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, + HInputStream* stream, bool last_chunk) +{ + HParsedToken *tok = NULL; // will hold result token + HCFChoice *x = NULL; // current symbol (from top of stack) + + const HLLkTable *table = parser->backend_data; + assert(table != NULL); + + HArena *arena = s->arena; + HArena *tarena = s->tarena; + HSlist *stack = s->stack; + HCountedArray *seq = s->seq; + + if(!seq) + return NULL; // parse already failed // when we empty the stack, the parse is complete. while(!h_slist_empty(stack)) { // pop top of stack for inspection - HCFChoice *x = h_slist_pop(stack); + x = h_slist_pop(stack); assert(x != NULL); - if(x != mark && x->type == HCF_CHOICE) { + if(x != MARK && x->type == HCF_CHOICE) { // x is a nonterminal; apply the appropriate production and continue // push stack frame h_slist_push(stack, seq); // save current partial value h_slist_push(stack, x); // save the nonterminal - h_slist_push(stack, mark); // frame delimiter + h_slist_push(stack, MARK); // frame delimiter // open a fresh result sequence seq = h_carray_new(arena); @@ -319,11 +349,10 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* } // the top of stack is such that there will be a result... - HParsedToken *tok; // will hold result token tok = h_arena_malloc(arena, sizeof(HParsedToken)); tok->index = stream->index; tok->bit_offset = stream->bit_offset; - if(x == mark) { + if(x == MARK) { // hit stack frame boundary... // wrap the accumulated parse result, this sequence is finished tok->token_type = TT_SEQUENCE; @@ -344,13 +373,15 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* case HCF_END: if(!stream->overrun) goto no_parse; + if(!last_chunk) + goto need_input; h_arena_free(arena, tok); tok = NULL; break; case HCF_CHAR: if(stream->overrun) - goto no_parse; + goto need_input; if(input != x->chr) goto no_parse; tok->token_type = TT_UINT; @@ -359,7 +390,7 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* case HCF_CHARSET: if(stream->overrun) - goto no_parse; + goto need_input; if(!charset_isset(x->charset, input)) goto no_parse; tok->token_type = TT_UINT; @@ -388,16 +419,46 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* h_carray_append(seq, tok); } + // success // since we started with a single nonterminal on the stack, seq should // contain exactly the parse result. assert(seq->used == 1); - h_delete_arena(tarena); - return make_result(arena, seq->elements[0]); + return seq; no_parse: - h_delete_arena(tarena); h_delete_arena(arena); + s->arena = NULL; return NULL; + + need_input: + if(last_chunk) + goto no_parse; + h_arena_free(arena, tok); // no result, yet + h_slist_push(stack, x); // try this symbol again next time + return seq; +} + +static HParseResult *llk_parse_finish_(HAllocator *mm__, HLLkState *s) +{ + HParseResult *res = NULL; + + if(s->seq) { + assert(s->seq->used == 1); + res = make_result(s->arena, s->seq->elements[0]); + } + + h_delete_arena(s->tarena); + h_free(s); + return res; +} + +HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream) +{ + HLLkState *s = llk_parse_start_(mm__, parser); + + s->seq = llk_parse_chunk_(s, parser, stream, true /* last chunk */); + + return llk_parse_finish_(mm__, s); } diff --git a/src/internal.h b/src/internal.h index 8c79976..fa78181 100644 --- a/src/internal.h +++ b/src/internal.h @@ -225,13 +225,13 @@ typedef struct HParserBackendVTable_ { void (*free)(HParser* parser); void (*parse_start)(HSuspendedParser *s); - // parse_start should allocate backend_state. + // parse_start should allocate s->backend_state. void (*parse_chunk)(HSuspendedParser *s, HInputStream *input); // when parse_chunk leaves input.overrun unset, parse is done. else: - // parse_chunk MUST consume all input, integrating it into backend_state. + // parse_chunk MUST consume all input, integrating it into s->backend_state. // calling parse_chunk again after parse is done should have no effect. HParseResult *(*parse_finish)(HSuspendedParser *s); - // parse_finish must free backend_state. + // parse_finish must free s->backend_state. } HParserBackendVTable; From 0231dc141e6aff819bcc13dd78416414130e178d Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Thu, 3 Sep 2015 19:18:07 +0200 Subject: [PATCH 03/21] add iterative API to LL(k) backend --- src/backends/llk.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index af75594..2b8d35b 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -461,12 +461,45 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* return llk_parse_finish_(mm__, s); } +void h_llk_parse_start(HSuspendedParser *s) +{ + s->backend_state = llk_parse_start_(s->mm__, s->parser); +} + +void h_llk_parse_chunk(HSuspendedParser *s, HInputStream *input) +{ + HLLkState *state = s->backend_state; + + state->seq = llk_parse_chunk_(state, s->parser, input, false); +} + +HParseResult *h_llk_parse_finish(HSuspendedParser *s) +{ + HLLkState *state = s->backend_state; + HInputStream empty = { + .index = 0, + .bit_offset = 0, + .overrun = 0, + .endianness = s->endianness, + .length = 0, + .input = NULL + }; + + // signal end of input (no-op parse already done) + state->seq = llk_parse_chunk_(state, s->parser, &empty, true); + + return llk_parse_finish_(s->mm__, s->backend_state); +} HParserBackendVTable h__llk_backend_vtable = { .compile = h_llk_compile, .parse = h_llk_parse, - .free = h_llk_free + .free = h_llk_free, + + .parse_start = h_llk_parse_start, + .parse_chunk = h_llk_parse_chunk, + .parse_finish = h_llk_parse_finish }; From 10fde548adc87e7530bbce267307adcab13b26c8 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Thu, 3 Sep 2015 19:58:39 +0200 Subject: [PATCH 04/21] add a test for iterative parsing --- src/t_parser.c | 32 ++++++++++++++++++++++++++++++++ src/test_suite.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/src/t_parser.c b/src/t_parser.c index df9567e..6ce4845 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -443,6 +443,37 @@ static void test_rightrec(gconstpointer backend) { g_check_parse_match(rr_, (HParserBackend)GPOINTER_TO_INT(backend), "aaa", 3, "(u0x61 (u0x61 (u0x61)))"); } +static void test_iterative(gconstpointer backend) { + HParserBackend be = (HParserBackend)GPOINTER_TO_INT(backend); + HParser *p; + + p = h_token((uint8_t*)"foobar", 6); + g_check_parse_chunks_match(p, be, "foo",3, "bar",3, "<66.6f.6f.62.61.72>"); + g_check_parse_chunks_match(p, be, "foo",3, "barbaz",6, "<66.6f.6f.62.61.72>"); + g_check_parse_chunks_failed(p, be, "fou",3, "bar",3); + g_check_parse_chunks_failed(p, be, "foo",3, "par",3); + g_check_parse_chunks_failed(p, be, "foo",3, "baz",3); + + p = h_sequence(h_ch('f'), h_token((uint8_t*)"ooba", 4), h_ch('r'), NULL); + g_check_parse_chunks_match(p, be, "foo",3, "bar",3, "(u0x66 <6f.6f.62.61> u0x72)"); + g_check_parse_chunks_match(p, be, "foo",3, "barbaz",6, "(u0x66 <6f.6f.62.61> u0x72)"); + g_check_parse_chunks_failed(p, be, "fou",3, "bar",3); + g_check_parse_chunks_failed(p, be, "foo",3, "par",3); + g_check_parse_chunks_failed(p, be, "foo",3, "baz",3); + + p = h_choice(h_token((uint8_t*)"foobar", 6), + h_token((uint8_t*)"foopar", 6), NULL); + g_check_parse_chunks_match(p, be, "foo",3, "bar",3, "<66.6f.6f.62.61.72>"); + g_check_parse_chunks_match(p, be, "foo",3, "barbaz",6, "<66.6f.6f.62.61.72>"); + g_check_parse_chunks_match(p, be, "foo",3, "par",3, "<66.6f.6f.70.61.72>"); + g_check_parse_chunks_failed(p, be, "fou",3, "bar",3); + g_check_parse_chunks_failed(p, be, "foo",3, "baz",3); + g_check_parse_chunks_match(p, be, "foobar",6, "",0, "<66.6f.6f.62.61.72>"); + g_check_parse_chunks_match(p, be, "",0, "foobar",6, "<66.6f.6f.62.61.72>"); + g_check_parse_chunks_failed(p, be, "foo",3, "",0); + g_check_parse_chunks_failed(p, be, "",0, "foo",3); +} + static void test_ambiguous(gconstpointer backend) { HParser *d_ = h_ch('d'); HParser *p_ = h_ch('+'); @@ -691,6 +722,7 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/llk/ignore", GINT_TO_POINTER(PB_LLk), test_ignore); //g_test_add_data_func("/core/parser/llk/leftrec", GINT_TO_POINTER(PB_LLk), test_leftrec); g_test_add_data_func("/core/parser/llk/rightrec", GINT_TO_POINTER(PB_LLk), test_rightrec); + g_test_add_data_func("/core/parser/llk/iterative", GINT_TO_POINTER(PB_LLk), test_iterative); g_test_add_data_func("/core/parser/regex/token", GINT_TO_POINTER(PB_REGULAR), test_token); g_test_add_data_func("/core/parser/regex/ch", GINT_TO_POINTER(PB_REGULAR), test_ch); diff --git a/src/test_suite.h b/src/test_suite.h index 82fe495..37f8f04 100644 --- a/src/test_suite.h +++ b/src/test_suite.h @@ -145,6 +145,49 @@ } \ } while(0) +#define g_check_parse_chunks_failed(parser, backend, chunk1, c1_len, chunk2, c2_len) do { \ + int skip = h_compile((HParser *)(parser), (HParserBackend)backend, NULL); \ + HSuspendedParser *s = h_parse_start(parser); \ + if(skip || !s) { \ + g_test_message("Backend not applicable, skipping test"); \ + break; \ + } \ + h_parse_chunk(s, (const uint8_t*)chunk1, c1_len); \ + h_parse_chunk(s, (const uint8_t*)chunk2, c2_len); \ + const HParseResult *res = h_parse_finish(s); \ + if (NULL != res) { \ + g_test_message("Check failed: shouldn't have succeeded, but did"); \ + g_test_fail(); \ + } \ + } while(0) + +#define g_check_parse_chunks_match(parser, backend, chunk1, c1_len, chunk2, c2_len, result) do { \ + int skip = h_compile((HParser *)(parser), (HParserBackend) backend, NULL); \ + HSuspendedParser *s = h_parse_start(parser); \ + if(skip || !s) { \ + g_test_message("Backend not applicable, skipping test"); \ + break; \ + } \ + h_parse_chunk(s, (const uint8_t*)chunk1, c1_len); \ + h_parse_chunk(s, (const uint8_t*)chunk2, c2_len); \ + HParseResult *res = h_parse_finish(s); \ + if (!res) { \ + g_test_message("Parse failed on line %d", __LINE__); \ + g_test_fail(); \ + } else { \ + char* cres = h_write_result_unamb(res->ast); \ + g_check_string(cres, ==, result); \ + (&system_allocator)->free(&system_allocator, cres); \ + HArenaStats stats; \ + h_allocator_stats(res->arena, &stats); \ + g_test_message("Parse used %zd bytes, wasted %zd bytes. " \ + "Inefficiency: %5f%%", \ + stats.used, stats.wasted, \ + stats.wasted * 100. / (stats.used+stats.wasted)); \ + h_delete_arena(res->arena); \ + } \ + } while(0) + #define g_check_hashtable_present(table, key) do { \ if(!h_hashtable_present(table, key)) { \ g_test_message("Check failed: key should have been in table, but wasn't"); \ From 932c9717d7d83f4e1ce2fd51324380018d666e4e Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 4 Sep 2015 12:34:34 +0200 Subject: [PATCH 05/21] oops, forgot about the lookahead --- src/t_parser.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/t_parser.c b/src/t_parser.c index 6ce4845..1fb3220 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -462,16 +462,24 @@ static void test_iterative(gconstpointer backend) { g_check_parse_chunks_failed(p, be, "foo",3, "baz",3); p = h_choice(h_token((uint8_t*)"foobar", 6), - h_token((uint8_t*)"foopar", 6), NULL); + h_token((uint8_t*)"phupar", 6), NULL); g_check_parse_chunks_match(p, be, "foo",3, "bar",3, "<66.6f.6f.62.61.72>"); g_check_parse_chunks_match(p, be, "foo",3, "barbaz",6, "<66.6f.6f.62.61.72>"); - g_check_parse_chunks_match(p, be, "foo",3, "par",3, "<66.6f.6f.70.61.72>"); + g_check_parse_chunks_match(p, be, "phu",3, "par",3, "<70.68.75.70.61.72>"); g_check_parse_chunks_failed(p, be, "fou",3, "bar",3); g_check_parse_chunks_failed(p, be, "foo",3, "baz",3); g_check_parse_chunks_match(p, be, "foobar",6, "",0, "<66.6f.6f.62.61.72>"); g_check_parse_chunks_match(p, be, "",0, "foobar",6, "<66.6f.6f.62.61.72>"); g_check_parse_chunks_failed(p, be, "foo",3, "",0); g_check_parse_chunks_failed(p, be, "",0, "foo",3); + + p = h_sequence(h_ch('f'), h_choice(h_token((uint8_t*)"oo", 2), + h_token((uint8_t*)"uu", 2), NULL), NULL); + g_check_parse_chunks_match(p, be, "f",1, "oo",2, "(u0x66 <6f.6f>)"); + g_check_parse_chunks_match(p, be, "f",1, "uu",2, "(u0x66 <75.75>)"); + g_check_parse_chunks_failed(p, be, "g",1, "oo",2); + g_check_parse_chunks_failed(p, be, "f",1, "ou",2); + g_check_parse_chunks_failed(p, be, "f",1, "uo",2); } static void test_ambiguous(gconstpointer backend) { From d3fd0da7b7bd8d0ab9ad6a8ac43b194895d7b261 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 4 Sep 2015 12:48:10 +0200 Subject: [PATCH 06/21] don't skip tests on compile failure anymore --- src/test_suite.h | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/src/test_suite.h b/src/test_suite.h index 37f8f04..76f7e87 100644 --- a/src/test_suite.h +++ b/src/test_suite.h @@ -90,7 +90,8 @@ #define g_check_parse_failed(parser, backend, input, inp_len) do { \ int skip = h_compile((HParser *)(parser), (HParserBackend)backend, NULL); \ if(skip != 0) { \ - g_test_message("Backend not applicable, skipping test"); \ + g_test_message("Compile failed"); \ + g_test_fail(); \ break; \ } \ const HParseResult *result = h_parse(parser, (const uint8_t*)input, inp_len); \ @@ -103,7 +104,8 @@ #define g_check_parse_ok(parser, backend, input, inp_len) do { \ int skip = h_compile((HParser *)(parser), (HParserBackend) backend, NULL); \ if(skip) { \ - g_test_message("Backend not applicable, skipping test"); \ + g_test_message("Compile failed"); \ + g_test_fail(); \ break; \ } \ HParseResult *res = h_parse(parser, (const uint8_t*)input, inp_len); \ @@ -124,7 +126,8 @@ #define g_check_parse_match(parser, backend, input, inp_len, result) do { \ int skip = h_compile((HParser *)(parser), (HParserBackend) backend, NULL); \ if(skip) { \ - g_test_message("Backend not applicable, skipping test"); \ + g_test_message("Compile failed"); \ + g_test_fail(); \ break; \ } \ HParseResult *res = h_parse(parser, (const uint8_t*)input, inp_len); \ @@ -147,9 +150,15 @@ #define g_check_parse_chunks_failed(parser, backend, chunk1, c1_len, chunk2, c2_len) do { \ int skip = h_compile((HParser *)(parser), (HParserBackend)backend, NULL); \ + if(skip) { \ + g_test_message("Compile failed"); \ + g_test_fail(); \ + break; \ + } \ HSuspendedParser *s = h_parse_start(parser); \ - if(skip || !s) { \ - g_test_message("Backend not applicable, skipping test"); \ + if(!s) { \ + g_test_message("Chunk-wise parsing not available"); \ + g_test_fail(); \ break; \ } \ h_parse_chunk(s, (const uint8_t*)chunk1, c1_len); \ @@ -163,9 +172,15 @@ #define g_check_parse_chunks_match(parser, backend, chunk1, c1_len, chunk2, c2_len, result) do { \ int skip = h_compile((HParser *)(parser), (HParserBackend) backend, NULL); \ + if(skip) { \ + g_test_message("Compile failed"); \ + g_test_fail(); \ + break; \ + } \ HSuspendedParser *s = h_parse_start(parser); \ - if(skip || !s) { \ - g_test_message("Backend not applicable, skipping test"); \ + if(!s) { \ + g_test_message("Chunk-wise parsing not available"); \ + g_test_fail(); \ break; \ } \ h_parse_chunk(s, (const uint8_t*)chunk1, c1_len); \ From 28fa93d4cc52739a1085fb931dd095333f4dc8af Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 4 Sep 2015 12:54:00 +0200 Subject: [PATCH 07/21] make overrun flag a bool to match its usage --- src/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/internal.h b/src/internal.h index fa78181..8c04774 100644 --- a/src/internal.h +++ b/src/internal.h @@ -78,7 +78,7 @@ typedef struct HInputStream_ { char margin; // The number of bits on the end that is being read // towards that should be ignored. char endianness; - char overrun; + bool overrun; } HInputStream; typedef struct HSlistNode_ { From d4f933b2d3fe0664041353f9b8c7a1a978fe3a9d Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 4 Sep 2015 12:55:37 +0200 Subject: [PATCH 08/21] move last_chunk flag into HInputStream --- src/backends/llk.c | 16 +++++++++------- src/hammer.c | 6 ++++-- src/internal.h | 1 + 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index 2b8d35b..9acf67e 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -298,7 +298,7 @@ static HLLkState *llk_parse_start_(HAllocator* mm__, const HParser* parser) // returns partial result or NULL static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, - HInputStream* stream, bool last_chunk) + HInputStream* stream) { HParsedToken *tok = NULL; // will hold result token HCFChoice *x = NULL; // current symbol (from top of stack) @@ -373,7 +373,7 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, case HCF_END: if(!stream->overrun) goto no_parse; - if(!last_chunk) + if(!stream->last_chunk) goto need_input; h_arena_free(arena, tok); tok = NULL; @@ -431,7 +431,7 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, return NULL; need_input: - if(last_chunk) + if(stream->last_chunk) goto no_parse; h_arena_free(arena, tok); // no result, yet h_slist_push(stack, x); // try this symbol again next time @@ -456,7 +456,8 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* { HLLkState *s = llk_parse_start_(mm__, parser); - s->seq = llk_parse_chunk_(s, parser, stream, true /* last chunk */); + assert(stream->last_chunk); + s->seq = llk_parse_chunk_(s, parser, stream); return llk_parse_finish_(mm__, s); } @@ -470,7 +471,7 @@ void h_llk_parse_chunk(HSuspendedParser *s, HInputStream *input) { HLLkState *state = s->backend_state; - state->seq = llk_parse_chunk_(state, s->parser, input, false); + state->seq = llk_parse_chunk_(state, s->parser, input); } HParseResult *h_llk_parse_finish(HSuspendedParser *s) @@ -482,11 +483,12 @@ HParseResult *h_llk_parse_finish(HSuspendedParser *s) .overrun = 0, .endianness = s->endianness, .length = 0, - .input = NULL + .input = NULL, + .last_chunk = true }; // signal end of input (no-op parse already done) - state->seq = llk_parse_chunk_(state, s->parser, &empty, true); + state->seq = llk_parse_chunk_(state, s->parser, &empty); return llk_parse_finish_(s->mm__, s->backend_state); } diff --git a/src/hammer.c b/src/hammer.c index 991008d..3422422 100644 --- a/src/hammer.c +++ b/src/hammer.c @@ -56,7 +56,8 @@ HParseResult* h_parse__m(HAllocator* mm__, const HParser* parser, const uint8_t* .overrun = 0, .endianness = DEFAULT_ENDIANNESS, .length = length, - .input = input + .input = input, + .last_chunk = true }; return backends[parser->backend]->parse(mm__, parser, &input_stream); @@ -132,7 +133,8 @@ bool h_parse_chunk(HSuspendedParser* s, const uint8_t* input, size_t length) { .overrun = 0, .endianness = s->endianness, .length = length, - .input = input + .input = input, + .last_chunk = false }; // process chunk diff --git a/src/internal.h b/src/internal.h index 8c04774..b81b50c 100644 --- a/src/internal.h +++ b/src/internal.h @@ -79,6 +79,7 @@ typedef struct HInputStream_ { // towards that should be ignored. char endianness; bool overrun; + bool last_chunk; } HInputStream; typedef struct HSlistNode_ { From 010a1a36ffba7df2814b68c213c71623d4b09ed2 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 4 Sep 2015 16:48:01 +0200 Subject: [PATCH 09/21] int_range test does not work for regex backend --- src/t_parser.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/t_parser.c b/src/t_parser.c index 1fb3220..4b2d5cc 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -743,8 +743,8 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/regex/uint32", GINT_TO_POINTER(PB_REGULAR), test_uint32); g_test_add_data_func("/core/parser/regex/uint16", GINT_TO_POINTER(PB_REGULAR), test_uint16); g_test_add_data_func("/core/parser/regex/uint8", GINT_TO_POINTER(PB_REGULAR), test_uint8); - g_test_add_data_func("/core/parser/regex/int_range", GINT_TO_POINTER(PB_REGULAR), test_int_range); #if 0 + g_test_add_data_func("/core/parser/regex/int_range", GINT_TO_POINTER(PB_REGULAR), test_int_range); g_test_add_data_func("/core/parser/regex/float64", GINT_TO_POINTER(PB_REGULAR), test_float64); g_test_add_data_func("/core/parser/regex/float32", GINT_TO_POINTER(PB_REGULAR), test_float32); #endif From 127600425054788c121fa4be3831cf09d2c636d5 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 4 Sep 2015 21:05:56 +0200 Subject: [PATCH 10/21] handle suspend on lookahead at the very end of the chunk --- src/backends/llk.c | 27 ++++++++++++++++----------- src/cfgrammar.c | 11 ++++++++--- src/cfgrammar.h | 3 +++ 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index 9acf67e..9528997 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -296,7 +296,7 @@ static HLLkState *llk_parse_start_(HAllocator* mm__, const HParser* parser) return s; } -// returns partial result or NULL +// returns partial result or NULL (no parse) static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, HInputStream* stream) { @@ -316,6 +316,8 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, // when we empty the stack, the parse is complete. while(!h_slist_empty(stack)) { + tok = NULL; + // pop top of stack for inspection x = h_slist_pop(stack); assert(x != NULL); @@ -323,6 +325,16 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, if(x != MARK && x->type == HCF_CHOICE) { // x is a nonterminal; apply the appropriate production and continue + // look up applicable production in parse table + const HCFSequence *p = h_llk_lookup(table, x, stream); + if(p == NULL) + goto no_parse; + if(p == H_NEED_INPUT) + goto need_input; + + // an infinite loop case that shouldn't happen + assert(!p->items[0] || p->items[0] != x); + // push stack frame h_slist_push(stack, seq); // save current partial value h_slist_push(stack, x); // save the nonterminal @@ -331,14 +343,6 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, // open a fresh result sequence seq = h_carray_new(arena); - // look up applicable production in parse table - const HCFSequence *p = h_llk_lookup(table, x, stream); - if(p == NULL) - goto no_parse; - - // an infinite loop case that shouldn't happen - assert(!p->items[0] || p->items[0] != x); - // push production's rhs onto the stack (in reverse order) HCFChoice **s; for(s = p->items; *s; s++); @@ -433,8 +437,9 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, need_input: if(stream->last_chunk) goto no_parse; - h_arena_free(arena, tok); // no result, yet - h_slist_push(stack, x); // try this symbol again next time + if(tok) + h_arena_free(arena, tok); // no result, yet + h_slist_push(stack, x); // try this symbol again next time return seq; } diff --git a/src/cfgrammar.c b/src/cfgrammar.c index a8761b8..117009a 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -349,6 +349,7 @@ void *h_stringmap_get(const HStringMap *m, const uint8_t *str, size_t n, bool en return m->epsilon_branch; } +// A NULL result means no parse. H_NEED_INPUT means lookahead is too short. void *h_stringmap_get_lookahead(const HStringMap *m, HInputStream lookahead) { while(m) { @@ -362,9 +363,13 @@ void *h_stringmap_get_lookahead(const HStringMap *m, HInputStream lookahead) // reading bits from it does not consume them from the real input. uint8_t c = h_read_bits(&lookahead, 8, false); - if (lookahead.overrun) { // end of input - // XXX assumption of byte-wise grammar and input - return m->end_branch; + if (lookahead.overrun) { // end of chunk + if (lookahead.last_chunk) { // end of input + // XXX assumption of byte-wise grammar and input + return m->end_branch; + } else { + return H_NEED_INPUT; + } } // no match yet, descend diff --git a/src/cfgrammar.h b/src/cfgrammar.h index 9cefc62..2294d44 100644 --- a/src/cfgrammar.h +++ b/src/cfgrammar.h @@ -56,6 +56,9 @@ bool h_stringmap_empty(const HStringMap *m); static inline HStringMap *h_stringmap_get_char(const HStringMap *m, const uint8_t c) { return h_hashtable_get(m->char_branches, (void *)char_key(c)); } +// dummy return value used by h_stringmap_get_lookahead when out of input +#define H_NEED_INPUT ((void *)&h_stringmap_get_lookahead) + /* Convert 'parser' into CFG representation by desugaring and compiling the set * of nonterminals. From 90b6f30fa1dc47296717dc7548ccb10cbb40a62f Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Fri, 4 Sep 2015 21:06:50 +0200 Subject: [PATCH 11/21] add a test that needs to suspend on lookahead in the middle of a chunk --- src/t_parser.c | 22 ++++++++++++++++++++++ src/test_suite.h | 8 ++++++++ 2 files changed, 30 insertions(+) diff --git a/src/t_parser.c b/src/t_parser.c index 4b2d5cc..666bc85 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -482,6 +482,27 @@ static void test_iterative(gconstpointer backend) { g_check_parse_chunks_failed(p, be, "f",1, "uo",2); } +static void test_iterative_lookahead(gconstpointer backend) { + HParserBackend be = (HParserBackend)GPOINTER_TO_INT(backend); + HParser *p; + + // needs 2 lookahead + p = h_sequence(h_ch('f'), h_choice(h_token((uint8_t*)"oo", 2), + h_token((uint8_t*)"ou", 2), NULL), NULL); + if(h_compile(p, be, (void *)2) != 0) { + g_test_message("Compile failed"); + g_test_fail(); + return; + } + + // partial chunk consumed + g_check_parse_chunks_match_(p, "fo",2, "o",1, "(u0x66 <6f.6f>)"); + g_check_parse_chunks_match_(p, "fo",2, "u",1, "(u0x66 <6f.75>)"); + g_check_parse_chunks_failed_(p, "go",2, "o",1); + g_check_parse_chunks_failed_(p, "fa",2, "u",1); + g_check_parse_chunks_failed_(p, "fo",2, "b",1); +} + static void test_ambiguous(gconstpointer backend) { HParser *d_ = h_ch('d'); HParser *p_ = h_ch('+'); @@ -731,6 +752,7 @@ void register_parser_tests(void) { //g_test_add_data_func("/core/parser/llk/leftrec", GINT_TO_POINTER(PB_LLk), test_leftrec); g_test_add_data_func("/core/parser/llk/rightrec", GINT_TO_POINTER(PB_LLk), test_rightrec); g_test_add_data_func("/core/parser/llk/iterative", GINT_TO_POINTER(PB_LLk), test_iterative); + g_test_add_data_func("/core/parser/llk/iterative/lookahead", GINT_TO_POINTER(PB_LLk), test_iterative_lookahead); g_test_add_data_func("/core/parser/regex/token", GINT_TO_POINTER(PB_REGULAR), test_token); g_test_add_data_func("/core/parser/regex/ch", GINT_TO_POINTER(PB_REGULAR), test_ch); diff --git a/src/test_suite.h b/src/test_suite.h index 76f7e87..49f13cf 100644 --- a/src/test_suite.h +++ b/src/test_suite.h @@ -155,6 +155,10 @@ g_test_fail(); \ break; \ } \ + g_check_parse_chunks_failed_(parser, chunk1, c1_len, chunk2, c2_len); \ + } while(0) + +#define g_check_parse_chunks_failed_(parser, chunk1, c1_len, chunk2, c2_len) do { \ HSuspendedParser *s = h_parse_start(parser); \ if(!s) { \ g_test_message("Chunk-wise parsing not available"); \ @@ -177,6 +181,10 @@ g_test_fail(); \ break; \ } \ + g_check_parse_chunks_match_(parser, chunk1, c1_len, chunk2, c2_len, result); \ + } while(0) + +#define g_check_parse_chunks_match_(parser, chunk1, c1_len, chunk2, c2_len, result) do { \ HSuspendedParser *s = h_parse_start(parser); \ if(!s) { \ g_test_message("Chunk-wise parsing not available"); \ From 8995097a1d5fc9ce837252ec2829125311cc3236 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Sat, 5 Sep 2015 22:43:49 +0200 Subject: [PATCH 12/21] let's just use ((void *)-1) as another invalid pointer result --- src/backends/llk.c | 5 ++--- src/cfgrammar.c | 4 ++-- src/cfgrammar.h | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index 9528997..b095998 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -276,8 +276,7 @@ typedef struct { // execute on their corresponding result. // also on the stack below the mark, we store the previously accumulated // value for the surrounding production. -static int dummy; -static void *MARK = &dummy; // stack frame delimiter +static void *MARK = (void *)-1; // stack frame delimiter static HLLkState *llk_parse_start_(HAllocator* mm__, const HParser* parser) { @@ -329,7 +328,7 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, const HCFSequence *p = h_llk_lookup(table, x, stream); if(p == NULL) goto no_parse; - if(p == H_NEED_INPUT) + if(p == NEED_INPUT) goto need_input; // an infinite loop case that shouldn't happen diff --git a/src/cfgrammar.c b/src/cfgrammar.c index 117009a..a7601e1 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -349,7 +349,7 @@ void *h_stringmap_get(const HStringMap *m, const uint8_t *str, size_t n, bool en return m->epsilon_branch; } -// A NULL result means no parse. H_NEED_INPUT means lookahead is too short. +// A NULL result means no parse. NEED_INPUT means lookahead is too short. void *h_stringmap_get_lookahead(const HStringMap *m, HInputStream lookahead) { while(m) { @@ -368,7 +368,7 @@ void *h_stringmap_get_lookahead(const HStringMap *m, HInputStream lookahead) // XXX assumption of byte-wise grammar and input return m->end_branch; } else { - return H_NEED_INPUT; + return NEED_INPUT; } } diff --git a/src/cfgrammar.h b/src/cfgrammar.h index 2294d44..1e18442 100644 --- a/src/cfgrammar.h +++ b/src/cfgrammar.h @@ -57,7 +57,7 @@ static inline HStringMap *h_stringmap_get_char(const HStringMap *m, const uint8_ { return h_hashtable_get(m->char_branches, (void *)char_key(c)); } // dummy return value used by h_stringmap_get_lookahead when out of input -#define H_NEED_INPUT ((void *)&h_stringmap_get_lookahead) +#define NEED_INPUT ((void *)-1) /* Convert 'parser' into CFG representation by desugaring and compiling the set From 73e92f04df78dff746950376f92d9711237e406c Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Sun, 6 Sep 2015 14:48:41 +0200 Subject: [PATCH 13/21] save kmax in HLLkTable --- src/backends/llk.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/backends/llk.c b/src/backends/llk.c index b095998..f5ab7bb 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -12,6 +12,7 @@ static const size_t DEFAULT_KMAX = 1; * maps lookahead strings to productions (HCFSequence). */ typedef struct HLLkTable_ { + size_t kmax; HHashTable *rows; HCFChoice *start; // start symbol HArena *arena; @@ -188,6 +189,7 @@ static int fill_table_row(size_t kmax, HCFGrammar *g, HStringMap *row, */ static int fill_table(size_t kmax, HCFGrammar *g, HLLkTable *table) { + table->kmax = kmax; table->start = g->start; // iterate over g->nts From bfb795b093c117510b5bc9d1c532a458400ed7ac Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Sun, 6 Sep 2015 14:48:51 +0200 Subject: [PATCH 14/21] add window buffer to look across chunk boundaries --- src/backends/llk.c | 90 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 3 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index f5ab7bb..1cfaab8 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -266,6 +266,13 @@ typedef struct { HArena *tarena; // tmp, deleted after parse HSlist *stack; HCountedArray *seq; // accumulates current parse result + + uint8_t *buf; // for lookahead across chunk boundaries + // allocated to size 2*kmax + // new chunk starts at index kmax + // ( 0 ... kmax ... 2*kmax-1 ) + // \_old_/\______new_______/ + HInputStream win; // win.length is set to 0 when not in use } HLLkState; // in order to construct the parse tree, we delimit the symbol stack into @@ -290,6 +297,10 @@ static HLLkState *llk_parse_start_(HAllocator* mm__, const HParser* parser) s->tarena = h_new_arena(mm__, 0); s->stack = h_slist_new(s->tarena); s->seq = h_carray_new(s->arena); + s->buf = h_arena_malloc(s->tarena, 2 * table->kmax); + + s->win.input = s->buf; + s->win.length = 0; // unused // initialize with the start symbol on the stack. h_slist_push(s->stack, table->start); @@ -297,6 +308,73 @@ static HLLkState *llk_parse_start_(HAllocator* mm__, const HParser* parser) return s; } +// helper: add new input to the lookahead window +static void append_win(size_t kmax, HLLkState *s, HInputStream *stream) +{ + assert(stream->bit_offset == 0); + assert(s->win.input == s->buf); + assert(s->win.length == kmax); + assert(s->win.index < kmax); + + size_t n = stream->length - stream->index; // bytes to copy + if(n > kmax) + n = kmax; + + memcpy(s->buf + kmax, stream->input + stream->index, n); + s->win.length += n; +} + +// helper: save old input to the lookahead window +static void save_win(size_t kmax, HLLkState *s, HInputStream *stream) +{ + assert(stream->bit_offset == 0); + + size_t len = stream->length - stream->index; + assert(len < kmax); + + if(len == 0) { + // stream empty? nothing to do. + return; + } else if(s->win.length > 0) { + // window active? should contain all of stream. + assert(s->win.length == kmax + len); + assert(s->win.index <= kmax); + + // shift contents down: + // + // (0 kmax ) + // ... \_old_/\_new_/ ... + // + // (0 kmax ) + // ... \_old_/\_new_/ ... + // + len = s->win.length - s->win.index; + memmove(s->buf + kmax - len, s->buf + s->win.index, len); + } else { + // window not active? save stream to window. + memcpy(s->buf + kmax - len, stream->input + stream->index, len); + } + + // metadata + s->win = *stream; + s->win.input = s->buf; + s->win.index = kmax - len; + s->win.length = kmax; +} + +// helper: read from window until old chunk gone, then switch to stream +static uint8_t consume_input(size_t kmax, HLLkState *s, HInputStream *stream) +{ + if(s->win.length > 0) { + uint8_t b = h_read_bits(&s->win, 8, false); + if(s->win.index >= kmax) // old chunk consumed! + s->win.length = 0; // disable the window + return b; + } else { + return h_read_bits(stream, 8, false); + } +} + // returns partial result or NULL (no parse) static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, HInputStream* stream) @@ -315,6 +393,9 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, if(!seq) return NULL; // parse already failed + if(s->win.length > 0) + append_win(table->kmax, s, stream); + // when we empty the stack, the parse is complete. while(!h_slist_empty(stack)) { tok = NULL; @@ -327,11 +408,14 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, // x is a nonterminal; apply the appropriate production and continue // look up applicable production in parse table - const HCFSequence *p = h_llk_lookup(table, x, stream); + HInputStream *lookup_stream = s->win.length > 0 ? &s->win : stream; + const HCFSequence *p = h_llk_lookup(table, x, lookup_stream); if(p == NULL) goto no_parse; - if(p == NEED_INPUT) + if(p == NEED_INPUT) { + save_win(table->kmax, s, stream); goto need_input; + } // an infinite loop case that shouldn't happen assert(!p->items[0] || p->items[0] != x); @@ -372,7 +456,7 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, // x is a terminal or simple charset; match against input // consume the input token - uint8_t input = h_read_bits(stream, 8, false); + uint8_t input = consume_input(table->kmax, s, stream); switch(x->type) { case HCF_END: From 29434869d538211c62712a0c55f7f1068d564671 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Mon, 7 Sep 2015 11:05:06 +0200 Subject: [PATCH 15/21] add result_length test --- src/t_parser.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/src/t_parser.c b/src/t_parser.c index 666bc85..a8bdb0d 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -503,6 +503,55 @@ static void test_iterative_lookahead(gconstpointer backend) { g_check_parse_chunks_failed_(p, "fo",2, "b",1); } +static void test_iterative_result_length(gconstpointer backend) { + HParserBackend be = (HParserBackend)GPOINTER_TO_INT(backend); + HParser *p = h_token((uint8_t*)"foobar", 6); + + if(h_compile(p, be, NULL) != 0) { + g_test_message("Compile failed"); + g_test_fail(); + return; + } + + HSuspendedParser *s = h_parse_start(p); + if(!s) { + g_test_message("Chunked parsing not available"); + g_test_fail(); + return; + } + h_parse_chunk(s, (uint8_t*)"foo", 3); + h_parse_chunk(s, (uint8_t*)"ba", 2); + h_parse_chunk(s, (uint8_t*)"rbaz", 4); + HParseResult *r = h_parse_finish(s); + if(!r) { + g_test_message("Parse failed"); + g_test_fail(); + return; + } + + g_check_cmp_int64(r->bit_length, ==, 48); +} + +static void test_result_length(gconstpointer backend) { + HParserBackend be = (HParserBackend)GPOINTER_TO_INT(backend); + HParser *p = h_token((uint8_t*)"foo", 3); + + if(h_compile(p, be, NULL) != 0) { + g_test_message("Compile failed"); + g_test_fail(); + return; + } + + HParseResult *r = h_parse(p, (uint8_t*)"foobar", 6); + if(!r) { + g_test_message("Parse failed"); + g_test_fail(); + return; + } + + g_check_cmp_int64(r->bit_length, ==, 24); +} + static void test_ambiguous(gconstpointer backend) { HParser *d_ = h_ch('d'); HParser *p_ = h_ch('+'); @@ -713,6 +762,7 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/packrat/putget", GINT_TO_POINTER(PB_PACKRAT), test_put_get); g_test_add_data_func("/core/parser/packrat/permutation", GINT_TO_POINTER(PB_PACKRAT), test_permutation); g_test_add_data_func("/core/parser/packrat/bind", GINT_TO_POINTER(PB_PACKRAT), test_bind); + g_test_add_data_func("/core/parser/packrat/result_length", GINT_TO_POINTER(PB_PACKRAT), test_result_length); g_test_add_data_func("/core/parser/llk/token", GINT_TO_POINTER(PB_LLk), test_token); g_test_add_data_func("/core/parser/llk/ch", GINT_TO_POINTER(PB_LLk), test_ch); @@ -751,8 +801,10 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/llk/ignore", GINT_TO_POINTER(PB_LLk), test_ignore); //g_test_add_data_func("/core/parser/llk/leftrec", GINT_TO_POINTER(PB_LLk), test_leftrec); g_test_add_data_func("/core/parser/llk/rightrec", GINT_TO_POINTER(PB_LLk), test_rightrec); + g_test_add_data_func("/core/parser/llk/result_length", GINT_TO_POINTER(PB_LLk), test_result_length); g_test_add_data_func("/core/parser/llk/iterative", GINT_TO_POINTER(PB_LLk), test_iterative); g_test_add_data_func("/core/parser/llk/iterative/lookahead", GINT_TO_POINTER(PB_LLk), test_iterative_lookahead); + g_test_add_data_func("/core/parser/llk/iterative/result_length", GINT_TO_POINTER(PB_LLk), test_iterative_result_length); g_test_add_data_func("/core/parser/regex/token", GINT_TO_POINTER(PB_REGULAR), test_token); g_test_add_data_func("/core/parser/regex/ch", GINT_TO_POINTER(PB_REGULAR), test_ch); @@ -790,6 +842,7 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/regex/epsilon_p", GINT_TO_POINTER(PB_REGULAR), test_epsilon_p); g_test_add_data_func("/core/parser/regex/attr_bool", GINT_TO_POINTER(PB_REGULAR), test_attr_bool); g_test_add_data_func("/core/parser/regex/ignore", GINT_TO_POINTER(PB_REGULAR), test_ignore); + g_test_add_data_func("/core/parser/regex/result_length", GINT_TO_POINTER(PB_REGULAR), test_result_length); g_test_add_data_func("/core/parser/lalr/token", GINT_TO_POINTER(PB_LALR), test_token); g_test_add_data_func("/core/parser/lalr/ch", GINT_TO_POINTER(PB_LALR), test_ch); @@ -829,6 +882,7 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/lalr/leftrec", GINT_TO_POINTER(PB_LALR), test_leftrec); g_test_add_data_func("/core/parser/lalr/leftrec-ne", GINT_TO_POINTER(PB_LALR), test_leftrec_ne); g_test_add_data_func("/core/parser/lalr/rightrec", GINT_TO_POINTER(PB_LALR), test_rightrec); + g_test_add_data_func("/core/parser/lalr/result_length", GINT_TO_POINTER(PB_LALR), test_result_length); g_test_add_data_func("/core/parser/glr/token", GINT_TO_POINTER(PB_GLR), test_token); g_test_add_data_func("/core/parser/glr/ch", GINT_TO_POINTER(PB_GLR), test_ch); @@ -869,4 +923,5 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/glr/leftrec-ne", GINT_TO_POINTER(PB_GLR), test_leftrec_ne); g_test_add_data_func("/core/parser/glr/rightrec", GINT_TO_POINTER(PB_GLR), test_rightrec); g_test_add_data_func("/core/parser/glr/ambiguous", GINT_TO_POINTER(PB_GLR), test_ambiguous); + g_test_add_data_func("/core/parser/glr/result_length", GINT_TO_POINTER(PB_GLR), test_result_length); } From 4f455aa97e41d51ad49ed43fa853190659797b3b Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Mon, 7 Sep 2015 15:33:50 +0200 Subject: [PATCH 16/21] fix result bit_length for LL(k) --- src/backends/llk.c | 53 ++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index 1cfaab8..ec5f7f7 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -266,6 +266,7 @@ typedef struct { HArena *tarena; // tmp, deleted after parse HSlist *stack; HCountedArray *seq; // accumulates current parse result + size_t index; // input position in bytes uint8_t *buf; // for lookahead across chunk boundaries // allocated to size 2*kmax @@ -297,6 +298,7 @@ static HLLkState *llk_parse_start_(HAllocator* mm__, const HParser* parser) s->tarena = h_new_arena(mm__, 0); s->stack = h_slist_new(s->tarena); s->seq = h_carray_new(s->arena); + s->index = 0; s->buf = h_arena_malloc(s->tarena, 2 * table->kmax); s->win.input = s->buf; @@ -348,10 +350,13 @@ static void save_win(size_t kmax, HLLkState *s, HInputStream *stream) // (0 kmax ) // ... \_old_/\_new_/ ... // + s->index += len; // position of the window shifts up len = s->win.length - s->win.index; + assert(len <= kmax); memmove(s->buf + kmax - len, s->buf + s->win.index, len); } else { // window not active? save stream to window. + s->index -= kmax; // window starts kmax bytes below next chunk memcpy(s->buf + kmax - len, stream->input + stream->index, len); } @@ -362,25 +367,16 @@ static void save_win(size_t kmax, HLLkState *s, HInputStream *stream) s->win.length = kmax; } -// helper: read from window until old chunk gone, then switch to stream -static uint8_t consume_input(size_t kmax, HLLkState *s, HInputStream *stream) -{ - if(s->win.length > 0) { - uint8_t b = h_read_bits(&s->win, 8, false); - if(s->win.index >= kmax) // old chunk consumed! - s->win.length = 0; // disable the window - return b; - } else { - return h_read_bits(stream, 8, false); - } -} - // returns partial result or NULL (no parse) static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, - HInputStream* stream) + HInputStream* chunk) { HParsedToken *tok = NULL; // will hold result token HCFChoice *x = NULL; // current symbol (from top of stack) + HInputStream *stream; + + assert(chunk->index == 0); + assert(chunk->bit_offset == 0); const HLLkTable *table = parser->backend_data; assert(table != NULL); @@ -389,12 +385,17 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, HArena *tarena = s->tarena; HSlist *stack = s->stack; HCountedArray *seq = s->seq; + size_t kmax = table->kmax; if(!seq) return NULL; // parse already failed - if(s->win.length > 0) - append_win(table->kmax, s, stream); + if(s->win.length > 0) { + append_win(kmax, s, chunk); + stream = &s->win; + } else { + stream = chunk; + } // when we empty the stack, the parse is complete. while(!h_slist_empty(stack)) { @@ -408,12 +409,11 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, // x is a nonterminal; apply the appropriate production and continue // look up applicable production in parse table - HInputStream *lookup_stream = s->win.length > 0 ? &s->win : stream; - const HCFSequence *p = h_llk_lookup(table, x, lookup_stream); + const HCFSequence *p = h_llk_lookup(table, x, stream); if(p == NULL) goto no_parse; if(p == NEED_INPUT) { - save_win(table->kmax, s, stream); + save_win(kmax, s, chunk); goto need_input; } @@ -439,7 +439,7 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, // the top of stack is such that there will be a result... tok = h_arena_malloc(arena, sizeof(HParsedToken)); - tok->index = stream->index; + tok->index = s->index + stream->index; tok->bit_offset = stream->bit_offset; if(x == MARK) { // hit stack frame boundary... @@ -456,7 +456,14 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, // x is a terminal or simple charset; match against input // consume the input token - uint8_t input = consume_input(table->kmax, s, stream); + uint8_t input = h_read_bits(stream, 8, false); + + // when old chunk consumed from window, switch to new chunk + if(s->win.length > 0 && s->win.index >= kmax) { + s->win.length = 0; // disable the window + s->index += kmax; // new chunk starts kmax bytes above the window + stream = chunk; + } switch(x->type) { case HCF_END: @@ -512,11 +519,13 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, // since we started with a single nonterminal on the stack, seq should // contain exactly the parse result. assert(seq->used == 1); + s->index += stream->index; return seq; no_parse: h_delete_arena(arena); s->arena = NULL; + s->index += stream->index; return NULL; need_input: @@ -525,6 +534,7 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, if(tok) h_arena_free(arena, tok); // no result, yet h_slist_push(stack, x); // try this symbol again next time + s->index += stream->index; return seq; } @@ -535,6 +545,7 @@ static HParseResult *llk_parse_finish_(HAllocator *mm__, HLLkState *s) if(s->seq) { assert(s->seq->used == 1); res = make_result(s->arena, s->seq->elements[0]); + res->bit_length = s->index*8; } h_delete_arena(s->tarena); From fb5122ec88dc665be4c07fe65d617490026adbe0 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Mon, 7 Sep 2015 15:37:26 +0200 Subject: [PATCH 17/21] fix result bit_length for LR backends --- src/backends/lr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backends/lr.c b/src/backends/lr.c index e7f2377..59c8c90 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -351,7 +351,9 @@ HParseResult *h_lrengine_result(HLREngine *engine) // on top of the stack is the start symbol's semantic value assert(!h_slist_empty(engine->stack)); HParsedToken *tok = engine->stack->head->elem; - return make_result(engine->arena, tok); + HParseResult *res = make_result(engine->arena, tok); + res->bit_length = engine->input.index * 8; + return res; } else { return NULL; } From 5b4550ab0fb6ef28c9baa79db59ec789b998cf47 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Tue, 8 Sep 2015 16:19:45 +0200 Subject: [PATCH 18/21] make MARK a constant that points to itself --- src/backends/llk.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index ec5f7f7..898bfdc 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -286,7 +286,7 @@ typedef struct { // execute on their corresponding result. // also on the stack below the mark, we store the previously accumulated // value for the surrounding production. -static void *MARK = (void *)-1; // stack frame delimiter +static void const * const MARK = &MARK; // stack frame delimiter static HLLkState *llk_parse_start_(HAllocator* mm__, const HParser* parser) { @@ -421,9 +421,9 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, assert(!p->items[0] || p->items[0] != x); // push stack frame - h_slist_push(stack, seq); // save current partial value - h_slist_push(stack, x); // save the nonterminal - h_slist_push(stack, MARK); // frame delimiter + h_slist_push(stack, seq); // save current partial value + h_slist_push(stack, x); // save the nonterminal + h_slist_push(stack, (void *)MARK); // frame delimiter // open a fresh result sequence seq = h_carray_new(arena); From 42d35fb883a164ca3b47928a488b59a766c37b7a Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Wed, 16 Sep 2015 23:25:36 +0200 Subject: [PATCH 19/21] move chunk position into HInputStream and simplify internal chunk API --- src/backends/llk.c | 38 ++++++++++++-------------------------- src/hammer.c | 39 ++++++++++++++++++++++++++++++++++++--- src/internal.h | 13 +++++++++---- 3 files changed, 57 insertions(+), 33 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index 898bfdc..865c30e 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -266,7 +266,6 @@ typedef struct { HArena *tarena; // tmp, deleted after parse HSlist *stack; HCountedArray *seq; // accumulates current parse result - size_t index; // input position in bytes uint8_t *buf; // for lookahead across chunk boundaries // allocated to size 2*kmax @@ -298,7 +297,6 @@ static HLLkState *llk_parse_start_(HAllocator* mm__, const HParser* parser) s->tarena = h_new_arena(mm__, 0); s->stack = h_slist_new(s->tarena); s->seq = h_carray_new(s->arena); - s->index = 0; s->buf = h_arena_malloc(s->tarena, 2 * table->kmax); s->win.input = s->buf; @@ -350,13 +348,14 @@ static void save_win(size_t kmax, HLLkState *s, HInputStream *stream) // (0 kmax ) // ... \_old_/\_new_/ ... // - s->index += len; // position of the window shifts up + s->win.pos += len; // position of the window shifts up len = s->win.length - s->win.index; assert(len <= kmax); memmove(s->buf + kmax - len, s->buf + s->win.index, len); } else { // window not active? save stream to window. - s->index -= kmax; // window starts kmax bytes below next chunk + // buffer starts kmax bytes below chunk boundary + s->win.pos = stream->pos - kmax; memcpy(s->buf + kmax - len, stream->input + stream->index, len); } @@ -439,7 +438,7 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, // the top of stack is such that there will be a result... tok = h_arena_malloc(arena, sizeof(HParsedToken)); - tok->index = s->index + stream->index; + tok->index = stream->pos + stream->index; tok->bit_offset = stream->bit_offset; if(x == MARK) { // hit stack frame boundary... @@ -461,7 +460,6 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, // when old chunk consumed from window, switch to new chunk if(s->win.length > 0 && s->win.index >= kmax) { s->win.length = 0; // disable the window - s->index += kmax; // new chunk starts kmax bytes above the window stream = chunk; } @@ -519,13 +517,11 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, // since we started with a single nonterminal on the stack, seq should // contain exactly the parse result. assert(seq->used == 1); - s->index += stream->index; return seq; no_parse: h_delete_arena(arena); s->arena = NULL; - s->index += stream->index; return NULL; need_input: @@ -534,7 +530,6 @@ static HCountedArray *llk_parse_chunk_(HLLkState *s, const HParser* parser, if(tok) h_arena_free(arena, tok); // no result, yet h_slist_push(stack, x); // try this symbol again next time - s->index += stream->index; return seq; } @@ -545,7 +540,6 @@ static HParseResult *llk_parse_finish_(HAllocator *mm__, HLLkState *s) if(s->seq) { assert(s->seq->used == 1); res = make_result(s->arena, s->seq->elements[0]); - res->bit_length = s->index*8; } h_delete_arena(s->tarena); @@ -560,7 +554,11 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* assert(stream->last_chunk); s->seq = llk_parse_chunk_(s, parser, stream); - return llk_parse_finish_(mm__, s); + HParseResult *res = llk_parse_finish_(mm__, s); + if(res) + res->bit_length = stream->index * 8 + stream->bit_offset; + + return res; } void h_llk_parse_start(HSuspendedParser *s) @@ -568,29 +566,17 @@ void h_llk_parse_start(HSuspendedParser *s) s->backend_state = llk_parse_start_(s->mm__, s->parser); } -void h_llk_parse_chunk(HSuspendedParser *s, HInputStream *input) +bool h_llk_parse_chunk(HSuspendedParser *s, HInputStream *input) { HLLkState *state = s->backend_state; state->seq = llk_parse_chunk_(state, s->parser, input); + + return (state->seq == NULL || h_slist_empty(state->stack)); } HParseResult *h_llk_parse_finish(HSuspendedParser *s) { - HLLkState *state = s->backend_state; - HInputStream empty = { - .index = 0, - .bit_offset = 0, - .overrun = 0, - .endianness = s->endianness, - .length = 0, - .input = NULL, - .last_chunk = true - }; - - // signal end of input (no-op parse already done) - state->seq = llk_parse_chunk_(state, s->parser, &empty); - return llk_parse_finish_(s->mm__, s->backend_state); } diff --git a/src/hammer.c b/src/hammer.c index 3422422..70ebc8a 100644 --- a/src/hammer.c +++ b/src/hammer.c @@ -51,6 +51,7 @@ HParseResult* h_parse(const HParser* parser, const uint8_t* input, size_t length HParseResult* h_parse__m(HAllocator* mm__, const HParser* parser, const uint8_t* input, size_t length) { // Set up a parse state... HInputStream input_stream = { + .pos = 0, .index = 0, .bit_offset = 0, .overrun = 0, @@ -114,6 +115,9 @@ HSuspendedParser* h_parse_start__m(HAllocator* mm__, const HParser* parser) { s->mm__ = mm__; s->parser = parser; s->backend_state = NULL; + s->done = false; + s->pos = 0; + s->bit_offset = 0; s->endianness = DEFAULT_ENDIANNESS; // backend-specific initialization @@ -126,8 +130,13 @@ HSuspendedParser* h_parse_start__m(HAllocator* mm__, const HParser* parser) { bool h_parse_chunk(HSuspendedParser* s, const uint8_t* input, size_t length) { assert(backends[s->parser->backend]->parse_chunk != NULL); + // no-op if parser is already done + if(s->done) + return true; + // input HInputStream input_stream = { + .pos = s->pos, .index = 0, .bit_offset = 0, .overrun = 0, @@ -138,19 +147,43 @@ bool h_parse_chunk(HSuspendedParser* s, const uint8_t* input, size_t length) { }; // process chunk - backends[s->parser->backend]->parse_chunk(s, &input_stream); + s->done = backends[s->parser->backend]->parse_chunk(s, &input_stream); s->endianness = input_stream.endianness; + s->pos += input_stream.index; + s->bit_offset = input_stream.bit_offset; - return !input_stream.overrun; // parser wants no more input? done. + return s->done; } HParseResult* h_parse_finish(HSuspendedParser* s) { + assert(backends[s->parser->backend]->parse_chunk != NULL); assert(backends[s->parser->backend]->parse_finish != NULL); HAllocator *mm__ = s->mm__; + // signal end of input if parser is not already done + if(!s->done) { + HInputStream empty = { + .pos = s->pos, + .index = 0, + .bit_offset = 0, + .overrun = 0, + .endianness = s->endianness, + .length = 0, + .input = NULL, + .last_chunk = true + }; + + s->done = backends[s->parser->backend]->parse_chunk(s, &empty); + assert(s->done); + } + + // extract result HParseResult *r = backends[s->parser->backend]->parse_finish(s); - // NB: backend should have freed backend_state + if(r) + r->bit_length = s->pos * 8 + s->bit_offset; + + // NB: backend should have freed backend_state h_free(s); return r; diff --git a/src/internal.h b/src/internal.h index b81b50c..b11186d 100644 --- a/src/internal.h +++ b/src/internal.h @@ -72,6 +72,7 @@ typedef struct HCFStack_ HCFStack; typedef struct HInputStream_ { // This should be considered to be a really big value type. const uint8_t *input; + size_t pos; // position of this chunk in a multi-chunk stream size_t index; size_t length; char bit_offset; @@ -215,8 +216,11 @@ struct HSuspendedParser_ { HAllocator *mm__; const HParser *parser; void *backend_state; + bool done; - // the only part of HInputStream that carries across chunks + // input stream state + size_t pos; + uint8_t bit_offset; uint8_t endianness; }; @@ -227,12 +231,13 @@ typedef struct HParserBackendVTable_ { void (*parse_start)(HSuspendedParser *s); // parse_start should allocate s->backend_state. - void (*parse_chunk)(HSuspendedParser *s, HInputStream *input); - // when parse_chunk leaves input.overrun unset, parse is done. else: + bool (*parse_chunk)(HSuspendedParser *s, HInputStream *input); + // if parser is done, return true. otherwise: // parse_chunk MUST consume all input, integrating it into s->backend_state. - // calling parse_chunk again after parse is done should have no effect. + // parse_chunk will not be called again after it reports done. HParseResult *(*parse_finish)(HSuspendedParser *s); // parse_finish must free s->backend_state. + // parse_finish will not be called before parse_chunk reports done. } HParserBackendVTable; From ff55937e0089d0b8e267f796cd7c4fc369343e7f Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Wed, 16 Sep 2015 23:42:18 +0200 Subject: [PATCH 20/21] split out h_pprint_lr_info for debugging purposes --- src/backends/lalr.c | 36 +++++++----------------------------- src/backends/lr.c | 32 ++++++++++++++++++++++++++++++++ src/backends/lr.h | 1 + 3 files changed, 40 insertions(+), 29 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 272f00d..ec7a5b5 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -355,8 +355,6 @@ HParserBackendVTable h__lalr_backend_vtable = { // dummy! int test_lalr(void) { - HAllocator *mm__ = &system_allocator; - /* E -> E '-' T | T @@ -371,44 +369,24 @@ int test_lalr(void) h_bind_indirect(E, E_); HParser *p = E; - printf("\n==== G R A M M A R ====\n"); - HCFGrammar *g = h_cfgrammar_(mm__, h_desugar_augmented(mm__, p)); - if (g == NULL) { - fprintf(stderr, "h_cfgrammar failed\n"); + HCFGrammar *g = h_pprint_lr_info(stdout, p); + if(!g) return 1; - } - h_pprint_grammar(stdout, g, 0); - printf("\n==== D F A ====\n"); - HLRDFA *dfa = h_lr0_dfa(g); - if (dfa) { - h_pprint_lrdfa(stdout, g, dfa, 0); - } else { - fprintf(stderr, "h_lalr_dfa failed\n"); - } - - printf("\n==== L R ( 0 ) T A B L E ====\n"); - HLRTable *table0 = h_lr0_table(g, dfa); - if (table0) { - h_pprint_lrtable(stdout, g, table0, 0); - } else { - fprintf(stderr, "h_lr0_table failed\n"); - } - h_lrtable_free(table0); - - printf("\n==== L A L R T A B L E ====\n"); + fprintf(stdout, "\n==== L A L R T A B L E ====\n"); if (h_compile(p, PB_LALR, NULL)) { - fprintf(stderr, "does not compile\n"); + fprintf(stdout, "does not compile\n"); return 2; } h_pprint_lrtable(stdout, g, (HLRTable *)p->backend_data, 0); - printf("\n==== P A R S E R E S U L T ====\n"); + fprintf(stdout, "\n==== P A R S E R E S U L T ====\n"); HParseResult *res = h_parse(p, (uint8_t *)"n-(n-((n)))-n", 13); if (res) { h_pprint(stdout, res->ast, 0, 2); } else { - printf("no parse\n"); + fprintf(stdout, "no parse\n"); } + return 0; } diff --git a/src/backends/lr.c b/src/backends/lr.c index 59c8c90..161bbf9 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -538,3 +538,35 @@ void h_pprint_lrtable(FILE *f, const HCFGrammar *g, const HLRTable *table, fputc('\n', f); #endif } + +HCFGrammar *h_pprint_lr_info(FILE *f, HParser *p) +{ + HAllocator *mm__ = &system_allocator; + + fprintf(f, "\n==== G R A M M A R ====\n"); + HCFGrammar *g = h_cfgrammar_(mm__, h_desugar_augmented(mm__, p)); + if (g == NULL) { + fprintf(f, "h_cfgrammar failed\n"); + return NULL; + } + h_pprint_grammar(f, g, 0); + + fprintf(f, "\n==== D F A ====\n"); + HLRDFA *dfa = h_lr0_dfa(g); + if (dfa) { + h_pprint_lrdfa(f, g, dfa, 0); + } else { + fprintf(f, "h_lalr_dfa failed\n"); + } + + fprintf(f, "\n==== L R ( 0 ) T A B L E ====\n"); + HLRTable *table0 = h_lr0_table(g, dfa); + if (table0) { + h_pprint_lrtable(f, g, table0, 0); + } else { + fprintf(f, "h_lr0_table failed\n"); + } + h_lrtable_free(table0); + + return g; +} diff --git a/src/backends/lr.h b/src/backends/lr.h index 8f1eadd..f9cb9a2 100644 --- a/src/backends/lr.h +++ b/src/backends/lr.h @@ -143,5 +143,6 @@ void h_pprint_lrdfa(FILE *f, const HCFGrammar *g, const HLRDFA *dfa, unsigned int indent); void h_pprint_lrtable(FILE *f, const HCFGrammar *g, const HLRTable *table, unsigned int indent); +HCFGrammar *h_pprint_lr_info(FILE *f, HParser *p); #endif From caf00006f33a7cd6a35cd051add147bad4f33d8a Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" Date: Wed, 16 Sep 2015 23:43:30 +0200 Subject: [PATCH 21/21] add iterative API to LALR backend --- src/backends/lalr.c | 5 +++- src/backends/lr.c | 61 ++++++++++++++++++++++++++++++++++++++++++--- src/backends/lr.h | 3 +++ src/t_parser.c | 3 +++ 4 files changed, 67 insertions(+), 5 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index ec7a5b5..975735a 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -346,7 +346,10 @@ void h_lalr_free(HParser *parser) HParserBackendVTable h__lalr_backend_vtable = { .compile = h_lalr_compile, .parse = h_lr_parse, - .free = h_lalr_free + .free = h_lalr_free, + .parse_start = h_lr_parse_start, + .parse_chunk = h_lr_parse_chunk, + .parse_finish = h_lr_parse_finish }; diff --git a/src/backends/lr.c b/src/backends/lr.c index 161bbf9..3f99eb5 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -199,15 +199,14 @@ bool h_lrtable_row_empty(const HLRTable *table, size_t i) /* LR driver */ -HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table, - const HInputStream *stream) +static +HLREngine *h_lrengine_new_(HArena *arena, HArena *tarena, const HLRTable *table) { HLREngine *engine = h_arena_malloc(tarena, sizeof(HLREngine)); engine->table = table; engine->state = 0; engine->stack = h_slist_new(tarena); - engine->input = *stream; engine->merged[0] = NULL; engine->merged[1] = NULL; engine->arena = arena; @@ -216,6 +215,14 @@ HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table, return engine; } +HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table, + const HInputStream *stream) +{ + HLREngine *engine = h_lrengine_new_(arena, tarena, table); + engine->input = *stream; + return engine; +} + static const HLRAction * terminal_lookup(const HLREngine *engine, const HInputStream *stream) { @@ -352,7 +359,7 @@ HParseResult *h_lrengine_result(HLREngine *engine) assert(!h_slist_empty(engine->stack)); HParsedToken *tok = engine->stack->head->elem; HParseResult *res = make_result(engine->arena, tok); - res->bit_length = engine->input.index * 8; + res->bit_length = (engine->input.pos + engine->input.index) * 8; return res; } else { return NULL; @@ -379,7 +386,53 @@ HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* return result; } +void h_lr_parse_start(HSuspendedParser *s) +{ + HLRTable *table = s->parser->backend_data; + assert(table != NULL); + HArena *arena = h_new_arena(s->mm__, 0); // will hold the results + HArena *tarena = h_new_arena(s->mm__, 0); // tmp, deleted after parse + HLREngine *engine = h_lrengine_new_(arena, tarena, table); + + s->backend_state = engine; +} + +bool h_lr_parse_chunk(HSuspendedParser* s, HInputStream *stream) +{ + HLREngine *engine = s->backend_state; + engine->input = *stream; + + bool run = true; + while(run) { + // check input against table to determine which action to take + const HLRAction *action = h_lrengine_action(engine); + if(action == NEED_INPUT) { + // XXX assume lookahead 1 + assert(engine->input.length - engine->input.index == 0); + break; + } + + // execute action + run = h_lrengine_step(engine, action); + if(engine->input.overrun && !engine->input.last_chunk) + break; + } + + *stream = engine->input; + return !run; // done if engine no longer running +} + +HParseResult *h_lr_parse_finish(HSuspendedParser *s) +{ + HLREngine *engine = s->backend_state; + + HParseResult *result = h_lrengine_result(engine); + if(!result) + h_delete_arena(engine->arena); + h_delete_arena(engine->tarena); + return result; +} /* Pretty-printers */ diff --git a/src/backends/lr.h b/src/backends/lr.h index f9cb9a2..724d126 100644 --- a/src/backends/lr.h +++ b/src/backends/lr.h @@ -134,6 +134,9 @@ const HLRAction *h_lrengine_action(const HLREngine *engine); bool h_lrengine_step(HLREngine *engine, const HLRAction *action); HParseResult *h_lrengine_result(HLREngine *engine); HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream); +void h_lr_parse_start(HSuspendedParser *s); +bool h_lr_parse_chunk(HSuspendedParser* s, HInputStream *stream); +HParseResult *h_lr_parse_finish(HSuspendedParser *s); HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream); void h_pprint_lritem(FILE *f, const HCFGrammar *g, const HLRItem *item); diff --git a/src/t_parser.c b/src/t_parser.c index a8bdb0d..3d54ff6 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -883,6 +883,9 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/lalr/leftrec-ne", GINT_TO_POINTER(PB_LALR), test_leftrec_ne); g_test_add_data_func("/core/parser/lalr/rightrec", GINT_TO_POINTER(PB_LALR), test_rightrec); g_test_add_data_func("/core/parser/lalr/result_length", GINT_TO_POINTER(PB_LALR), test_result_length); + g_test_add_data_func("/core/parser/lalr/iterative", GINT_TO_POINTER(PB_LALR), test_iterative); + g_test_add_data_func("/core/parser/lalr/iterative/lookahead", GINT_TO_POINTER(PB_LALR), test_iterative_lookahead); + g_test_add_data_func("/core/parser/lalr/iterative/result_length", GINT_TO_POINTER(PB_LALR), test_iterative_result_length); g_test_add_data_func("/core/parser/glr/token", GINT_TO_POINTER(PB_GLR), test_token); g_test_add_data_func("/core/parser/glr/ch", GINT_TO_POINTER(PB_GLR), test_ch);